Bug Summary

File: lib/Target/X86/X86ISelLowering.cpp
Warning: line 6665, column 1
Potential leak of memory pointed to by 'LoadMask.X'

Annotated Source Code

/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp

1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#include "X86ISelLowering.h"
16#include "Utils/X86ShuffleDecode.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86ShuffleDecodeConstantPool.h"
23#include "X86TargetMachine.h"
24#include "X86TargetObjectFile.h"
25#include "llvm/ADT/SmallBitVector.h"
26#include "llvm/ADT/SmallSet.h"
27#include "llvm/ADT/Statistic.h"
28#include "llvm/ADT/StringExtras.h"
29#include "llvm/ADT/StringSwitch.h"
30#include "llvm/Analysis/EHPersonalities.h"
31#include "llvm/CodeGen/IntrinsicLowering.h"
32#include "llvm/CodeGen/MachineFrameInfo.h"
33#include "llvm/CodeGen/MachineFunction.h"
34#include "llvm/CodeGen/MachineInstrBuilder.h"
35#include "llvm/CodeGen/MachineJumpTableInfo.h"
36#include "llvm/CodeGen/MachineModuleInfo.h"
37#include "llvm/CodeGen/MachineRegisterInfo.h"
38#include "llvm/CodeGen/TargetLowering.h"
39#include "llvm/CodeGen/WinEHFuncInfo.h"
40#include "llvm/IR/CallSite.h"
41#include "llvm/IR/CallingConv.h"
42#include "llvm/IR/Constants.h"
43#include "llvm/IR/DerivedTypes.h"
44#include "llvm/IR/DiagnosticInfo.h"
45#include "llvm/IR/Function.h"
46#include "llvm/IR/GlobalAlias.h"
47#include "llvm/IR/GlobalVariable.h"
48#include "llvm/IR/Instructions.h"
49#include "llvm/IR/Intrinsics.h"
50#include "llvm/MC/MCAsmInfo.h"
51#include "llvm/MC/MCContext.h"
52#include "llvm/MC/MCExpr.h"
53#include "llvm/MC/MCSymbol.h"
54#include "llvm/Support/CommandLine.h"
55#include "llvm/Support/Debug.h"
56#include "llvm/Support/ErrorHandling.h"
57#include "llvm/Support/KnownBits.h"
58#include "llvm/Support/MathExtras.h"
59#include "llvm/Target/TargetOptions.h"
60#include <algorithm>
61#include <bitset>
62#include <cctype>
63#include <numeric>
64using namespace llvm;
65
66#define DEBUG_TYPE "x86-isel"
67
68STATISTIC(NumTailCalls, "Number of tail calls");
69
70static cl::opt<bool> ExperimentalVectorWideningLegalization(
71 "x86-experimental-vector-widening-legalization", cl::init(false),
72 cl::desc("Enable an experimental vector type legalization through widening "
73 "rather than promotion."),
74 cl::Hidden);
75
76static cl::opt<int> ExperimentalPrefLoopAlignment(
77 "x86-experimental-pref-loop-alignment", cl::init(4),
78 cl::desc("Sets the preferable loop alignment for experiments "
79 "(the last x86-experimental-pref-loop-alignment bits"
80 " of the loop header PC will be 0)."),
81 cl::Hidden);
82
83static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
86 "SHIFT, LEA, etc."),
87 cl::Hidden);
88
89/// Call this when the user attempts to do something unsupported, like
90/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
91/// report_fatal_error, so calling code should attempt to recover without
92/// crashing.
93static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
94 const char *Msg) {
95 MachineFunction &MF = DAG.getMachineFunction();
96 DAG.getContext()->diagnose(
97 DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
98}
99
100X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
101 const X86Subtarget &STI)
102 : TargetLowering(TM), Subtarget(STI) {
103 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
104 X86ScalarSSEf64 = Subtarget.hasSSE2();
105 X86ScalarSSEf32 = Subtarget.hasSSE1();
106 MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
107
108 // Set up the TargetLowering object.
109
110 // X86 is weird. It always uses i8 for shift amounts and setcc results.
111 setBooleanContents(ZeroOrOneBooleanContent);
112 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
113 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
114
115 // For 64-bit, since we have so many registers, use the ILP scheduler.
116 // For 32-bit, use the register pressure specific scheduling.
117 // For Atom, always use ILP scheduling.
118 if (Subtarget.isAtom())
119 setSchedulingPreference(Sched::ILP);
120 else if (Subtarget.is64Bit())
121 setSchedulingPreference(Sched::ILP);
122 else
123 setSchedulingPreference(Sched::RegPressure);
124 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
125 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
126
127 // Bypass expensive divides and use cheaper ones.
128 if (TM.getOptLevel() >= CodeGenOpt::Default) {
129 if (Subtarget.hasSlowDivide32())
130 addBypassSlowDiv(32, 8);
131 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
132 addBypassSlowDiv(64, 32);
133 }
134
135 if (Subtarget.isTargetKnownWindowsMSVC() ||
136 Subtarget.isTargetWindowsItanium()) {
137 // Setup Windows compiler runtime calls.
138 setLibcallName(RTLIB::SDIV_I64, "_alldiv");
139 setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
140 setLibcallName(RTLIB::SREM_I64, "_allrem");
141 setLibcallName(RTLIB::UREM_I64, "_aullrem");
142 setLibcallName(RTLIB::MUL_I64, "_allmul");
143 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
144 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
145 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
146 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
147 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
148 }
149
150 if (Subtarget.isTargetDarwin()) {
151 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
152 setUseUnderscoreSetJmp(false);
153 setUseUnderscoreLongJmp(false);
154 } else if (Subtarget.isTargetWindowsGNU()) {
155 // MS runtime is weird: it exports _setjmp, but longjmp!
156 setUseUnderscoreSetJmp(true);
157 setUseUnderscoreLongJmp(false);
158 } else {
159 setUseUnderscoreSetJmp(true);
160 setUseUnderscoreLongJmp(true);
161 }
162
163 // Set up the register classes.
164 addRegisterClass(MVT::i8, &X86::GR8RegClass);
165 addRegisterClass(MVT::i16, &X86::GR16RegClass);
166 addRegisterClass(MVT::i32, &X86::GR32RegClass);
167 if (Subtarget.is64Bit())
168 addRegisterClass(MVT::i64, &X86::GR64RegClass);
169
170 for (MVT VT : MVT::integer_valuetypes())
171 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
172
173 // We don't accept any truncstore of integer registers.
174 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
175 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
176 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
177 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
178 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
179 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
180
181 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
182
183 // SETOEQ and SETUNE require checking two conditions.
184 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
185 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
186 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
187 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
188 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
189 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
190
191 // Integer absolute.
192 if (Subtarget.hasCMov()) {
193 setOperationAction(ISD::ABS , MVT::i16 , Custom);
194 setOperationAction(ISD::ABS , MVT::i32 , Custom);
195 if (Subtarget.is64Bit())
196 setOperationAction(ISD::ABS , MVT::i64 , Custom);
197 }
198
199 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
200 // operation.
201 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
202 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
203 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
204
205 if (Subtarget.is64Bit()) {
206 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
207 // f32/f64 are legal, f80 is custom.
208 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
209 else
210 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
211 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
212 } else if (!Subtarget.useSoftFloat()) {
213 // We have an algorithm for SSE2->double, and we turn this into a
214 // 64-bit FILD followed by conditional FADD for other targets.
215 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
216 // We have an algorithm for SSE2, and we turn this into a 64-bit
217 // FILD or VCVTUSI2SS/SD for other targets.
218 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
219 }
220
221 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
222 // this operation.
223 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
224 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
225
226 if (!Subtarget.useSoftFloat()) {
227 // SSE has no i16 to fp conversion, only i32.
228 if (X86ScalarSSEf32) {
229 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
230 // f32 and f64 cases are Legal, f80 case is not
231 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
232 } else {
233 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
234 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
235 }
236 } else {
237 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
238 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
239 }
240
241 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
242 // this operation.
243 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
244 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
245
246 if (!Subtarget.useSoftFloat()) {
247 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
248 // are Legal, f80 is custom lowered.
249 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
250 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
251
252 if (X86ScalarSSEf32) {
253 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
254 // f32 and f64 cases are Legal, f80 case is not
255 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
256 } else {
257 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
258 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
259 }
260 } else {
261 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
262 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
263 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
264 }
265
266 // Handle FP_TO_UINT by promoting the destination to a larger signed
267 // conversion.
268 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
269 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
270 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
271
272 if (Subtarget.is64Bit()) {
273 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
274 // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
275 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
276 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
277 } else {
278 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
279 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
280 }
281 } else if (!Subtarget.useSoftFloat()) {
282 // Since AVX is a superset of SSE3, only check for SSE here.
283 if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
284 // Expand FP_TO_UINT into a select.
285 // FIXME: We would like to use a Custom expander here eventually to do
286 // the optimal thing for SSE vs. the default expansion in the legalizer.
287 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
288 else
289 // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
290 // With SSE3 we can use fisttpll to convert to a signed i64; without
291 // SSE, we're stuck with a fistpll.
292 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
293
294 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
295 }
296
297 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
298 if (!X86ScalarSSEf64) {
299 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
300 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
301 if (Subtarget.is64Bit()) {
302 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
303 // Without SSE, i64->f64 goes through memory.
304 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
305 }
306 } else if (!Subtarget.is64Bit())
307 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
308
309 // Scalar integer divide and remainder are lowered to use operations that
310 // produce two results, to match the available instructions. This exposes
311 // the two-result form to trivial CSE, which is able to combine x/y and x%y
312 // into a single instruction.
313 //
314 // Scalar integer multiply-high is also lowered to use two-result
315 // operations, to match the available instructions. However, plain multiply
316 // (low) operations are left as Legal, as there are single-result
317 // instructions for this in x86. Using the two-result multiply instructions
318 // when both high and low results are needed must be arranged by dagcombine.
319 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
320 setOperationAction(ISD::MULHS, VT, Expand);
321 setOperationAction(ISD::MULHU, VT, Expand);
322 setOperationAction(ISD::SDIV, VT, Expand);
323 setOperationAction(ISD::UDIV, VT, Expand);
324 setOperationAction(ISD::SREM, VT, Expand);
325 setOperationAction(ISD::UREM, VT, Expand);
326 }
327
328 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
329 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
330 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
331 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
332 setOperationAction(ISD::BR_CC, VT, Expand);
333 setOperationAction(ISD::SELECT_CC, VT, Expand);
334 }
335 if (Subtarget.is64Bit())
336 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
337 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
338 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
339 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
340 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
341
342 setOperationAction(ISD::FREM , MVT::f32 , Expand);
343 setOperationAction(ISD::FREM , MVT::f64 , Expand);
344 setOperationAction(ISD::FREM , MVT::f80 , Expand);
345 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
346
347 // Promote the i8 variants and force them on up to i32 which has a shorter
348 // encoding.
349 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
350 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
351 if (!Subtarget.hasBMI()) {
352 setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
353 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
354 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
355 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
356 if (Subtarget.is64Bit()) {
357 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
358 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
359 }
360 }
361
362 if (Subtarget.hasLZCNT()) {
363 // When promoting the i8 variants, force them to i32 for a shorter
364 // encoding.
365 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
366 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
367 } else {
368 setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
369 setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
370 setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
371 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
372 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
373 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
374 if (Subtarget.is64Bit()) {
375 setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
376 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
377 }
378 }
379
380 // Special handling for half-precision floating point conversions.
381 // If we don't have F16C support, then lower half float conversions
382 // into library calls.
383 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
384 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
385 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
386 }
387
388 // There's never any support for operations beyond MVT::f32.
389 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
390 setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
391 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
392 setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
393
394 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
395 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
396 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
397 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
398 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
399 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
400
401 if (Subtarget.hasPOPCNT()) {
402 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
403 } else {
404 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
405 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
406 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
407 if (Subtarget.is64Bit())
408 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
409 }
410
411 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
412
413 if (!Subtarget.hasMOVBE())
414 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
415
416 // These should be promoted to a larger select which is supported.
417 setOperationAction(ISD::SELECT , MVT::i1 , Promote);
418 // X86 wants to expand cmov itself.
419 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
420 setOperationAction(ISD::SELECT, VT, Custom);
421 setOperationAction(ISD::SETCC, VT, Custom);
422 }
423 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
424 if (VT == MVT::i64 && !Subtarget.is64Bit())
425 continue;
426 setOperationAction(ISD::SELECT, VT, Custom);
427 setOperationAction(ISD::SETCC, VT, Custom);
428 }
429
430 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
431 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
432 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
433
434 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
435 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
436 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
437 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
438 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
439 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
440 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
441 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
442
443 // Darwin ABI issue.
444 for (auto VT : { MVT::i32, MVT::i64 }) {
445 if (VT == MVT::i64 && !Subtarget.is64Bit())
446 continue;
447 setOperationAction(ISD::ConstantPool , VT, Custom);
448 setOperationAction(ISD::JumpTable , VT, Custom);
449 setOperationAction(ISD::GlobalAddress , VT, Custom);
450 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
451 setOperationAction(ISD::ExternalSymbol , VT, Custom);
452 setOperationAction(ISD::BlockAddress , VT, Custom);
453 }
454
455 // 64-bit shl, sra, srl (iff 32-bit x86)
456 for (auto VT : { MVT::i32, MVT::i64 }) {
457 if (VT == MVT::i64 && !Subtarget.is64Bit())
458 continue;
459 setOperationAction(ISD::SHL_PARTS, VT, Custom);
460 setOperationAction(ISD::SRA_PARTS, VT, Custom);
461 setOperationAction(ISD::SRL_PARTS, VT, Custom);
462 }
463
464 if (Subtarget.hasSSE1())
465 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
466
467 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
468
469 // Expand certain atomics
470 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
471 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
472 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
473 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
474 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
475 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
476 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
477 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
478 }
479
480 if (Subtarget.hasCmpxchg16b()) {
481 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
482 }
483
484 // FIXME - use subtarget debug flags
485 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
486 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
487 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
488 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
489 }
490
491 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
492 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
493
494 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
495 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
496
497 setOperationAction(ISD::TRAP, MVT::Other, Legal);
498 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
499
500 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
501 setOperationAction(ISD::VASTART , MVT::Other, Custom);
502 setOperationAction(ISD::VAEND , MVT::Other, Expand);
503 bool Is64Bit = Subtarget.is64Bit();
504 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
505 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
506
507 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
508 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
509
510 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
511
512 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
513 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
514 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
515
516 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
517 // f32 and f64 use SSE.
518 // Set up the FP register classes.
519 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
520 : &X86::FR32RegClass);
521 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
522 : &X86::FR64RegClass);
523
524 for (auto VT : { MVT::f32, MVT::f64 }) {
525 // Use ANDPD to simulate FABS.
526 setOperationAction(ISD::FABS, VT, Custom);
527
528 // Use XORP to simulate FNEG.
529 setOperationAction(ISD::FNEG, VT, Custom);
530
531 // Use ANDPD and ORPD to simulate FCOPYSIGN.
532 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
533
534 // We don't support sin/cos/fmod
535 setOperationAction(ISD::FSIN , VT, Expand);
536 setOperationAction(ISD::FCOS , VT, Expand);
537 setOperationAction(ISD::FSINCOS, VT, Expand);
538 }
539
540 // Lower this to MOVMSK plus an AND.
541 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
542 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
543
544 // Expand FP immediates into loads from the stack, except for the special
545 // cases we handle.
546 addLegalFPImmediate(APFloat(+0.0)); // xorpd
547 addLegalFPImmediate(APFloat(+0.0f)); // xorps
548 } else if (UseX87 && X86ScalarSSEf32) {
549 // Use SSE for f32, x87 for f64.
550 // Set up the FP register classes.
551 addRegisterClass(MVT::f32, &X86::FR32RegClass);
552 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
553
554 // Use ANDPS to simulate FABS.
555 setOperationAction(ISD::FABS , MVT::f32, Custom);
556
557 // Use XORP to simulate FNEG.
558 setOperationAction(ISD::FNEG , MVT::f32, Custom);
559
560 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
561
562 // Use ANDPS and ORPS to simulate FCOPYSIGN.
563 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
564 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
565
566 // We don't support sin/cos/fmod
567 setOperationAction(ISD::FSIN , MVT::f32, Expand);
568 setOperationAction(ISD::FCOS , MVT::f32, Expand);
569 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
570
571 // Special cases we handle for FP constants.
572 addLegalFPImmediate(APFloat(+0.0f)); // xorps
573 addLegalFPImmediate(APFloat(+0.0)); // FLD0
574 addLegalFPImmediate(APFloat(+1.0)); // FLD1
575 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
576 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
577
578 // Always expand sin/cos functions even though x87 has an instruction.
579 setOperationAction(ISD::FSIN , MVT::f64, Expand);
580 setOperationAction(ISD::FCOS , MVT::f64, Expand);
581 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
582 } else if (UseX87) {
583 // f32 and f64 in x87.
584 // Set up the FP register classes.
585 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
586 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
587
588 for (auto VT : { MVT::f32, MVT::f64 }) {
589 setOperationAction(ISD::UNDEF, VT, Expand);
590 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
591
592 // Always expand sin/cos functions even though x87 has an instruction.
593 setOperationAction(ISD::FSIN , VT, Expand);
594 setOperationAction(ISD::FCOS , VT, Expand);
595 setOperationAction(ISD::FSINCOS, VT, Expand);
596 }
597 addLegalFPImmediate(APFloat(+0.0)); // FLD0
598 addLegalFPImmediate(APFloat(+1.0)); // FLD1
599 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
600 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
601 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
602 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
603 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
604 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
605 }
606
607 // We don't support FMA.
608 setOperationAction(ISD::FMA, MVT::f64, Expand);
609 setOperationAction(ISD::FMA, MVT::f32, Expand);
610
611 // Long double always uses X87, except f128 in MMX.
612 if (UseX87) {
613 if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
614 addRegisterClass(MVT::f128, &X86::FR128RegClass);
615 ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
616 setOperationAction(ISD::FABS , MVT::f128, Custom);
617 setOperationAction(ISD::FNEG , MVT::f128, Custom);
618 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
619 }
620
621 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
622 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
623 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
624 {
625 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
626 addLegalFPImmediate(TmpFlt); // FLD0
627 TmpFlt.changeSign();
628 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
629
630 bool ignored;
631 APFloat TmpFlt2(+1.0);
632 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
633 &ignored);
634 addLegalFPImmediate(TmpFlt2); // FLD1
635 TmpFlt2.changeSign();
636 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
637 }
638
639 // Always expand sin/cos functions even though x87 has an instruction.
640 setOperationAction(ISD::FSIN , MVT::f80, Expand);
641 setOperationAction(ISD::FCOS , MVT::f80, Expand);
642 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
643
644 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
645 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
646 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
647 setOperationAction(ISD::FRINT, MVT::f80, Expand);
648 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
649 setOperationAction(ISD::FMA, MVT::f80, Expand);
650 }
651
652 // Always use a library call for pow.
653 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
654 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
655 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
656
657 setOperationAction(ISD::FLOG, MVT::f80, Expand);
658 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
659 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
660 setOperationAction(ISD::FEXP, MVT::f80, Expand);
661 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
662 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
663 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
664
665 // Some FP actions are always expanded for vector types.
666 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
667 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
668 setOperationAction(ISD::FSIN, VT, Expand);
669 setOperationAction(ISD::FSINCOS, VT, Expand);
670 setOperationAction(ISD::FCOS, VT, Expand);
671 setOperationAction(ISD::FREM, VT, Expand);
672 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
673 setOperationAction(ISD::FPOW, VT, Expand);
674 setOperationAction(ISD::FLOG, VT, Expand);
675 setOperationAction(ISD::FLOG2, VT, Expand);
676 setOperationAction(ISD::FLOG10, VT, Expand);
677 setOperationAction(ISD::FEXP, VT, Expand);
678 setOperationAction(ISD::FEXP2, VT, Expand);
679 }
680
681 // First set operation action for all vector types to either promote
682 // (for widening) or expand (for scalarization). Then we will selectively
683 // turn on ones that can be effectively codegen'd.
684 for (MVT VT : MVT::vector_valuetypes()) {
685 setOperationAction(ISD::SDIV, VT, Expand);
686 setOperationAction(ISD::UDIV, VT, Expand);
687 setOperationAction(ISD::SREM, VT, Expand);
688 setOperationAction(ISD::UREM, VT, Expand);
689 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
690 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
691 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
692 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
693 setOperationAction(ISD::FMA, VT, Expand);
694 setOperationAction(ISD::FFLOOR, VT, Expand);
695 setOperationAction(ISD::FCEIL, VT, Expand);
696 setOperationAction(ISD::FTRUNC, VT, Expand);
697 setOperationAction(ISD::FRINT, VT, Expand);
698 setOperationAction(ISD::FNEARBYINT, VT, Expand);
699 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
700 setOperationAction(ISD::MULHS, VT, Expand);
701 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
702 setOperationAction(ISD::MULHU, VT, Expand);
703 setOperationAction(ISD::SDIVREM, VT, Expand);
704 setOperationAction(ISD::UDIVREM, VT, Expand);
705 setOperationAction(ISD::CTPOP, VT, Expand);
706 setOperationAction(ISD::CTTZ, VT, Expand);
707 setOperationAction(ISD::CTLZ, VT, Expand);
708 setOperationAction(ISD::ROTL, VT, Expand);
709 setOperationAction(ISD::ROTR, VT, Expand);
710 setOperationAction(ISD::BSWAP, VT, Expand);
711 setOperationAction(ISD::SETCC, VT, Expand);
712 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
713 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
714 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
715 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
716 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
717 setOperationAction(ISD::TRUNCATE, VT, Expand);
718 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
719 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
720 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
721 setOperationAction(ISD::SELECT_CC, VT, Expand);
722 for (MVT InnerVT : MVT::vector_valuetypes()) {
723 setTruncStoreAction(InnerVT, VT, Expand);
724
725 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
726 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
727
728 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
729 // types, we have to deal with them whether we ask for Expansion or not.
730 // Setting Expand causes its own optimisation problems though, so leave
731 // them legal.
732 if (VT.getVectorElementType() == MVT::i1)
733 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
734
735 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
736 // split/scalarized right now.
737 if (VT.getVectorElementType() == MVT::f16)
738 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
739 }
740 }
741
742 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
743 // with -msoft-float, disable use of MMX as well.
744 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
745 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
746 // No operations on x86mmx supported, everything uses intrinsics.
747 }
748
749 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
750 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
751 : &X86::VR128RegClass);
752
753 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
754 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
755 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
756 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
757 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
758 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
759 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
760 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
761 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
762 }
763
764 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
765 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
766 : &X86::VR128RegClass);
767
768 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
769 // registers cannot be used even for integer operations.
770 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
771 : &X86::VR128RegClass);
772 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
773 : &X86::VR128RegClass);
774 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
775 : &X86::VR128RegClass);
776 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
777 : &X86::VR128RegClass);
778
779 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
780 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
781 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
782 setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
783 setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
784 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
785 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
786 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
787 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
788 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
789 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
790 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
791 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
792
793 setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
794 setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
795 setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
796 setOperationAction(ISD::UMIN, MVT::v16i8, Legal);
797
798 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
799 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
800 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
801
802 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
803 setOperationAction(ISD::SETCC, VT, Custom);
804 setOperationAction(ISD::CTPOP, VT, Custom);
805 setOperationAction(ISD::CTTZ, VT, Custom);
806 }
807
808 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
809 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
810 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
811 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
812 setOperationAction(ISD::VSELECT, VT, Custom);
813 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
814 }
815
816 // We support custom legalizing of sext and anyext loads for specific
817 // memory vector types which we can load as a scalar (or sequence of
818 // scalars) and extend in-register to a legal 128-bit vector type. For sext
819 // loads these must work with a single scalar load.
820 for (MVT VT : MVT::integer_vector_valuetypes()) {
821 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
822 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
823 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
824 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
825 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
826 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
827 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
828 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
829 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
830 }
831
832 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
833 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
834 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
835 setOperationAction(ISD::VSELECT, VT, Custom);
836
837 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
838 continue;
839
840 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
841 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
842 }
843
844 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
845 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
846 setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
847 setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
848 setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
849 setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
850 setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
851 }
852
853 // Custom lower v2i64 and v2f64 selects.
854 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
855 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
856
857 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
858 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
859
860 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
861 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
862
863 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
864
865 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
866 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
867
868 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
869 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
870
871 for (MVT VT : MVT::fp_vector_valuetypes())
872 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
873
874 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
875 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
876 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
877
878 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
879 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
880 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
881
882 // In the customized shift lowering, the legal v4i32/v2i64 cases
883 // in AVX2 will be recognized.
884 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
885 setOperationAction(ISD::SRL, VT, Custom);
886 setOperationAction(ISD::SHL, VT, Custom);
887 setOperationAction(ISD::SRA, VT, Custom);
888 }
889 }
890
891 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
892 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
893 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
894 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
895 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
896 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
897 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
898 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
899 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
900 }
901
902 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
903 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
904 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
905 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
906 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
907 setOperationAction(ISD::FRINT, RoundedTy, Legal);
908 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
909 }
910
911 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
912 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
913 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
914 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
915 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
916 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
917 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
918 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
919
920 // FIXME: Do we need to handle scalar-to-vector here?
921 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
922
923 // We directly match byte blends in the backend as they match the VSELECT
924 // condition form.
925 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
926
927 // SSE41 brings specific instructions for doing vector sign extend even in
928 // cases where we don't have SRA.
929 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
930 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
931 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
932 }
933
934 for (MVT VT : MVT::integer_vector_valuetypes()) {
935 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
936 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
937 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
938 }
939
940 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
941 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
942 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
943 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
944 setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
945 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
946 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
947 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
948 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
949 }
950
951 // i8 vectors are custom because the source register and source
952 // memory operand types are not the same width.
953 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
954 }
955
956 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
957 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
958 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
959 setOperationAction(ISD::ROTL, VT, Custom);
960
961 // XOP can efficiently perform BITREVERSE with VPPERM.
962 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
963 setOperationAction(ISD::BITREVERSE, VT, Custom);
964
965 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
966 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
967 setOperationAction(ISD::BITREVERSE, VT, Custom);
968 }
969
970 if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
971 bool HasInt256 = Subtarget.hasInt256();
972
973 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
974 : &X86::VR256RegClass);
975 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
976 : &X86::VR256RegClass);
977 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
978 : &X86::VR256RegClass);
979 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
980 : &X86::VR256RegClass);
981 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
982 : &X86::VR256RegClass);
983 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
984 : &X86::VR256RegClass);
985
986 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
987 setOperationAction(ISD::FFLOOR, VT, Legal);
988 setOperationAction(ISD::FCEIL, VT, Legal);
989 setOperationAction(ISD::FTRUNC, VT, Legal);
990 setOperationAction(ISD::FRINT, VT, Legal);
991 setOperationAction(ISD::FNEARBYINT, VT, Legal);
992 setOperationAction(ISD::FNEG, VT, Custom);
993 setOperationAction(ISD::FABS, VT, Custom);
994 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
995 }
996
997 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
998 // even though v8i16 is a legal type.
999 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
1000 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
1001 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1002
1003 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1004 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
1005
1006 for (MVT VT : MVT::fp_vector_valuetypes())
1007 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1008
1009 // In the customized shift lowering, the legal v8i32/v4i64 cases
1010 // in AVX2 will be recognized.
1011 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1012 setOperationAction(ISD::SRL, VT, Custom);
1013 setOperationAction(ISD::SHL, VT, Custom);
1014 setOperationAction(ISD::SRA, VT, Custom);
1015 }
1016
1017 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1018 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1019 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1020
1021 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1022 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1023 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1024 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1025 }
1026
1027 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1028 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1029 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1030 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1031
1032 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1033 setOperationAction(ISD::SETCC, VT, Custom);
1034 setOperationAction(ISD::CTPOP, VT, Custom);
1035 setOperationAction(ISD::CTTZ, VT, Custom);
1036 setOperationAction(ISD::CTLZ, VT, Custom);
1037 }
1038
1039 if (Subtarget.hasAnyFMA()) {
1040 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1041 MVT::v2f64, MVT::v4f64 })
1042 setOperationAction(ISD::FMA, VT, Legal);
1043 }
1044
1045 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1046 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1047 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1048 }
1049
1050 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1051 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1052 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1053 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1054
1055 setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
1056 setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
1057
1058 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1059 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1060 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1061 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1062
1063 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1064 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1065 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1066 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1067 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1068 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1069 }
1070
1071 if (HasInt256) {
1072 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
1073 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
1074 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
1075
1076 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1077 // when we have a 256bit-wide blend with immediate.
1078 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1079
1080 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1081 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1082 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1083 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1084 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1085 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1086 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1087 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1088 }
1089 }
1090
1091 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1092 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1093 setOperationAction(ISD::MLOAD, VT, Legal);
1094 setOperationAction(ISD::MSTORE, VT, Legal);
1095 }
1096
1097 // Extract subvector is special because the value type
1098 // (result) is 128-bit but the source is 256-bit wide.
1099 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1100 MVT::v4f32, MVT::v2f64 }) {
1101 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1102 }
1103
1104 // Custom lower several nodes for 256-bit types.
1105 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1106 MVT::v8f32, MVT::v4f64 }) {
1107 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1108 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1109 setOperationAction(ISD::VSELECT, VT, Custom);
1110 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1111 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1112 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1113 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1114 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1115 }
1116
1117 if (HasInt256)
1118 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1119
1120 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1121 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1122 setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
1123 setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
1124 setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
1125 setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
1126 setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
1127 }
1128
1129 if (HasInt256) {
1130 // Custom legalize 2x32 to get a little better code.
1131 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1132 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1133
1134 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1135 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1136 setOperationAction(ISD::MGATHER, VT, Custom);
1137 }
1138 }
1139
1140 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1141 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1142 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1143 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1144 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1145
1146 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1147 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1148 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1149
1150 for (MVT VT : MVT::fp_vector_valuetypes())
1151 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1152
1153 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1154 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1155 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1156 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1157 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1158 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1159 }
1160
1161 for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
1162 MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
1163 MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
1164 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
1165 setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
1166 setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
1167 setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
1168 setTruncStoreAction(VT, MaskVT, Custom);
1169 }
1170
1171 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1172 setOperationAction(ISD::FNEG, VT, Custom);
1173 setOperationAction(ISD::FABS, VT, Custom);
1174 setOperationAction(ISD::FMA, VT, Legal);
1175 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1176 }
1177
1178 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1179 setOperationAction(ISD::FP_TO_SINT, MVT::v16i16, Promote);
1180 setOperationAction(ISD::FP_TO_SINT, MVT::v16i8, Promote);
1181 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1182 setOperationAction(ISD::FP_TO_UINT, MVT::v16i8, Promote);
1183 setOperationAction(ISD::FP_TO_UINT, MVT::v16i16, Promote);
1184 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1185 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1186 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1187 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1188 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1189 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1190 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1191 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
1192 setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
1193 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
1194 setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
1195 setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
1196 setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
1197 setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
1198 setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
1199
1200 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1201 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1202 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1203 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1204 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1205 if (Subtarget.hasVLX()){
1206 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1207 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1208 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1209 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1210 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1211
1212 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1213 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1214 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1215 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1216 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1217 } else {
1218 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1219 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1220 setOperationAction(ISD::MLOAD, VT, Custom);
1221 setOperationAction(ISD::MSTORE, VT, Custom);
1222 }
1223 }
1224
1225 if (Subtarget.hasDQI()) {
1226 for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
1227 setOperationAction(ISD::SINT_TO_FP, VT, Legal);
1228 setOperationAction(ISD::UINT_TO_FP, VT, Legal);
1229 setOperationAction(ISD::FP_TO_SINT, VT, Legal);
1230 setOperationAction(ISD::FP_TO_UINT, VT, Legal);
1231 }
1232 if (Subtarget.hasVLX()) {
1233 // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
1234 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1235 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1236 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1237 }
1238 }
1239 if (Subtarget.hasVLX()) {
1240 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
1241 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
1242 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
1243 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
1244 }
1245
1246 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1247 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1248 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1249 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1250 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1251 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1252 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1253 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1254
1255 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
1256 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i8, Custom);
1257 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
1258 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i16, Custom);
1259 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1260 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
1261
1262 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1263 setOperationAction(ISD::FFLOOR, VT, Legal);
1264 setOperationAction(ISD::FCEIL, VT, Legal);
1265 setOperationAction(ISD::FTRUNC, VT, Legal);
1266 setOperationAction(ISD::FRINT, VT, Legal);
1267 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1268 }
1269
1270 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
1271 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
1272
1273 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1274 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1275 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1276
1277 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1278 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1279 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1280 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1281 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
1282
1283 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1284 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1285
1286 setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
1287 setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);
1288
1289 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1290 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
1291 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1292 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1293 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1294 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1295
1296
1297 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1298 setOperationAction(ISD::ABS, MVT::v4i64, Legal);
1299 setOperationAction(ISD::ABS, MVT::v2i64, Legal);
1300
1301 for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
1302 setOperationAction(ISD::ADD, VT, Custom);
1303 setOperationAction(ISD::SUB, VT, Custom);
1304 setOperationAction(ISD::MUL, VT, Custom);
1305 setOperationAction(ISD::SETCC, VT, Custom);
1306 setOperationAction(ISD::SELECT, VT, Custom);
1307 setOperationAction(ISD::TRUNCATE, VT, Custom);
1308
1309 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1310 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1311 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1312 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1313 setOperationAction(ISD::VSELECT, VT, Expand);
1314 }
1315
1316 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1317 setOperationAction(ISD::SMAX, VT, Legal);
1318 setOperationAction(ISD::UMAX, VT, Legal);
1319 setOperationAction(ISD::SMIN, VT, Legal);
1320 setOperationAction(ISD::UMIN, VT, Legal);
1321 setOperationAction(ISD::ABS, VT, Legal);
1322 setOperationAction(ISD::SRL, VT, Custom);
1323 setOperationAction(ISD::SHL, VT, Custom);
1324 setOperationAction(ISD::SRA, VT, Custom);
1325 setOperationAction(ISD::CTPOP, VT, Custom);
1326 setOperationAction(ISD::CTTZ, VT, Custom);
1327 }
1328
1329 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1330 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1331 setOperationAction(ISD::SMAX, VT, Legal);
1332 setOperationAction(ISD::UMAX, VT, Legal);
1333 setOperationAction(ISD::SMIN, VT, Legal);
1334 setOperationAction(ISD::UMIN, VT, Legal);
1335 }
1336
1337 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1338 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64,
1339 MVT::v8i64}) {
1340 setOperationAction(ISD::ROTL, VT, Custom);
1341 setOperationAction(ISD::ROTR, VT, Custom);
1342 }
1343
1344 // Need to promote to 64-bit even though we have 32-bit masked instructions
1345 // because the IR optimizers rearrange bitcasts around logic ops leaving
1346 // too many variations to handle if we don't promote them.
1347 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1348 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1349 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
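// A standalone sketch (not part of X86ISelLowering.cpp; helper names are
// illustrative) of why the promotion above is safe: bitwise AND/OR/XOR are
// lane-width agnostic, so reinterpreting the same 512 bits as v8i64 instead
// of v16i32 yields identical results.
#include <cstdint>
#include <cstring>

static void and_v16i32(const uint32_t A[16], const uint32_t B[16],
                       uint32_t Out[16]) {
  for (int I = 0; I < 16; ++I)
    Out[I] = A[I] & B[I];                 // 32-bit lanes
}

static void and_v8i64(const uint32_t A[16], const uint32_t B[16],
                      uint32_t Out[16]) {
  uint64_t A64[8], B64[8], O64[8];
  std::memcpy(A64, A, sizeof(A64));       // "bitcast" v16i32 -> v8i64
  std::memcpy(B64, B, sizeof(B64));
  for (int I = 0; I < 8; ++I)
    O64[I] = A64[I] & B64[I];             // 64-bit lanes
  std::memcpy(Out, O64, sizeof(O64));     // "bitcast" back to v16i32
}
// Both helpers produce bit-identical output for any inputs, which is the
// property the promotion relies on.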
1350
1351 if (Subtarget.hasCDI()) {
1352 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1353 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
1354 MVT::v4i64, MVT::v8i64}) {
1355 setOperationAction(ISD::CTLZ, VT, Legal);
1356 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1357 }
1358 } // Subtarget.hasCDI()
1359
1360 if (Subtarget.hasDQI()) {
1361 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1362 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
1363 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
1364 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1365 }
1366
1367 if (Subtarget.hasVPOPCNTDQ()) {
1368 // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512
1369 // version of popcntd/q.
1370 for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
1371 MVT::v4i32, MVT::v2i64})
1372 setOperationAction(ISD::CTPOP, VT, Legal);
1373 }
1374
1375 // Custom lower several nodes.
1376 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1377 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1378 setOperationAction(ISD::MSCATTER, VT, Custom);
1379
1380 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v1i1, Legal);
1381
1382 // Extract subvector is special because the value type
1383 // (result) is 256-bit but the source is 512-bit wide.
1384 // 128-bit was made Legal under AVX1.
1385 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1386 MVT::v8f32, MVT::v4f64 })
1387 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1388 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1389 MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1390 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1391
1392 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1393 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1394 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1395 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1396 setOperationAction(ISD::VSELECT, VT, Custom);
1397 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1398 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1399 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1400 setOperationAction(ISD::MLOAD, VT, Legal);
1401 setOperationAction(ISD::MSTORE, VT, Legal);
1402 setOperationAction(ISD::MGATHER, VT, Custom);
1403 setOperationAction(ISD::MSCATTER, VT, Custom);
1404 }
1405 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1406 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1407 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1408 }
1409 }// has AVX-512
1410
1411 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1412 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1413 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1414
1415 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1416 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1417
1418 setOperationAction(ISD::ADD, MVT::v32i1, Custom);
1419 setOperationAction(ISD::ADD, MVT::v64i1, Custom);
1420 setOperationAction(ISD::SUB, MVT::v32i1, Custom);
1421 setOperationAction(ISD::SUB, MVT::v64i1, Custom);
1422 setOperationAction(ISD::MUL, MVT::v32i1, Custom);
1423 setOperationAction(ISD::MUL, MVT::v64i1, Custom);
1424
1425 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1426 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1427 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1428 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1429 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1430 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1431 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1432 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1433 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1434 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1435 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1436 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1437 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1438 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1439 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1440 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1441 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1442 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1443 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
1444 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
1445 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1446 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1447 setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
1448 setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
1449 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1450 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1451 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1452 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1453 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1454 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1455 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1456 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1457 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1458 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
1459 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
1460 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1461 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1462 setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
1463 setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
1464 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1465 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
1466 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
1467 setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
1468 setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
1469 setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
1470 setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
1471 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1472
1473 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1474
1475 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1476 if (Subtarget.hasVLX()) {
1477 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1478 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1479 }
1480
1481 LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1482 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1483 setOperationAction(ISD::MLOAD, VT, Action);
1484 setOperationAction(ISD::MSTORE, VT, Action);
1485 }
1486
1487 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1488 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1489 setOperationAction(ISD::VSELECT, VT, Custom);
1490 setOperationAction(ISD::ABS, VT, Legal);
1491 setOperationAction(ISD::SRL, VT, Custom);
1492 setOperationAction(ISD::SHL, VT, Custom);
1493 setOperationAction(ISD::SRA, VT, Custom);
1494 setOperationAction(ISD::MLOAD, VT, Legal);
1495 setOperationAction(ISD::MSTORE, VT, Legal);
1496 setOperationAction(ISD::CTPOP, VT, Custom);
1497 setOperationAction(ISD::CTTZ, VT, Custom);
1498 setOperationAction(ISD::CTLZ, VT, Custom);
1499 setOperationAction(ISD::SMAX, VT, Legal);
1500 setOperationAction(ISD::UMAX, VT, Legal);
1501 setOperationAction(ISD::SMIN, VT, Legal);
1502 setOperationAction(ISD::UMIN, VT, Legal);
1503
1504 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1505 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1506 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1507 }
1508
1509 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1510 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1511 }
1512
1513 if (Subtarget.hasBITALG()) {
1514 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v32i8,
1515 MVT::v16i16, MVT::v16i8, MVT::v8i16 })
1516 setOperationAction(ISD::CTPOP, VT, Legal);
1517 }
1518 }
1519
1520 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1521 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1522 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1523
1524 for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1525 setOperationAction(ISD::ADD, VT, Custom);
1526 setOperationAction(ISD::SUB, VT, Custom);
1527 setOperationAction(ISD::MUL, VT, Custom);
1528 setOperationAction(ISD::VSELECT, VT, Expand);
1529
1530 setOperationAction(ISD::TRUNCATE, VT, Custom);
1531 setOperationAction(ISD::SETCC, VT, Custom);
1532 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1533 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1534 setOperationAction(ISD::SELECT, VT, Custom);
1535 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1536 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1537 }
1538
1539 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1540 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1541 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1542 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1543 }
1544
1545 // We want to custom lower some of our intrinsics.
1546 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1547 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1548 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1549 if (!Subtarget.is64Bit()) {
1550 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1551 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1552 }
1553
1554 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1555 // handle type legalization for these operations here.
1556 //
1557 // FIXME: We really should do custom legalization for addition and
1558 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1559 // than generic legalization for 64-bit multiplication-with-overflow, though.
1560 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1561 if (VT == MVT::i64 && !Subtarget.is64Bit())
1562 continue;
1563 // Add/Sub/Mul with overflow operations are custom lowered.
1564 setOperationAction(ISD::SADDO, VT, Custom);
1565 setOperationAction(ISD::UADDO, VT, Custom);
1566 setOperationAction(ISD::SSUBO, VT, Custom);
1567 setOperationAction(ISD::USUBO, VT, Custom);
1568 setOperationAction(ISD::SMULO, VT, Custom);
1569 setOperationAction(ISD::UMULO, VT, Custom);
1570
1571 // Support carry in as value rather than glue.
1572 setOperationAction(ISD::ADDCARRY, VT, Custom);
1573 setOperationAction(ISD::SUBCARRY, VT, Custom);
1574 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1575 }
1576
1577 if (!Subtarget.is64Bit()) {
1578 // These libcalls are not available in 32-bit.
1579 setLibcallName(RTLIB::SHL_I128, nullptr);
1580 setLibcallName(RTLIB::SRL_I128, nullptr);
1581 setLibcallName(RTLIB::SRA_I128, nullptr);
1582 setLibcallName(RTLIB::MUL_I128, nullptr);
1583 }
1584
1585 // Combine sin / cos into one node or libcall if possible.
1586 if (Subtarget.hasSinCos()) {
1587 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1588 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1589 if (Subtarget.isTargetDarwin()) {
1590 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1591 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1592 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1593 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1594 }
1595 }
1596
1597 if (Subtarget.isTargetWin64()) {
1598 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1599 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1600 setOperationAction(ISD::SREM, MVT::i128, Custom);
1601 setOperationAction(ISD::UREM, MVT::i128, Custom);
1602 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1603 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1604 }
1605
1606 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1607 // is. We should promote the value to 64-bits to solve this.
1608 // This is what the CRT headers do - `fmodf` is an inline header
1609 // function casting to f64 and calling `fmod`.
1610 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1611 Subtarget.isTargetWindowsItanium()))
1612 for (ISD::NodeType Op :
1613 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1614 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1615 if (isOperationExpand(Op, MVT::f32))
1616 setOperationAction(Op, MVT::f32, Promote);
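// A standalone sketch (not part of X86ISelLowering.cpp) of the promotion the
// comment above describes: the f32 operation is carried out by widening to
// f64, calling the f64 libcall, and truncating the result back to f32.
#include <cmath>

static float fmodf_via_f64(float X, float Y) {
  return static_cast<float>(std::fmod(static_cast<double>(X),
                                      static_cast<double>(Y)));
}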
1617
1618 // We have target-specific dag combine patterns for the following nodes:
1619 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1620 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1621 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1622 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
1623 setTargetDAGCombine(ISD::BITCAST);
1624 setTargetDAGCombine(ISD::VSELECT);
1625 setTargetDAGCombine(ISD::SELECT);
1626 setTargetDAGCombine(ISD::SHL);
1627 setTargetDAGCombine(ISD::SRA);
1628 setTargetDAGCombine(ISD::SRL);
1629 setTargetDAGCombine(ISD::OR);
1630 setTargetDAGCombine(ISD::AND);
1631 setTargetDAGCombine(ISD::ADD);
1632 setTargetDAGCombine(ISD::FADD);
1633 setTargetDAGCombine(ISD::FSUB);
1634 setTargetDAGCombine(ISD::FNEG);
1635 setTargetDAGCombine(ISD::FMA);
1636 setTargetDAGCombine(ISD::FMINNUM);
1637 setTargetDAGCombine(ISD::FMAXNUM);
1638 setTargetDAGCombine(ISD::SUB);
1639 setTargetDAGCombine(ISD::LOAD);
1640 setTargetDAGCombine(ISD::MLOAD);
1641 setTargetDAGCombine(ISD::STORE);
1642 setTargetDAGCombine(ISD::MSTORE);
1643 setTargetDAGCombine(ISD::TRUNCATE);
1644 setTargetDAGCombine(ISD::ZERO_EXTEND);
1645 setTargetDAGCombine(ISD::ANY_EXTEND);
1646 setTargetDAGCombine(ISD::SIGN_EXTEND);
1647 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1648 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1649 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1650 setTargetDAGCombine(ISD::SINT_TO_FP);
1651 setTargetDAGCombine(ISD::UINT_TO_FP);
1652 setTargetDAGCombine(ISD::SETCC);
1653 setTargetDAGCombine(ISD::MUL);
1654 setTargetDAGCombine(ISD::XOR);
1655 setTargetDAGCombine(ISD::MSCATTER);
1656 setTargetDAGCombine(ISD::MGATHER);
1657
1658 computeRegisterProperties(Subtarget.getRegisterInfo());
1659
1660 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1661 MaxStoresPerMemsetOptSize = 8;
1662 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1663 MaxStoresPerMemcpyOptSize = 4;
1664 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1665 MaxStoresPerMemmoveOptSize = 4;
1666
1667 // TODO: These control memcmp expansion in CGP and could be raised higher, but
1668 // that needs to be benchmarked and balanced with the potential use of vector
1669 // load/store types (PR33329, PR33914).
1670 MaxLoadsPerMemcmp = 2;
1671 MaxLoadsPerMemcmpOptSize = 2;
1672
1673 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1674 setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
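// A quick standalone check (not part of X86ISelLowering.cpp) of the "2^N
// bytes" convention used by setPrefLoopAlignment above: the value passed is
// an exponent, not a byte count.
static unsigned loopAlignBytes(unsigned ExperimentalPrefLoopAlignment) {
  return 1u << ExperimentalPrefLoopAlignment;   // default 4 -> 16 bytes
}
// loopAlignBytes(4) == 16, matching the "default: 2^4" note above.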
1675
1676 // An out-of-order CPU can speculatively execute past a predictable branch,
1677 // but a conditional move could be stalled by an expensive earlier operation.
1678 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1679 EnableExtLdPromotion = true;
1680 setPrefFunctionAlignment(4); // 2^4 bytes.
1681
1682 verifyIntrinsicTables();
1683}
1684
1685// This has so far only been implemented for 64-bit MachO.
1686bool X86TargetLowering::useLoadStackGuardNode() const {
1687 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1688}
1689
1690bool X86TargetLowering::useStackGuardXorFP() const {
1691 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
1692 return Subtarget.getTargetTriple().isOSMSVCRT();
1693}
1694
1695SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
1696 const SDLoc &DL) const {
1697 EVT PtrTy = getPointerTy(DAG.getDataLayout());
1698 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
1699 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
1700 return SDValue(Node, 0);
1701}
1702
1703TargetLoweringBase::LegalizeTypeAction
1704X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1705 if (ExperimentalVectorWideningLegalization &&
1706 VT.getVectorNumElements() != 1 &&
1707 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1708 return TypeWidenVector;
1709
1710 return TargetLoweringBase::getPreferredVectorAction(VT);
1711}
1712
1713EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1714 LLVMContext& Context,
1715 EVT VT) const {
1716 if (!VT.isVector())
1717 return MVT::i8;
1718
1719 if (Subtarget.hasAVX512()) {
1720 const unsigned NumElts = VT.getVectorNumElements();
1721
1722 // Figure out what this type will be legalized to.
1723 EVT LegalVT = VT;
1724 while (getTypeAction(Context, LegalVT) != TypeLegal)
1725 LegalVT = getTypeToTransformTo(Context, LegalVT);
1726
1727 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
1728 if (LegalVT.getSimpleVT().is512BitVector())
1729 return EVT::getVectorVT(Context, MVT::i1, NumElts);
1730
1731 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
1732 // If we legalized to less than a 512-bit vector, then we will use a vXi1
1733 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
1734 // vXi16/vXi8.
1735 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
1736 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
1737 return EVT::getVectorVT(Context, MVT::i1, NumElts);
1738 }
1739 }
1740
1741 return VT.changeVectorElementTypeToInteger();
1742}
1743
1744/// Helper for getByValTypeAlignment to determine
1745/// the desired ByVal argument alignment.
1746static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1747 if (MaxAlign == 16)
1748 return;
1749 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1750 if (VTy->getBitWidth() == 128)
1751 MaxAlign = 16;
1752 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1753 unsigned EltAlign = 0;
1754 getMaxByValAlign(ATy->getElementType(), EltAlign);
1755 if (EltAlign > MaxAlign)
1756 MaxAlign = EltAlign;
1757 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1758 for (auto *EltTy : STy->elements()) {
1759 unsigned EltAlign = 0;
1760 getMaxByValAlign(EltTy, EltAlign);
1761 if (EltAlign > MaxAlign)
1762 MaxAlign = EltAlign;
1763 if (MaxAlign == 16)
1764 break;
1765 }
1766 }
1767}
1768
1769/// Return the desired alignment for ByVal aggregate
1770/// function arguments in the caller parameter area. For X86, aggregates
1771/// that contain SSE vectors are placed at 16-byte boundaries while the rest
1772/// are at 4-byte boundaries.
1773unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1774 const DataLayout &DL) const {
1775 if (Subtarget.is64Bit()) {
1776 // Max of 8 and alignment of type.
1777 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1778 if (TyAlign > 8)
1779 return TyAlign;
1780 return 8;
1781 }
1782
1783 unsigned Align = 4;
1784 if (Subtarget.hasSSE1())
1785 getMaxByValAlign(Ty, Align);
1786 return Align;
1787}
1788
1789/// Returns the target specific optimal type for load
1790/// and store operations as a result of memset, memcpy, and memmove
1791 /// lowering. If DstAlign is zero, it is safe to assume the destination
1792 /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero it
1793 /// means there isn't a need to check it against the alignment requirement,
1794/// probably because the source does not need to be loaded. If 'IsMemset' is
1795/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1796/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1797/// source is constant so it does not need to be loaded.
1798/// It returns EVT::Other if the type should be determined using generic
1799/// target-independent logic.
1800EVT
1801X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1802 unsigned DstAlign, unsigned SrcAlign,
1803 bool IsMemset, bool ZeroMemset,
1804 bool MemcpyStrSrc,
1805 MachineFunction &MF) const {
1806 const Function *F = MF.getFunction();
1807 if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1808 if (Size >= 16 &&
1809 (!Subtarget.isUnalignedMem16Slow() ||
1810 ((DstAlign == 0 || DstAlign >= 16) &&
1811 (SrcAlign == 0 || SrcAlign >= 16)))) {
1812 // FIXME: Check if unaligned 32-byte accesses are slow.
1813 if (Size >= 32 && Subtarget.hasAVX()) {
1814 // Although this isn't a well-supported type for AVX1, we'll let
1815 // legalization and shuffle lowering produce the optimal codegen. If we
1816 // choose an optimal type with a vector element larger than a byte,
1817 // getMemsetStores() may create an intermediate splat (using an integer
1818 // multiply) before we splat as a vector.
1819 return MVT::v32i8;
1820 }
1821 if (Subtarget.hasSSE2())
1822 return MVT::v16i8;
1823 // TODO: Can SSE1 handle a byte vector?
1824 if (Subtarget.hasSSE1())
1825 return MVT::v4f32;
1826 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1827 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1828 // Do not use f64 to lower memcpy if source is string constant. It's
1829 // better to use i32 to avoid the loads.
1830 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1831 // The gymnastics of splatting a byte value into an XMM register and then
1832 // only using 8-byte stores (because this is a CPU with slow unaligned
1833 // 16-byte accesses) makes that a loser.
1834 return MVT::f64;
1835 }
1836 }
1837 // This is a compromise. If we reach here, unaligned accesses may be slow on
1838 // this target. However, creating smaller, aligned accesses could be even
1839 // slower and would certainly be a lot more code.
1840 if (Subtarget.is64Bit() && Size >= 8)
1841 return MVT::i64;
1842 return MVT::i32;
1843}
1844
1845bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1846 if (VT == MVT::f32)
1847 return X86ScalarSSEf32;
1848 else if (VT == MVT::f64)
1849 return X86ScalarSSEf64;
1850 return true;
1851}
1852
1853bool
1854X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1855 unsigned,
1856 unsigned,
1857 bool *Fast) const {
1858 if (Fast) {
1859 switch (VT.getSizeInBits()) {
1860 default:
1861 // 8-byte and under are always assumed to be fast.
1862 *Fast = true;
1863 break;
1864 case 128:
1865 *Fast = !Subtarget.isUnalignedMem16Slow();
1866 break;
1867 case 256:
1868 *Fast = !Subtarget.isUnalignedMem32Slow();
1869 break;
1870 // TODO: What about AVX-512 (512-bit) accesses?
1871 }
1872 }
1873 // Misaligned accesses of any size are always allowed.
1874 return true;
1875}
1876
1877/// Return the entry encoding for a jump table in the
1878/// current function. The returned value is a member of the
1879/// MachineJumpTableInfo::JTEntryKind enum.
1880unsigned X86TargetLowering::getJumpTableEncoding() const {
1881 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1882 // symbol.
1883 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1884 return MachineJumpTableInfo::EK_Custom32;
1885
1886 // Otherwise, use the normal jump table encoding heuristics.
1887 return TargetLowering::getJumpTableEncoding();
1888}
1889
1890bool X86TargetLowering::useSoftFloat() const {
1891 return Subtarget.useSoftFloat();
1892}
1893
1894void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1895 ArgListTy &Args) const {
1896
1897 // Only relabel X86-32 for C / Stdcall CCs.
1898 if (Subtarget.is64Bit())
1899 return;
1900 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
1901 return;
1902 unsigned ParamRegs = 0;
1903 if (auto *M = MF->getFunction()->getParent())
1904 ParamRegs = M->getNumberRegisterParameters();
1905
1906 // Mark the first N integer arguments as being passed in registers.
1907 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
1908 Type *T = Args[Idx].Ty;
1909 if (T->isPointerTy() || T->isIntegerTy())
1910 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
1911 unsigned numRegs = 1;
1912 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
1913 numRegs = 2;
1914 if (ParamRegs < numRegs)
1915 return;
1916 ParamRegs -= numRegs;
1917 Args[Idx].IsInReg = true;
1918 }
1919 }
1920}
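// A standalone sketch (not part of X86ISelLowering.cpp; simplified to ignore
// the pointer/integer and size-at-most-8 checks done above) of the
// register-budget walk in markLibCallAttributes: each argument takes one
// 32-bit register, or two if wider than 4 bytes, and marking stops as soon as
// the module's register-parameter budget is exhausted.
#include <vector>

static std::vector<bool> markInRegArgs(const std::vector<unsigned> &ArgSizes,
                                       unsigned ParamRegs) {
  std::vector<bool> IsInReg(ArgSizes.size(), false);
  for (size_t Idx = 0; Idx < ArgSizes.size(); ++Idx) {
    unsigned NumRegs = ArgSizes[Idx] > 4 ? 2 : 1;
    if (ParamRegs < NumRegs)
      break;                              // budget exhausted, stop marking
    ParamRegs -= NumRegs;
    IsInReg[Idx] = true;
  }
  return IsInReg;
}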
1921
1922const MCExpr *
1923X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1924 const MachineBasicBlock *MBB,
1925 unsigned uid,MCContext &Ctx) const{
1926 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1927 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1928 // entries.
1929 return MCSymbolRefExpr::create(MBB->getSymbol(),
1930 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1931}
1932
1933/// Returns relocation base for the given PIC jumptable.
1934SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1935 SelectionDAG &DAG) const {
1936 if (!Subtarget.is64Bit())
1937 // This doesn't have SDLoc associated with it, but is not really the
1938 // same as a Register.
1939 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1940 getPointerTy(DAG.getDataLayout()));
1941 return Table;
1942}
1943
1944/// This returns the relocation base for the given PIC jumptable,
1945/// the same as getPICJumpTableRelocBase, but as an MCExpr.
1946const MCExpr *X86TargetLowering::
1947getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1948 MCContext &Ctx) const {
1949 // X86-64 uses RIP relative addressing based on the jump table label.
1950 if (Subtarget.isPICStyleRIPRel())
1951 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1952
1953 // Otherwise, the reference is relative to the PIC base.
1954 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1955}
1956
1957std::pair<const TargetRegisterClass *, uint8_t>
1958X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1959 MVT VT) const {
1960 const TargetRegisterClass *RRC = nullptr;
1961 uint8_t Cost = 1;
1962 switch (VT.SimpleTy) {
1963 default:
1964 return TargetLowering::findRepresentativeClass(TRI, VT);
1965 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1966 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1967 break;
1968 case MVT::x86mmx:
1969 RRC = &X86::VR64RegClass;
1970 break;
1971 case MVT::f32: case MVT::f64:
1972 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1973 case MVT::v4f32: case MVT::v2f64:
1974 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
1975 case MVT::v8f32: case MVT::v4f64:
1976 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
1977 case MVT::v16f32: case MVT::v8f64:
1978 RRC = &X86::VR128XRegClass;
1979 break;
1980 }
1981 return std::make_pair(RRC, Cost);
1982}
1983
1984unsigned X86TargetLowering::getAddressSpace() const {
1985 if (Subtarget.is64Bit())
1986 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
1987 return 256;
1988}
1989
1990static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
1991 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
1992 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
1993}
1994
1995static Constant* SegmentOffset(IRBuilder<> &IRB,
1996 unsigned Offset, unsigned AddressSpace) {
1997 return ConstantExpr::getIntToPtr(
1998 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
1999 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2000}
2001
2002Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2003 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2004 // tcbhead_t; use it instead of the usual global variable (see
2005 // sysdeps/{i386,x86_64}/nptl/tls.h)
2006 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2007 if (Subtarget.isTargetFuchsia()) {
2008 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2009 return SegmentOffset(IRB, 0x10, getAddressSpace());
2010 } else {
2011 // %fs:0x28, unless we're using a Kernel code model, in which case
2012 // it's %gs:0x28. gs:0x14 on i386.
2013 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2014 return SegmentOffset(IRB, Offset, getAddressSpace());
2015 }
2016 }
2017
2018 return TargetLowering::getIRStackGuard(IRB);
2019}
2020
2021void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2022 // MSVC CRT provides functionalities for stack protection.
2023 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2024 // MSVC CRT has a global variable holding security cookie.
2025 M.getOrInsertGlobal("__security_cookie",
2026 Type::getInt8PtrTy(M.getContext()));
2027
2028 // MSVC CRT has a function to validate security cookie.
2029 auto *SecurityCheckCookie = cast<Function>(
2030 M.getOrInsertFunction("__security_check_cookie",
2031 Type::getVoidTy(M.getContext()),
2032 Type::getInt8PtrTy(M.getContext())));
2033 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2034 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2035 return;
2036 }
2037 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2038 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2039 return;
2040 TargetLowering::insertSSPDeclarations(M);
2041}
2042
2043Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2044 // MSVC CRT has a global variable holding security cookie.
2045 if (Subtarget.getTargetTriple().isOSMSVCRT())
2046 return M.getGlobalVariable("__security_cookie");
2047 return TargetLowering::getSDagStackGuard(M);
2048}
2049
2050Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2051 // MSVC CRT has a function to validate security cookie.
2052 if (Subtarget.getTargetTriple().isOSMSVCRT())
2053 return M.getFunction("__security_check_cookie");
2054 return TargetLowering::getSSPStackGuardCheck(M);
2055}
2056
2057Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2058 if (Subtarget.getTargetTriple().isOSContiki())
2059 return getDefaultSafeStackPointerLocation(IRB, false);
2060
2061 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2062 // definition of TLS_SLOT_SAFESTACK in
2063 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2064 if (Subtarget.isTargetAndroid()) {
2065 // %fs:0x48, unless we're using a Kernel code model, in which case it's
2066 // %gs:0x48; %gs:0x24 on i386.
2067 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2068 return SegmentOffset(IRB, Offset, getAddressSpace());
2069 }
2070
2071 // Fuchsia is similar.
2072 if (Subtarget.isTargetFuchsia()) {
2073 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2074 return SegmentOffset(IRB, 0x18, getAddressSpace());
2075 }
2076
2077 return TargetLowering::getSafeStackPointerLocation(IRB);
2078}
2079
2080bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2081 unsigned DestAS) const {
2082 assert(SrcAS != DestAS && "Expected different address spaces!");
2083
2084 return SrcAS < 256 && DestAS < 256;
2085}
2086
2087//===----------------------------------------------------------------------===//
2088// Return Value Calling Convention Implementation
2089//===----------------------------------------------------------------------===//
2090
2091#include "X86GenCallingConv.inc"
2092
2093bool X86TargetLowering::CanLowerReturn(
2094 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2095 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2096 SmallVector<CCValAssign, 16> RVLocs;
2097 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2098 return CCInfo.CheckReturn(Outs, RetCC_X86);
2099}
2100
2101const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2102 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2103 return ScratchRegs;
2104}
2105
2106 /// Lowers mask values (v*i1) to the local register values
2107/// \returns DAG node after lowering to register type
2108static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2109 const SDLoc &Dl, SelectionDAG &DAG) {
2110 EVT ValVT = ValArg.getValueType();
2111
2112 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2113 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2114 // Two stage lowering might be required
2115 // bitcast: v8i1 -> i8 / v16i1 -> i16
2116 // anyextend: i8 -> i32 / i16 -> i32
2117 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2118 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2119 if (ValLoc == MVT::i32)
2120 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2121 return ValToCopy;
2122 } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2123 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2124 // One stage lowering is required
2125 // bitcast: v32i1 -> i32 / v64i1 -> i64
2126 return DAG.getBitcast(ValLoc, ValArg);
2127 } else
2128 return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
2129}
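// A standalone bit-level illustration (not part of X86ISelLowering.cpp) of
// the two-stage v8i1 -> i8 -> i32 path above: pack the eight mask bits into a
// byte, then extend the byte into a 32-bit location (ANY_EXTEND leaves the
// upper bits undefined; a zero-extend is one valid realization).
#include <array>
#include <cstdint>

static uint32_t lowerV8i1ToI32(const std::array<bool, 8> &Mask) {
  uint8_t Packed = 0;                     // "bitcast" v8i1 -> i8
  for (unsigned I = 0; I < 8; ++I)
    Packed |= static_cast<uint8_t>(Mask[I]) << I;
  return Packed;                          // extend i8 -> i32
}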
2130
2131 /// Breaks a v64i1 value into two registers and adds the new node to the DAG
2132static void Passv64i1ArgInRegs(
2133 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2134 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2135 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2136 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2137 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2138 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2139 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2140 "The value should reside in two registers");
2141
2142 // Before splitting the value we cast it to i64
2143 Arg = DAG.getBitcast(MVT::i64, Arg);
2144
2145 // Splitting the value into two i32 types
2146 SDValue Lo, Hi;
2147 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2148 DAG.getConstant(0, Dl, MVT::i32));
2149 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2150 DAG.getConstant(1, Dl, MVT::i32));
2151
2152 // Attach the two i32 types into corresponding registers
2153 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2154 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2155}
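// A standalone sketch (not part of X86ISelLowering.cpp) of the split
// performed above: the v64i1 mask, viewed as an i64, is broken into a low and
// a high i32 half, one per 32-bit register.
#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> splitV64i1(uint64_t MaskBits) {
  uint32_t Lo = static_cast<uint32_t>(MaskBits);        // element 0 -> VA's reg
  uint32_t Hi = static_cast<uint32_t>(MaskBits >> 32);  // element 1 -> NextVA's reg
  return {Lo, Hi};
}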
2156
2157SDValue
2158X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2159 bool isVarArg,
2160 const SmallVectorImpl<ISD::OutputArg> &Outs,
2161 const SmallVectorImpl<SDValue> &OutVals,
2162 const SDLoc &dl, SelectionDAG &DAG) const {
2163 MachineFunction &MF = DAG.getMachineFunction();
2164 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2165
2166 // In some cases we need to disable registers from the default CSR list.
2167 // For example, when they are used for argument passing.
2168 bool ShouldDisableCalleeSavedRegister =
2169 CallConv == CallingConv::X86_RegCall ||
2170 MF.getFunction()->hasFnAttribute("no_caller_saved_registers");
2171
2172 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2173 report_fatal_error("X86 interrupts may not return any value");
2174
2175 SmallVector<CCValAssign, 16> RVLocs;
2176 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2177 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2178
2179 SDValue Flag;
2180 SmallVector<SDValue, 6> RetOps;
2181 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2182 // Operand #1 = Bytes To Pop
2183 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2184 MVT::i32));
2185
2186 // Copy the result values into the output registers.
2187 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2188 ++I, ++OutsIndex) {
2189 CCValAssign &VA = RVLocs[I];
2190 assert(VA.isRegLoc() && "Can only return in registers!");
2191
2192 // Add the register to the CalleeSaveDisableRegs list.
2193 if (ShouldDisableCalleeSavedRegister)
2194 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2195
2196 SDValue ValToCopy = OutVals[OutsIndex];
2197 EVT ValVT = ValToCopy.getValueType();
2198
2199 // Promote values to the appropriate types.
2200 if (VA.getLocInfo() == CCValAssign::SExt)
2201 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2202 else if (VA.getLocInfo() == CCValAssign::ZExt)
2203 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2204 else if (VA.getLocInfo() == CCValAssign::AExt) {
2205 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2206 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2207 else
2208 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2209 }
2210 else if (VA.getLocInfo() == CCValAssign::BCvt)
2211 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2212
2213 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2214 "Unexpected FP-extend for return value.");
2215
2216 // If this is x86-64, and we disabled SSE, we can't return FP values,
2217 // or SSE or MMX vectors.
2218 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2219 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2220 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2221 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2222 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2223 } else if (ValVT == MVT::f64 &&
2224 (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2225 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2226 // llvm-gcc has never done it right and no one has noticed, so this
2227 // should be OK for now.
2228 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2229 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2230 }
2231
2232 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2233 // the RET instruction and handled by the FP Stackifier.
2234 if (VA.getLocReg() == X86::FP0 ||
2235 VA.getLocReg() == X86::FP1) {
2236 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2237 // change the value to the FP stack register class.
2238 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2239 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2240 RetOps.push_back(ValToCopy);
2241 // Don't emit a copytoreg.
2242 continue;
2243 }
2244
2245 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2246 // which is returned in RAX / RDX.
2247 if (Subtarget.is64Bit()) {
2248 if (ValVT == MVT::x86mmx) {
2249 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2250 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2251 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2252 ValToCopy);
2253 // If we don't have SSE2 available, convert to v4f32 so the generated
2254 // register is legal.
2255 if (!Subtarget.hasSSE2())
2256 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2257 }
2258 }
2259 }
2260
2261 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2262
2263 if (VA.needsCustom()) {
2264 assert(VA.getValVT() == MVT::v64i1 &&
2265 "Currently the only custom case is when we split v64i1 to 2 regs");
2266
2267 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2268 Subtarget);
2269
2270 assert(2 == RegsToPass.size() &&
2271 "Expecting two registers after Pass64BitArgInRegs");
2272
2273 // Add the second register to the CalleeSaveDisableRegs list.
2274 if (ShouldDisableCalleeSavedRegister)
2275 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2276 } else {
2277 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2278 }
2279
2280 // Add nodes to the DAG and add the values into the RetOps list
2281 for (auto &Reg : RegsToPass) {
2282 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2283 Flag = Chain.getValue(1);
2284 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2285 }
2286 }
2287
2288 // The Swift calling convention does not require that we copy the sret argument
2289 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2290
2291 // All x86 ABIs require that for returning structs by value we copy
2292 // the sret argument into %rax/%eax (depending on ABI) for the return.
2293 // We saved the argument into a virtual register in the entry block,
2294 // so now we copy the value out and into %rax/%eax.
2295 //
2296 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2297 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2298 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2299 // either case FuncInfo->setSRetReturnReg() will have been called.
2300 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2301 // When we have both sret and another return value, we should use the
2302 // original Chain stored in RetOps[0], instead of the current Chain updated
2303 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2304
2305 // For the case of sret and another return value, we have
2306 // Chain_0 at the function entry
2307 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2308 // If we use Chain_1 in getCopyFromReg, we will have
2309 // Val = getCopyFromReg(Chain_1)
2310 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2311
2312 // getCopyToReg(Chain_0) will be glued together with
2313 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2314 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2315 // Data dependency from Unit B to Unit A due to usage of Val in
2316 // getCopyToReg(Chain_1, Val)
2317 // Chain dependency from Unit A to Unit B
2318
2319 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2320 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2321 getPointerTy(MF.getDataLayout()));
2322
2323 unsigned RetValReg
2324 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2325 X86::RAX : X86::EAX;
2326 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2327 Flag = Chain.getValue(1);
2328
2329 // RAX/EAX now acts like a return value.
2330 RetOps.push_back(
2331 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2332
2333 // Add the returned register to the CalleeSaveDisableRegs list.
2334 if (ShouldDisableCalleeSavedRegister)
2335 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2336 }
2337
2338 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2339 const MCPhysReg *I =
2340 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2341 if (I) {
2342 for (; *I; ++I) {
2343 if (X86::GR64RegClass.contains(*I))
2344 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2345 else
2346 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2347 }
2348 }
2349
2350 RetOps[0] = Chain; // Update chain.
2351
2352 // Add the flag if we have it.
2353 if (Flag.getNode())
2354 RetOps.push_back(Flag);
2355
2356 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2357 if (CallConv == CallingConv::X86_INTR)
2358 opcode = X86ISD::IRET;
2359 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2360}
2361
2362bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2363 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2364 return false;
2365
2366 SDValue TCChain = Chain;
2367 SDNode *Copy = *N->use_begin();
2368 if (Copy->getOpcode() == ISD::CopyToReg) {
2369 // If the copy has a glue operand, we conservatively assume it isn't safe to
2370 // perform a tail call.
2371 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2372 return false;
2373 TCChain = Copy->getOperand(0);
2374 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2375 return false;
2376
2377 bool HasRet = false;
2378 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2379 UI != UE; ++UI) {
2380 if (UI->getOpcode() != X86ISD::RET_FLAG)
2381 return false;
2382 // If we are returning more than one value, we can definitely
2383 // not make a tail call; see PR19530.
2384 if (UI->getNumOperands() > 4)
2385 return false;
2386 if (UI->getNumOperands() == 4 &&
2387 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2388 return false;
2389 HasRet = true;
2390 }
2391
2392 if (!HasRet)
2393 return false;
2394
2395 Chain = TCChain;
2396 return true;
2397}
2398
2399EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2400 ISD::NodeType ExtendKind) const {
2401 MVT ReturnMVT = MVT::i32;
2402
2403 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2404 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2405 // The ABI does not require i1, i8 or i16 to be extended.
2406 //
2407 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2408 // always extending i8/i16 return values, so keep doing that for now.
2409 // (PR26665).
2410 ReturnMVT = MVT::i8;
2411 }
2412
2413 EVT MinVT = getRegisterType(Context, ReturnMVT);
2414 return VT.bitsLT(MinVT) ? MinVT : VT;
2415}
2416
2417/// Reads two 32 bit registers and creates a 64 bit mask value.
2418 /// \param VA The current 32 bit value that needs to be assigned.
2419 /// \param NextVA The next 32 bit value that needs to be assigned.
2420 /// \param Root The parent DAG node.
2421 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node
2422 /// used for glue purposes. In the case where the DAG is already
2423 /// using a physical register instead of a virtual one, we should
2424 /// glue our new SDValue to the InFlag SDValue.
2425 /// \return a new SDValue of size 64 bits.
2426static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2427 SDValue &Root, SelectionDAG &DAG,
2428 const SDLoc &Dl, const X86Subtarget &Subtarget,
2429 SDValue *InFlag = nullptr) {
2430 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2431 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2432 assert(VA.getValVT() == MVT::v64i1 &&
2433 "Expecting first location of 64 bit width type");
2434 assert(NextVA.getValVT() == VA.getValVT() &&
2435 "The locations should have the same type");
2436 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2437 "The values should reside in two registers");
2438
2439 SDValue Lo, Hi;
2440 unsigned Reg;
2441 SDValue ArgValueLo, ArgValueHi;
2442
2443 MachineFunction &MF = DAG.getMachineFunction();
2444 const TargetRegisterClass *RC = &X86::GR32RegClass;
2445
2446 // Read a 32 bit value from the registers
2447 if (nullptr == InFlag) {
2448 // When no physical register is present,
2449 // create an intermediate virtual register
2450 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2451 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2452 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2453 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2454 } else {
2455 // When a physical register is available read the value from it and glue
2456 // the reads together.
2457 ArgValueLo =
2458 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2459 *InFlag = ArgValueLo.getValue(2);
2460 ArgValueHi =
2461 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2462 *InFlag = ArgValueHi.getValue(2);
2463 }
2464
2465 // Convert the i32 type into v32i1 type
2466 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2467
2468 // Convert the i32 type into v32i1 type
2469 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2470
2471 // Concatenate the two values together
2472 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2473}
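
getv64i1Argument above reassembles a v64i1 mask that the calling convention splits across two 32-bit GPRs, low half first. Below is a standalone sketch of the same reassembly in plain C++, with std::bitset standing in for the v32i1/v64i1 DAG types; the function name and sample values are illustrative, not LLVM API.

#include <bitset>
#include <cassert>
#include <cstdint>

// Reassemble a 64-element mask from two 32-bit register halves, low half
// first, mirroring the CONCAT_VECTORS of the two v32i1 bitcasts above.
static std::bitset<64> concatMaskHalves(uint32_t Lo, uint32_t Hi) {
  std::bitset<64> Mask(Lo);           // elements 0..31 come from the low reg
  Mask |= std::bitset<64>(Hi) << 32;  // elements 32..63 come from the high reg
  return Mask;
}

int main() {
  std::bitset<64> M = concatMaskHalves(0x0000000Fu, 0x80000000u);
  assert(M.test(0) && M.test(3) && !M.test(4) && M.test(63));
  return 0;
}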
2474
2475/// The function will lower a register of various sizes (8/16/32/64)
2476/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
2477/// \returns a DAG node containing the operand after lowering to mask type.
2478static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2479 const EVT &ValLoc, const SDLoc &Dl,
2480 SelectionDAG &DAG) {
2481 SDValue ValReturned = ValArg;
2482
2483 if (ValVT == MVT::v1i1)
2484 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2485
2486 if (ValVT == MVT::v64i1) {
2487 // On a 32 bit machine, this case is handled by getv64i1Argument
2488 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2489 // On a 64 bit machine, there is no need to truncate the value, only bitcast it
2490 } else {
2491 MVT maskLen;
2492 switch (ValVT.getSimpleVT().SimpleTy) {
2493 case MVT::v8i1:
2494 maskLen = MVT::i8;
2495 break;
2496 case MVT::v16i1:
2497 maskLen = MVT::i16;
2498 break;
2499 case MVT::v32i1:
2500 maskLen = MVT::i32;
2501 break;
2502 default:
2503 llvm_unreachable("Expecting a vector of i1 types");
2504 }
2505
2506 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2507 }
2508 return DAG.getBitcast(ValVT, ValReturned);
2509}
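
lowerRegToMasks truncates the scalar location value down to the mask width and then bitcasts it to the v*i1 type. The sketch below shows the same truncate-then-reinterpret step on plain integers; the template and its name are illustrative and not part of the LLVM code above.

#include <bitset>
#include <cassert>
#include <cstdint>

// Model of the i64/i32 -> v*i1 lowering: keep only the low NumElts bits of
// the location value, then treat each remaining bit as one i1 element.
template <unsigned NumElts>
static std::bitset<NumElts> regToMask(uint64_t LocVal) {
  return std::bitset<NumElts>(LocVal);  // bitset discards bits above NumElts
}

int main() {
  // An i64 location carrying a v16i1 mask: the high 48 bits are dropped.
  std::bitset<16> M = regToMask<16>(0xFFFFFFFFFFFF0005ull);
  assert(M.count() == 2 && M.test(0) && M.test(2) && !M.test(1));
  return 0;
}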
2510
2511/// Lower the result values of a call into the
2512/// appropriate copies out of appropriate physical registers.
2513///
2514SDValue X86TargetLowering::LowerCallResult(
2515 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2516 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2517 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2518 uint32_t *RegMask) const {
2519
2520 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2521 // Assign locations to each value returned by this call.
2522 SmallVector<CCValAssign, 16> RVLocs;
2523 bool Is64Bit = Subtarget.is64Bit();
2524 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2525 *DAG.getContext());
2526 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2527
2528 // Copy all of the result registers out of their specified physreg.
2529 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2530 ++I, ++InsIndex) {
2531 CCValAssign &VA = RVLocs[I];
2532 EVT CopyVT = VA.getLocVT();
2533
2534 // In some calling conventions we need to remove the used registers
2535 // from the register mask.
2536 if (RegMask) {
2537 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2538 SubRegs.isValid(); ++SubRegs)
2539 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2540 }
2541
2542 // If this is x86-64, and we disabled SSE, we can't return FP values
2543 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2544 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2545 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2546 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2547 }
2548
2549 // If we prefer to use the value in xmm registers, copy it out as f80 and
2550 // use a truncate to move it from fp stack reg to xmm reg.
2551 bool RoundAfterCopy = false;
2552 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2553 isScalarFPTypeInSSEReg(VA.getValVT())) {
2554 if (!Subtarget.hasX87())
2555 report_fatal_error("X87 register return with X87 disabled");
2556 CopyVT = MVT::f80;
2557 RoundAfterCopy = (CopyVT != VA.getLocVT());
2558 }
2559
2560 SDValue Val;
2561 if (VA.needsCustom()) {
2562 assert(VA.getValVT() == MVT::v64i1 &&
2563        "Currently the only custom case is when we split v64i1 to 2 regs");
2564 Val =
2565 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2566 } else {
2567 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2568 .getValue(1);
2569 Val = Chain.getValue(0);
2570 InFlag = Chain.getValue(2);
2571 }
2572
2573 if (RoundAfterCopy)
2574 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2575 // This truncation won't change the value.
2576 DAG.getIntPtrConstant(1, dl));
2577
2578 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2579 if (VA.getValVT().isVector() &&
2580 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2581 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2582 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2583 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2584 } else
2585 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2586 }
2587
2588 InVals.push_back(Val);
2589 }
2590
2591 return Chain;
2592}
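
The RegMask adjustment in LowerCallResult clears the bit of every (sub)register that carries a return value, using word-index/bit-index arithmetic over an array of 32-bit words. A minimal standalone sketch of that bit manipulation follows; the register numbers and the helper name are made up for illustration.

#include <cassert>
#include <cstdint>

// Clear one register's bit in a packed mask: word RegNo/32, bit RegNo%32.
static void clearRegFromMask(uint32_t *Mask, unsigned RegNo) {
  Mask[RegNo / 32] &= ~(1u << (RegNo % 32));
}

int main() {
  uint32_t Mask[2] = {0xFFFFFFFFu, 0xFFFFFFFFu}; // all registers preserved
  clearRegFromMask(Mask, 5);   // word 0, bit 5
  clearRegFromMask(Mask, 40);  // word 1, bit 8
  assert((Mask[0] & (1u << 5)) == 0 && (Mask[1] & (1u << 8)) == 0);
  return 0;
}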
2593
2594//===----------------------------------------------------------------------===//
2595// C & StdCall & Fast Calling Convention implementation
2596//===----------------------------------------------------------------------===//
2597// The StdCall calling convention is the standard for many Windows API
2598// routines. It differs from the C calling convention only slightly: the
2599// callee cleans up the stack, not the caller, and symbols are also
2600// decorated. It doesn't support any vector arguments.
2601// For info on fast calling convention see Fast Calling Convention (tail call)
2602// implementation LowerX86_32FastCCCallTo.
2603
2604/// CallIsStructReturn - Determines whether a call uses struct return
2605/// semantics.
2606enum StructReturnType {
2607 NotStructReturn,
2608 RegStructReturn,
2609 StackStructReturn
2610};
2611static StructReturnType
2612callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2613 if (Outs.empty())
2614 return NotStructReturn;
2615
2616 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2617 if (!Flags.isSRet())
2618 return NotStructReturn;
2619 if (Flags.isInReg() || IsMCU)
2620 return RegStructReturn;
2621 return StackStructReturn;
2622}
2623
2624/// Determines whether a function uses struct return semantics.
2625static StructReturnType
2626argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2627 if (Ins.empty())
2628 return NotStructReturn;
2629
2630 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2631 if (!Flags.isSRet())
2632 return NotStructReturn;
2633 if (Flags.isInReg() || IsMCU)
2634 return RegStructReturn;
2635 return StackStructReturn;
2636}
2637
2638/// Make a copy of an aggregate at address specified by "Src" to address
2639/// "Dst" with size and alignment information specified by the specific
2640/// parameter attribute. The copy will be passed as a byval function parameter.
2641static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2642 SDValue Chain, ISD::ArgFlagsTy Flags,
2643 SelectionDAG &DAG, const SDLoc &dl) {
2644 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2645
2646 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2647 /*isVolatile*/false, /*AlwaysInline=*/true,
2648 /*isTailCall*/false,
2649 MachinePointerInfo(), MachinePointerInfo());
2650}
2651
2652/// Return true if the calling convention is one that we can guarantee TCO for.
2653static bool canGuaranteeTCO(CallingConv::ID CC) {
2654 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2655 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2656 CC == CallingConv::HHVM);
2657}
2658
2659/// Return true if we might ever do TCO for calls with this calling convention.
2660static bool mayTailCallThisCC(CallingConv::ID CC) {
2661 switch (CC) {
2662 // C calling conventions:
2663 case CallingConv::C:
2664 case CallingConv::Win64:
2665 case CallingConv::X86_64_SysV:
2666 // Callee pop conventions:
2667 case CallingConv::X86_ThisCall:
2668 case CallingConv::X86_StdCall:
2669 case CallingConv::X86_VectorCall:
2670 case CallingConv::X86_FastCall:
2671 return true;
2672 default:
2673 return canGuaranteeTCO(CC);
2674 }
2675}
2676
2677/// Return true if the function is being made into a tailcall target by
2678/// changing its ABI.
2679static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2680 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2681}
2682
2683bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2684 auto Attr =
2685 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2686 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2687 return false;
2688
2689 ImmutableCallSite CS(CI);
2690 CallingConv::ID CalleeCC = CS.getCallingConv();
2691 if (!mayTailCallThisCC(CalleeCC))
2692 return false;
2693
2694 return true;
2695}
2696
2697SDValue
2698X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2699 const SmallVectorImpl<ISD::InputArg> &Ins,
2700 const SDLoc &dl, SelectionDAG &DAG,
2701 const CCValAssign &VA,
2702 MachineFrameInfo &MFI, unsigned i) const {
2703 // Create the nodes corresponding to a load from this parameter slot.
2704 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2705 bool AlwaysUseMutable = shouldGuaranteeTCO(
2706 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2707 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2708 EVT ValVT;
2709 MVT PtrVT = getPointerTy(DAG.getDataLayout());
2710
2711 // If value is passed by pointer we have address passed instead of the value
2712 // itself. No need to extend if the mask value and location share the same
2713 // absolute size.
2714 bool ExtendedInMem =
2715 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2716 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2717
2718 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2719 ValVT = VA.getLocVT();
2720 else
2721 ValVT = VA.getValVT();
2722
2723 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2724 // taken by a return address.
2725 int Offset = 0;
2726 if (CallConv == CallingConv::X86_INTR) {
2727 // X86 interrupts may take one or two arguments.
2728 // Unlike a regular call, there is no return address on the stack.
2729 // The offset of the last argument needs to be set to -4/-8 bytes, while
2730 // the offset of the first argument (when there are two) should be 0 bytes.
2731 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2732 if (Subtarget.is64Bit() && Ins.size() == 2) {
2733 // The stack pointer needs to be realigned for 64 bit handlers with error
2734 // code, so the argument offset changes by 8 bytes.
2735 Offset += 8;
2736 }
2737 }
2738
2739 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2740 // changed with more analysis.
2741 // In case of tail call optimization, mark all arguments mutable, since they
2742 // could be overwritten by the lowering of arguments in case of a tail call.
2743 if (Flags.isByVal()) {
2744 unsigned Bytes = Flags.getByValSize();
2745 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2746 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2747 // Adjust SP offset of interrupt parameter.
2748 if (CallConv == CallingConv::X86_INTR) {
2749 MFI.setObjectOffset(FI, Offset);
2750 }
2751 return DAG.getFrameIndex(FI, PtrVT);
2752 }
2753
2754 // This is an argument in memory. We might be able to perform copy elision.
2755 if (Flags.isCopyElisionCandidate()) {
2756 EVT ArgVT = Ins[i].ArgVT;
2757 SDValue PartAddr;
2758 if (Ins[i].PartOffset == 0) {
2759 // If this is a one-part value or the first part of a multi-part value,
2760 // create a stack object for the entire argument value type and return a
2761 // load from our portion of it. This assumes that if the first part of an
2762 // argument is in memory, the rest will also be in memory.
2763 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2764 /*Immutable=*/false);
2765 PartAddr = DAG.getFrameIndex(FI, PtrVT);
2766 return DAG.getLoad(
2767 ValVT, dl, Chain, PartAddr,
2768 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2769 } else {
2770 // This is not the first piece of an argument in memory. See if there is
2771 // already a fixed stack object including this offset. If so, assume it
2772 // was created by the PartOffset == 0 branch above and create a load from
2773 // the appropriate offset into it.
2774 int64_t PartBegin = VA.getLocMemOffset();
2775 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2776 int FI = MFI.getObjectIndexBegin();
2777 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2778 int64_t ObjBegin = MFI.getObjectOffset(FI);
2779 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2780 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2781 break;
2782 }
2783 if (MFI.isFixedObjectIndex(FI)) {
2784 SDValue Addr =
2785 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2786 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2787 return DAG.getLoad(
2788 ValVT, dl, Chain, Addr,
2789 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2790 Ins[i].PartOffset));
2791 }
2792 }
2793 }
2794
2795 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2796 VA.getLocMemOffset(), isImmutable);
2797
2798 // Set SExt or ZExt flag.
2799 if (VA.getLocInfo() == CCValAssign::ZExt) {
2800 MFI.setObjectZExt(FI, true);
2801 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2802 MFI.setObjectSExt(FI, true);
2803 }
2804
2805 // Adjust SP offset of interrupt parameter.
2806 if (CallConv == CallingConv::X86_INTR) {
2807 MFI.setObjectOffset(FI, Offset);
2808 }
2809
2810 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2811 SDValue Val = DAG.getLoad(
2812 ValVT, dl, Chain, FIN,
2813 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2814 return ExtendedInMem
2815 ? (VA.getValVT().isVector()
2816 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2817 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2818 : Val;
2819}
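
The X86_INTR branch of LowerMemArgument places interrupt-handler arguments relative to a frame that has no return address slot. The sketch below mirrors that offset arithmetic as an ordinary function so the -4/-8, 0 and +8 cases can be checked directly; the function name and the sample calls are illustrative only.

#include <cassert>

// Offset of interrupt argument ArgIdx out of NumArgs (1 or 2), as above:
// the last argument lands one slot below, and 64-bit handlers with an
// error code shift everything by 8 bytes for stack realignment.
static int interruptArgOffset(bool Is64Bit, unsigned ArgIdx, unsigned NumArgs) {
  int Slot = Is64Bit ? 8 : 4;
  int Offset = Slot * (static_cast<int>((ArgIdx + 1) % NumArgs) - 1);
  if (Is64Bit && NumArgs == 2)
    Offset += 8;
  return Offset;
}

int main() {
  assert(interruptArgOffset(false, 0, 1) == -4); // 32-bit, no error code
  assert(interruptArgOffset(true, 0, 2) == 8);   // 64-bit, first argument
  assert(interruptArgOffset(true, 1, 2) == 0);   // 64-bit, error code
  return 0;
}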
2820
2821// FIXME: Get this from tablegen.
2822static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2823 const X86Subtarget &Subtarget) {
2824 assert(Subtarget.is64Bit());
2825
2826 if (Subtarget.isCallingConvWin64(CallConv)) {
2827 static const MCPhysReg GPR64ArgRegsWin64[] = {
2828 X86::RCX, X86::RDX, X86::R8, X86::R9
2829 };
2830 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2831 }
2832
2833 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2834 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2835 };
2836 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2837}
2838
2839// FIXME: Get this from tablegen.
2840static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2841 CallingConv::ID CallConv,
2842 const X86Subtarget &Subtarget) {
2843 assert(Subtarget.is64Bit());
2844 if (Subtarget.isCallingConvWin64(CallConv)) {
2845 // The XMM registers which might contain var arg parameters are shadowed
2846 // in their paired GPR. So we only need to save the GPR to their home
2847 // slots.
2848 // TODO: __vectorcall will change this.
2849 return None;
2850 }
2851
2852 const Function *Fn = MF.getFunction();
2853 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2854 bool isSoftFloat = Subtarget.useSoftFloat();
2855 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2856        "SSE register cannot be used when SSE is disabled!");
2857 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2858 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2859 // registers.
2860 return None;
2861
2862 static const MCPhysReg XMMArgRegs64Bit[] = {
2863 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2864 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2865 };
2866 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2867}
2868
2869#ifndef NDEBUG
2870static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2871 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2872 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2873 return A.getValNo() < B.getValNo();
2874 });
2875}
2876#endif
2877
2878SDValue X86TargetLowering::LowerFormalArguments(
2879 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2880 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2881 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2882 MachineFunction &MF = DAG.getMachineFunction();
2883 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2884 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2885
2886 const Function *Fn = MF.getFunction();
2887 if (Fn->hasExternalLinkage() &&
2888 Subtarget.isTargetCygMing() &&
2889 Fn->getName() == "main")
2890 FuncInfo->setForceFramePointer(true);
2891
2892 MachineFrameInfo &MFI = MF.getFrameInfo();
2893 bool Is64Bit = Subtarget.is64Bit();
2894 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2895
2896 assert(
2897     !(isVarArg && canGuaranteeTCO(CallConv)) &&
2898     "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2899
2900 if (CallConv == CallingConv::X86_INTR) {
2901 bool isLegal = Ins.size() == 1 ||
2902 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2903 (!Is64Bit && Ins[1].VT == MVT::i32)));
2904 if (!isLegal)
2905 report_fatal_error("X86 interrupts may take one or two arguments");
2906 }
2907
2908 // Assign locations to all of the incoming arguments.
2909 SmallVector<CCValAssign, 16> ArgLocs;
2910 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2911
2912 // Allocate shadow area for Win64.
2913 if (IsWin64)
2914 CCInfo.AllocateStack(32, 8);
2915
2916 CCInfo.AnalyzeArguments(Ins, CC_X86);
2917
2918 // In vectorcall calling convention a second pass is required for the HVA
2919 // types.
2920 if (CallingConv::X86_VectorCall == CallConv) {
2921 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2922 }
2923
2924 // The next loop assumes that the locations are in the same order as the
2925 // input arguments.
2926 assert(isSortedByValueNo(ArgLocs) &&
2927        "Argument Location list must be sorted before lowering");
2928
2929 SDValue ArgValue;
2930 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2931 ++I, ++InsIndex) {
2932 assert(InsIndex < Ins.size() && "Invalid Ins index");
2933 CCValAssign &VA = ArgLocs[I];
2934
2935 if (VA.isRegLoc()) {
2936 EVT RegVT = VA.getLocVT();
2937 if (VA.needsCustom()) {
2938 assert(
2939     VA.getValVT() == MVT::v64i1 &&
2940     "Currently the only custom case is when we split v64i1 to 2 regs");
2941
2942 // v64i1 values, in regcall calling convention, that are
2943 // compiled to 32 bit arch, are split up into two registers.
2944 ArgValue =
2945 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2946 } else {
2947 const TargetRegisterClass *RC;
2948 if (RegVT == MVT::i32)
2949 RC = &X86::GR32RegClass;
2950 else if (Is64Bit && RegVT == MVT::i64)
2951 RC = &X86::GR64RegClass;
2952 else if (RegVT == MVT::f32)
2953 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2954 else if (RegVT == MVT::f64)
2955 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2956 else if (RegVT == MVT::f80)
2957 RC = &X86::RFP80RegClass;
2958 else if (RegVT == MVT::f128)
2959 RC = &X86::FR128RegClass;
2960 else if (RegVT.is512BitVector())
2961 RC = &X86::VR512RegClass;
2962 else if (RegVT.is256BitVector())
2963 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
2964 else if (RegVT.is128BitVector())
2965 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
2966 else if (RegVT == MVT::x86mmx)
2967 RC = &X86::VR64RegClass;
2968 else if (RegVT == MVT::v1i1)
2969 RC = &X86::VK1RegClass;
2970 else if (RegVT == MVT::v8i1)
2971 RC = &X86::VK8RegClass;
2972 else if (RegVT == MVT::v16i1)
2973 RC = &X86::VK16RegClass;
2974 else if (RegVT == MVT::v32i1)
2975 RC = &X86::VK32RegClass;
2976 else if (RegVT == MVT::v64i1)
2977 RC = &X86::VK64RegClass;
2978 else
2979 llvm_unreachable("Unknown argument type!");
2980
2981 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2982 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2983 }
2984
2985 // If this is an 8 or 16-bit value, it is really passed promoted to 32
2986 // bits. Insert an assert[sz]ext to capture this, then truncate to the
2987 // right size.
2988 if (VA.getLocInfo() == CCValAssign::SExt)
2989 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2990 DAG.getValueType(VA.getValVT()));
2991 else if (VA.getLocInfo() == CCValAssign::ZExt)
2992 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2993 DAG.getValueType(VA.getValVT()));
2994 else if (VA.getLocInfo() == CCValAssign::BCvt)
2995 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
2996
2997 if (VA.isExtInLoc()) {
2998 // Handle MMX values passed in XMM regs.
2999 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3000 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3001 else if (VA.getValVT().isVector() &&
3002 VA.getValVT().getScalarType() == MVT::i1 &&
3003 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3004 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3005 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3006 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3007 } else
3008 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3009 }
3010 } else {
3011 assert(VA.isMemLoc());
3012 ArgValue =
3013 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3014 }
3015
3016 // If value is passed via pointer - do a load.
3017 if (VA.getLocInfo() == CCValAssign::Indirect)
3018 ArgValue =
3019 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3020
3021 InVals.push_back(ArgValue);
3022 }
3023
3024 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3025 // Swift calling convention does not require we copy the sret argument
3026 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3027 if (CallConv == CallingConv::Swift)
3028 continue;
3029
3030 // All x86 ABIs require that for returning structs by value we copy the
3031 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3032 // the argument into a virtual register so that we can access it from the
3033 // return points.
3034 if (Ins[I].Flags.isSRet()) {
3035 unsigned Reg = FuncInfo->getSRetReturnReg();
3036 if (!Reg) {
3037 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3038 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3039 FuncInfo->setSRetReturnReg(Reg);
3040 }
3041 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3042 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3043 break;
3044 }
3045 }
3046
3047 unsigned StackSize = CCInfo.getNextStackOffset();
3048 // Align stack specially for tail calls.
3049 if (shouldGuaranteeTCO(CallConv,
3050 MF.getTarget().Options.GuaranteedTailCallOpt))
3051 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3052
3053 // If the function takes variable number of arguments, make a frame index for
3054 // the start of the first vararg value... for expansion of llvm.va_start. We
3055 // can skip this if there are no va_start calls.
3056 if (MFI.hasVAStart() &&
3057 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3058 CallConv != CallingConv::X86_ThisCall))) {
3059 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3060 }
3061
3062 // Figure out if XMM registers are in use.
3063 assert(!(Subtarget.useSoftFloat() &&
3064          Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
3065        "SSE register cannot be used when SSE is disabled!");
3066
3067 // 64-bit calling conventions support varargs and register parameters, so we
3068 // have to do extra work to spill them in the prologue.
3069 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3070 // Find the first unallocated argument registers.
3071 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3072 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3073 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3074 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3075 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3076        "SSE register cannot be used when SSE is disabled!");
3077
3078 // Gather all the live in physical registers.
3079 SmallVector<SDValue, 6> LiveGPRs;
3080 SmallVector<SDValue, 8> LiveXMMRegs;
3081 SDValue ALVal;
3082 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3083 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3084 LiveGPRs.push_back(
3085 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3086 }
3087 if (!ArgXMMs.empty()) {
3088 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3089 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3090 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3091 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3092 LiveXMMRegs.push_back(
3093 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3094 }
3095 }
3096
3097 if (IsWin64) {
3098 // Get to the caller-allocated home save location. Add 8 to account
3099 // for the return address.
3100 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3101 FuncInfo->setRegSaveFrameIndex(
3102 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3103 // Fixup to set vararg frame on shadow area (4 x i64).
3104 if (NumIntRegs < 4)
3105 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3106 } else {
3107 // For X86-64, if there are vararg parameters that are passed via
3108 // registers, then we must store them to their spots on the stack so
3109 // they may be loaded by dereferencing the result of va_next.
3110 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3111 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3112 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3113 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3114 }
3115
3116 // Store the integer parameter registers.
3117 SmallVector<SDValue, 8> MemOps;
3118 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3119 getPointerTy(DAG.getDataLayout()));
3120 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3121 for (SDValue Val : LiveGPRs) {
3122 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3123 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3124 SDValue Store =
3125 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3126 MachinePointerInfo::getFixedStack(
3127 DAG.getMachineFunction(),
3128 FuncInfo->getRegSaveFrameIndex(), Offset));
3129 MemOps.push_back(Store);
3130 Offset += 8;
3131 }
3132
3133 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3134 // Now store the XMM (fp + vector) parameter registers.
3135 SmallVector<SDValue, 12> SaveXMMOps;
3136 SaveXMMOps.push_back(Chain);
3137 SaveXMMOps.push_back(ALVal);
3138 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3139 FuncInfo->getRegSaveFrameIndex(), dl));
3140 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3141 FuncInfo->getVarArgsFPOffset(), dl));
3142 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3143 LiveXMMRegs.end());
3144 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3145 MVT::Other, SaveXMMOps));
3146 }
3147
3148 if (!MemOps.empty())
3149 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3150 }
3151
3152 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3153 // Find the largest legal vector type.
3154 MVT VecVT = MVT::Other;
3155 // FIXME: Only some x86_32 calling conventions support AVX512.
3156 if (Subtarget.hasAVX512() &&
3157 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3158 CallConv == CallingConv::Intel_OCL_BI)))
3159 VecVT = MVT::v16f32;
3160 else if (Subtarget.hasAVX())
3161 VecVT = MVT::v8f32;
3162 else if (Subtarget.hasSSE2())
3163 VecVT = MVT::v4f32;
3164
3165 // We forward some GPRs and some vector types.
3166 SmallVector<MVT, 2> RegParmTypes;
3167 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3168 RegParmTypes.push_back(IntVT);
3169 if (VecVT != MVT::Other)
3170 RegParmTypes.push_back(VecVT);
3171
3172 // Compute the set of forwarded registers. The rest are scratch.
3173 SmallVectorImpl<ForwardedRegister> &Forwards =
3174 FuncInfo->getForwardedMustTailRegParms();
3175 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3176
3177 // Conservatively forward AL on x86_64, since it might be used for varargs.
3178 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3179 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3180 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3181 }
3182
3183 // Copy all forwards from physical to virtual registers.
3184 for (ForwardedRegister &F : Forwards) {
3185 // FIXME: Can we use a less constrained schedule?
3186 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3187 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3188 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3189 }
3190 }
3191
3192 // Some CCs need callee pop.
3193 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3194 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3195 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3196 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3197 // X86 interrupts must pop the error code (and the alignment padding) if
3198 // present.
3199 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3200 } else {
3201 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3202 // If this is an sret function, the return should pop the hidden pointer.
3203 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3204 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3205 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3206 FuncInfo->setBytesToPopOnReturn(4);
3207 }
3208
3209 if (!Is64Bit) {
3210 // RegSaveFrameIndex is X86-64 only.
3211 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3212 if (CallConv == CallingConv::X86_FastCall ||
3213 CallConv == CallingConv::X86_ThisCall)
3214 // fastcc functions can't have varargs.
3215 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3216 }
3217
3218 FuncInfo->setArgumentStackSize(StackSize);
3219
3220 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3221 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3222 if (Personality == EHPersonality::CoreCLR) {
3223 assert(Is64Bit);
3224 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3225 // that we'd prefer this slot be allocated towards the bottom of the frame
3226 // (i.e. near the stack pointer after allocating the frame). Every
3227 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3228 // offset from the bottom of this and each funclet's frame must be the
3229 // same, so the size of funclets' (mostly empty) frames is dictated by
3230 // how far this slot is from the bottom (since they allocate just enough
3231 // space to accommodate holding this slot at the correct offset).
3232 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3233 EHInfo->PSPSymFrameIdx = PSPSymFI;
3234 }
3235 }
3236
3237 if (CallConv == CallingConv::X86_RegCall ||
3238 Fn->hasFnAttribute("no_caller_saved_registers")) {
3239 MachineRegisterInfo &MRI = MF.getRegInfo();
3240 for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
3241 MRI.disableCalleeSavedRegister(Pair.first);
3242 }
3243
3244 return Chain;
3245}
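
For SysV x86-64 varargs, the code above reserves a register save area of six 8-byte GPR slots followed by eight 16-byte XMM slots and records how far into each block the named arguments already reached. A standalone sketch of that layout arithmetic; the struct and its field names are illustrative, not the X86MachineFunctionInfo API.

#include <cassert>

struct RegSaveAreaLayout {
  unsigned GPOffset;   // offset of the next free GPR slot (NumIntRegs * 8)
  unsigned FPOffset;   // offset of the next free XMM slot (6*8 + NumXMMRegs*16)
  unsigned TotalSize;  // bytes reserved for the whole save area
};

static RegSaveAreaLayout layoutRegSaveArea(unsigned NumIntRegsUsed,
                                           unsigned NumXMMRegsUsed) {
  const unsigned NumGPRs = 6, NumXMMs = 8; // SysV argument register counts
  RegSaveAreaLayout L;
  L.GPOffset = NumIntRegsUsed * 8;
  L.FPOffset = NumGPRs * 8 + NumXMMRegsUsed * 16;
  L.TotalSize = NumGPRs * 8 + NumXMMs * 16; // 176 bytes
  return L;
}

int main() {
  RegSaveAreaLayout L = layoutRegSaveArea(/*ints used*/2, /*xmms used*/1);
  assert(L.GPOffset == 16 && L.FPOffset == 64 && L.TotalSize == 176);
  return 0;
}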
3246
3247SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3248 SDValue Arg, const SDLoc &dl,
3249 SelectionDAG &DAG,
3250 const CCValAssign &VA,
3251 ISD::ArgFlagsTy Flags) const {
3252 unsigned LocMemOffset = VA.getLocMemOffset();
3253 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3254 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3255 StackPtr, PtrOff);
3256 if (Flags.isByVal())
3257 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3258
3259 return DAG.getStore(
3260 Chain, dl, Arg, PtrOff,
3261 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3262}
3263
3264/// Emit a load of return address if tail call
3265/// optimization is performed and it is required.
3266SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3267 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3268 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3269 // Adjust the Return address stack slot.
3270 EVT VT = getPointerTy(DAG.getDataLayout());
3271 OutRetAddr = getReturnAddressFrameIndex(DAG);
3272
3273 // Load the "old" Return address.
3274 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3275 return SDValue(OutRetAddr.getNode(), 1);
3276}
3277
3278/// Emit a store of the return address if tail call
3279/// optimization is performed and it is required (FPDiff!=0).
3280static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3281 SDValue Chain, SDValue RetAddrFrIdx,
3282 EVT PtrVT, unsigned SlotSize,
3283 int FPDiff, const SDLoc &dl) {
3284 // Store the return address to the appropriate stack slot.
3285 if (!FPDiff) return Chain;
3286 // Calculate the new stack slot for the return address.
3287 int NewReturnAddrFI =
3288 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3289 false);
3290 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3291 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3292 MachinePointerInfo::getFixedStack(
3293 DAG.getMachineFunction(), NewReturnAddrFI));
3294 return Chain;
3295}
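
EmitTailCallStoreRetAddr re-stores the return address only when FPDiff is non-zero, at a fixed object placed FPDiff minus one slot below the incoming slot; FPDiff itself is computed further down in LowerCall as the bytes the caller will pop on return minus the callee's aligned call frame size. A small standalone sketch of that arithmetic; the sample byte counts are made up.

#include <cassert>

// Offset of the relocated return-address slot for a guaranteed tail call,
// or 0 when the frame sizes match and no re-store is needed.
static int newRetAddrSlotOffset(int CallerArgBytes, int CalleeArgBytes,
                                int SlotSize) {
  int FPDiff = CallerArgBytes - CalleeArgBytes; // as computed in LowerCall
  if (FPDiff == 0)
    return 0;
  return FPDiff - SlotSize; // offset of the new fixed stack object
}

int main() {
  // Caller reserved 16 bytes of arguments, tail callee needs 32, 8-byte slots:
  assert(newRetAddrSlotOffset(16, 32, 8) == -24);
  assert(newRetAddrSlotOffset(32, 32, 8) == 0);
  return 0;
}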
3296
3297/// Returns a vector_shuffle mask for a movs{s|d}, movd
3298/// operation of specified width.
3299static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3300 SDValue V2) {
3301 unsigned NumElems = VT.getVectorNumElements();
3302 SmallVector<int, 8> Mask;
3303 Mask.push_back(NumElems);
3304 for (unsigned i = 1; i != NumElems; ++i)
3305 Mask.push_back(i);
3306 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3307}
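
getMOVL builds the shuffle mask [N, 1, 2, ..., N-1]: lane 0 is taken from the second operand and the remaining lanes are kept from the first, matching movss/movsd semantics. A standalone sketch of the mask construction, using std::vector instead of SmallVector.

#include <cassert>
#include <vector>

// Mask indices below NumElems pick from the first vector, indices at or
// above NumElems pick from the second; so NumElems means "V2, element 0".
static std::vector<int> movlMask(unsigned NumElems) {
  std::vector<int> Mask;
  Mask.push_back(NumElems);              // lane 0 <- V2[0]
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);                   // lane i <- V1[i]
  return Mask;
}

int main() {
  assert(movlMask(4) == (std::vector<int>{4, 1, 2, 3}));
  return 0;
}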
3308
3309SDValue
3310X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3311 SmallVectorImpl<SDValue> &InVals) const {
3312 SelectionDAG &DAG = CLI.DAG;
3313 SDLoc &dl = CLI.DL;
3314 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3315 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3316 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3317 SDValue Chain = CLI.Chain;
3318 SDValue Callee = CLI.Callee;
3319 CallingConv::ID CallConv = CLI.CallConv;
3320 bool &isTailCall = CLI.IsTailCall;
3321 bool isVarArg = CLI.IsVarArg;
3322
3323 MachineFunction &MF = DAG.getMachineFunction();
3324 bool Is64Bit = Subtarget.is64Bit();
3325 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3326 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3327 bool IsSibcall = false;
3328 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3329 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3330 const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
3331 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3332 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3333 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3334
3335 if (CallConv == CallingConv::X86_INTR)
3336 report_fatal_error("X86 interrupts may not be called directly");
3337
3338 if (Attr.getValueAsString() == "true")
3339 isTailCall = false;
3340
3341 if (Subtarget.isPICStyleGOT() &&
3342 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3343 // If we are using a GOT, disable tail calls to external symbols with
3344 // default visibility. Tail calling such a symbol requires using a GOT
3345 // relocation, which forces early binding of the symbol. This breaks code
3346 // that requires lazy function symbol resolution. Using musttail or
3347 // GuaranteedTailCallOpt will override this.
3348 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3349 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3350 G->getGlobal()->hasDefaultVisibility()))
3351 isTailCall = false;
3352 }
3353
3354 bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
3355 if (IsMustTail) {
3356 // Force this to be a tail call. The verifier rules are enough to ensure
3357 // that we can lower this successfully without moving the return address
3358 // around.
3359 isTailCall = true;
3360 } else if (isTailCall) {
3361 // Check if it's really possible to do a tail call.
3362 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3363 isVarArg, SR != NotStructReturn,
3364 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3365 Outs, OutVals, Ins, DAG);
3366
3367 // Sibcalls are automatically detected tailcalls which do not require
3368 // ABI changes.
3369 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3370 IsSibcall = true;
3371
3372 if (isTailCall)
3373 ++NumTailCalls;
3374 }
3375
3376 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3377        "Var args not supported with calling convention fastcc, ghc or hipe");
3378
3379 // Analyze operands of the call, assigning locations to each operand.
3380 SmallVector<CCValAssign, 16> ArgLocs;
3381 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3382
3383 // Allocate shadow area for Win64.
3384 if (IsWin64)
3385 CCInfo.AllocateStack(32, 8);
3386
3387 CCInfo.AnalyzeArguments(Outs, CC_X86);
3388
3389 // In vectorcall calling convention a second pass is required for the HVA
3390 // types.
3391 if (CallingConv::X86_VectorCall == CallConv) {
3392 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3393 }
3394
3395 // Get a count of how many bytes are to be pushed on the stack.
3396 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3397 if (IsSibcall)
3398 // This is a sibcall. The memory operands are available in caller's
3399 // own caller's stack.
3400 NumBytes = 0;
3401 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3402 canGuaranteeTCO(CallConv))
3403 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3404
3405 int FPDiff = 0;
3406 if (isTailCall && !IsSibcall && !IsMustTail) {
3407 // Lower arguments at fp - stackoffset + fpdiff.
3408 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3409
3410 FPDiff = NumBytesCallerPushed - NumBytes;
3411
3412 // Set the delta of movement of the returnaddr stackslot.
3413 // But only set if delta is greater than previous delta.
3414 if (FPDiff < X86Info->getTCReturnAddrDelta())
3415 X86Info->setTCReturnAddrDelta(FPDiff);
3416 }
3417
3418 unsigned NumBytesToPush = NumBytes;
3419 unsigned NumBytesToPop = NumBytes;
3420
3421 // If we have an inalloca argument, all stack space has already been allocated
3422 // for us and is right at the top of the stack. We don't support multiple
3423 // arguments passed in memory when using inalloca.
3424 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3425 NumBytesToPush = 0;
3426 if (!ArgLocs.back().isMemLoc())
3427 report_fatal_error("cannot use inalloca attribute on a register "
3428 "parameter");
3429 if (ArgLocs.back().getLocMemOffset() != 0)
3430 report_fatal_error("any parameter with the inalloca attribute must be "
3431 "the only memory argument");
3432 }
3433
3434 if (!IsSibcall)
3435 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3436 NumBytes - NumBytesToPush, dl);
3437
3438 SDValue RetAddrFrIdx;
3439 // Load return address for tail calls.
3440 if (isTailCall && FPDiff)
3441 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3442 Is64Bit, FPDiff, dl);
3443
3444 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3445 SmallVector<SDValue, 8> MemOpChains;
3446 SDValue StackPtr;
3447
3448 // The next loop assumes that the locations are in the same order as the
3449 // input arguments.
3450 assert(isSortedByValueNo(ArgLocs) &&
3451        "Argument Location list must be sorted before lowering");
3452
3453 // Walk the register/memloc assignments, inserting copies/loads. In the case
3454 // of tail call optimization arguments are handled later.
3455 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3456 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3457 ++I, ++OutIndex) {
3458 assert(OutIndex < Outs.size() && "Invalid Out index");
3459 // Skip inalloca arguments, they have already been written.
3460 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3461 if (Flags.isInAlloca())
3462 continue;
3463
3464 CCValAssign &VA = ArgLocs[I];
3465 EVT RegVT = VA.getLocVT();
3466 SDValue Arg = OutVals[OutIndex];
3467 bool isByVal = Flags.isByVal();
3468
3469 // Promote the value if needed.
3470 switch (VA.getLocInfo()) {
3471 default: llvm_unreachable("Unknown loc info!");
3472 case CCValAssign::Full: break;
3473 case CCValAssign::SExt:
3474 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3475 break;
3476 case CCValAssign::ZExt:
3477 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3478 break;
3479 case CCValAssign::AExt:
3480 if (Arg.getValueType().isVector() &&
3481 Arg.getValueType().getVectorElementType() == MVT::i1)
3482 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3483 else if (RegVT.is128BitVector()) {
3484 // Special case: passing MMX values in XMM registers.
3485 Arg = DAG.getBitcast(MVT::i64, Arg);
3486 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3487 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3488 } else
3489 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3490 break;
3491 case CCValAssign::BCvt:
3492 Arg = DAG.getBitcast(RegVT, Arg);
3493 break;
3494 case CCValAssign::Indirect: {
3495 // Store the argument.
3496 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3497 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3498 Chain = DAG.getStore(
3499 Chain, dl, Arg, SpillSlot,
3500 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3501 Arg = SpillSlot;
3502 break;
3503 }
3504 }
3505
3506 if (VA.needsCustom()) {
3507 assert(VA.getValVT() == MVT::v64i1 &&
3508        "Currently the only custom case is when we split v64i1 to 2 regs");
3509 // Split v64i1 value into two registers
3510 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3511 Subtarget);
3512 } else if (VA.isRegLoc()) {
3513 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3514 if (isVarArg && IsWin64) {
3515 // Win64 ABI requires argument XMM reg to be copied to the corresponding
3516 // shadow reg if callee is a varargs function.
3517 unsigned ShadowReg = 0;
3518 switch (VA.getLocReg()) {
3519 case X86::XMM0: ShadowReg = X86::RCX; break;
3520 case X86::XMM1: ShadowReg = X86::RDX; break;
3521 case X86::XMM2: ShadowReg = X86::R8; break;
3522 case X86::XMM3: ShadowReg = X86::R9; break;
3523 }
3524 if (ShadowReg)
3525 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3526 }
3527 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3528 assert(VA.isMemLoc());
3529 if (!StackPtr.getNode())
3530 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3531 getPointerTy(DAG.getDataLayout()));
3532 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3533 dl, DAG, VA, Flags));
3534 }
3535 }
3536
3537 if (!MemOpChains.empty())
3538 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3539
3540 if (Subtarget.isPICStyleGOT()) {
3541 // ELF / PIC requires GOT in the EBX register before function calls via PLT
3542 // GOT pointer.
3543 if (!isTailCall) {
3544 RegsToPass.push_back(std::make_pair(
3545 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3546 getPointerTy(DAG.getDataLayout()))));
3547 } else {
3548 // If we are tail calling and generating PIC/GOT style code load the
3549 // address of the callee into ECX. The value in ecx is used as target of
3550 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3551 // for tail calls on PIC/GOT architectures. Normally we would just put the
3552 // address of GOT into ebx and then call target@PLT. But for tail calls
3553 // ebx would be restored (since ebx is callee saved) before jumping to the
3554 // target@PLT.
3555
3556 // Note: The actual moving to ECX is done further down.
3557 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3558 if (G && !G->getGlobal()->hasLocalLinkage() &&
3559 G->getGlobal()->hasDefaultVisibility())
3560 Callee = LowerGlobalAddress(Callee, DAG);
3561 else if (isa<ExternalSymbolSDNode>(Callee))
3562 Callee = LowerExternalSymbol(Callee, DAG);
3563 }
3564 }
3565
3566 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3567 // From AMD64 ABI document:
3568 // For calls that may call functions that use varargs or stdargs
3569 // (prototype-less calls or calls to functions containing ellipsis (...) in
3570 // the declaration) %al is used as hidden argument to specify the number
3571 // of SSE registers used. The contents of %al do not need to match exactly
3572 // the number of registers, but must be an upper bound on the number of SSE
3573 // registers used and is in the range 0 - 8 inclusive.
3574
3575 // Count the number of XMM registers allocated.
3576 static const MCPhysReg XMMArgRegs[] = {
3577 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3578 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3579 };
3580 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3581 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3582 && "SSE registers cannot be used when SSE is disabled");
3583
3584 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3585 DAG.getConstant(NumXMMRegs, dl,
3586 MVT::i8)));
3587 }
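// Illustrative example of the %al convention above (a sketch; the exact asm
// depends on the target and options): for a SysV x86-64 variadic call such as
// printf("%f\n", x), one XMM register carries the double argument, so the
// lowering materializes roughly
//   movl $1, %eax        ; upper bound on the number of XMM registers used
//   callq printf
// A value of 0 in %al lets the callee's va_start prologue skip spilling the
// XMM argument registers entirely.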
3588
3589 if (isVarArg && IsMustTail) {
3590 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3591 for (const auto &F : Forwards) {
3592 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3593 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3594 }
3595 }
3596
3597 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3598 // don't need this because the eligibility check rejects calls that require
3599 // shuffling arguments passed in memory.
3600 if (!IsSibcall && isTailCall) {
3601 // Force all the incoming stack arguments to be loaded from the stack
3602 // before any new outgoing arguments are stored to the stack, because the
3603 // outgoing stack slots may alias the incoming argument stack slots, and
3604 // the alias isn't otherwise explicit. This is slightly more conservative
3605 // than necessary, because it means that each store effectively depends
3606 // on every argument instead of just those arguments it would clobber.
3607 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3608
3609 SmallVector<SDValue, 8> MemOpChains2;
3610 SDValue FIN;
3611 int FI = 0;
3612 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3613 ++I, ++OutsIndex) {
3614 CCValAssign &VA = ArgLocs[I];
3615
3616 if (VA.isRegLoc()) {
3617 if (VA.needsCustom()) {
3618 assert((CallConv == CallingConv::X86_RegCall) &&
3619 "Expecting custom case only in regcall calling convention");
3620 // This means that we are in the special case where one argument was
3621 // passed through two register locations - skip the next location.
3622 ++I;
3623 }
3624
3625 continue;
3626 }
3627
3628 assert(VA.isMemLoc());
3629 SDValue Arg = OutVals[OutsIndex];
3630 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3631 // Skip inalloca arguments. They don't require any work.
3632 if (Flags.isInAlloca())
3633 continue;
3634 // Create frame index.
3635 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3636 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3637 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3638 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3639
3640 if (Flags.isByVal()) {
3641 // Copy relative to framepointer.
3642 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3643 if (!StackPtr.getNode())
3644 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3645 getPointerTy(DAG.getDataLayout()));
3646 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3647 StackPtr, Source);
3648
3649 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3650 ArgChain,
3651 Flags, DAG, dl));
3652 } else {
3653 // Store relative to framepointer.
3654 MemOpChains2.push_back(DAG.getStore(
3655 ArgChain, dl, Arg, FIN,
3656 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3657 }
3658 }
3659
3660 if (!MemOpChains2.empty())
3661 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3662
3663 // Store the return address to the appropriate stack slot.
3664 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3665 getPointerTy(DAG.getDataLayout()),
3666 RegInfo->getSlotSize(), FPDiff, dl);
3667 }
3668
3669 // Build a sequence of copy-to-reg nodes chained together with token chain
3670 // and flag operands which copy the outgoing args into registers.
3671 SDValue InFlag;
3672 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3673 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3674 RegsToPass[i].second, InFlag);
3675 InFlag = Chain.getValue(1);
3676 }
3677
3678 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3679 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3680 // In the 64-bit large code model, we have to make all calls
3681 // through a register, since the call instruction's 32-bit
3682 // pc-relative offset may not be large enough to hold the whole
3683 // address.
3684 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3685 // If the callee is a GlobalAddress node (quite common, every direct call
3686 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3687 // it.
3688 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3689
3690 // We should use extra load for direct calls to dllimported functions in
3691 // non-JIT mode.
3692 const GlobalValue *GV = G->getGlobal();
3693 if (!GV->hasDLLImportStorageClass()) {
3694 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3695
3696 Callee = DAG.getTargetGlobalAddress(
3697 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3698
3699 if (OpFlags == X86II::MO_GOTPCREL) {
3700 // Add a wrapper.
3701 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3702 getPointerTy(DAG.getDataLayout()), Callee);
3703 // Add extra indirection
3704 Callee = DAG.getLoad(
3705 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3706 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3707 }
3708 }
3709 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3710 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3711 unsigned char OpFlags =
3712 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3713
3714 Callee = DAG.getTargetExternalSymbol(
3715 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3716 } else if (Subtarget.isTarget64BitILP32() &&
3717 Callee->getValueType(0) == MVT::i32) {
3718 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3719 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3720 }
3721
3722 // Returns a chain & a flag for retval copy to use.
3723 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3724 SmallVector<SDValue, 8> Ops;
3725
3726 if (!IsSibcall && isTailCall) {
3727 Chain = DAG.getCALLSEQ_END(Chain,
3728 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3729 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3730 InFlag = Chain.getValue(1);
3731 }
3732
3733 Ops.push_back(Chain);
3734 Ops.push_back(Callee);
3735
3736 if (isTailCall)
3737 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3738
3739 // Add argument registers to the end of the list so that they are known live
3740 // into the call.
3741 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3742 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3743 RegsToPass[i].second.getValueType()));
3744
3745 // Add a register mask operand representing the call-preserved registers.
3746 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
3747 // set X86_INTR calling convention because it has the same CSR mask
3748 // (same preserved registers).
3749 const uint32_t *Mask = RegInfo->getCallPreservedMask(
3750 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3751 assert(Mask && "Missing call preserved mask for calling convention");
3752
3753 // If this is an invoke in a 32-bit function using a funclet-based
3754 // personality, assume the function clobbers all registers. If an exception
3755 // is thrown, the runtime will not restore CSRs.
3756 // FIXME: Model this more precisely so that we can register allocate across
3757 // the normal edge and spill and fill across the exceptional edge.
3758 if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
3759 const Function *CallerFn = MF.getFunction();
3760 EHPersonality Pers =
3761 CallerFn->hasPersonalityFn()
3762 ? classifyEHPersonality(CallerFn->getPersonalityFn())
3763 : EHPersonality::Unknown;
3764 if (isFuncletEHPersonality(Pers))
3765 Mask = RegInfo->getNoPreservedMask();
3766 }
3767
3768 // Define a new register mask from the existing mask.
3769 uint32_t *RegMask = nullptr;
3770
3771 // In some calling conventions we need to remove the used physical registers
3772 // from the reg mask.
3773 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3774 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3775
3776 // Allocate a new Reg Mask and copy Mask.
3777 RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
3778 unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3779 memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
3780
3781 // Make sure all sub registers of the argument registers are reset
3782 // in the RegMask.
3783 for (auto const &RegPair : RegsToPass)
3784 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3785 SubRegs.isValid(); ++SubRegs)
3786 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3787
3788 // Create the RegMask Operand according to our updated mask.
3789 Ops.push_back(DAG.getRegisterMask(RegMask));
3790 } else {
3791 // Create the RegMask Operand according to the static mask.
3792 Ops.push_back(DAG.getRegisterMask(Mask));
3793 }
3794
3795 if (InFlag.getNode())
3796 Ops.push_back(InFlag);
3797
3798 if (isTailCall) {
3799 // We used to do:
3800 //// If this is the first return lowered for this function, add the regs
3801 //// to the liveout set for the function.
3802 // This isn't right, although it's probably harmless on x86; liveouts
3803 // should be computed from returns not tail calls. Consider a void
3804 // function making a tail call to a function returning int.
3805 MF.getFrameInfo().setHasTailCall();
3806 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3807 }
3808
3809 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3810 InFlag = Chain.getValue(1);
3811
3812 // Create the CALLSEQ_END node.
3813 unsigned NumBytesForCalleeToPop;
3814 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3815 DAG.getTarget().Options.GuaranteedTailCallOpt))
3816 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3817 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3818 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3819 SR == StackStructReturn)
3820 // If this is a call to a struct-return function, the callee
3821 // pops the hidden struct pointer, so we have to push it back.
3822 // This is common for Darwin/X86, Linux & Mingw32 targets.
3823 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3824 NumBytesForCalleeToPop = 4;
3825 else
3826 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3827
3828 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3829 // No need to reset the stack after the call if the call doesn't return. To
3830 // keep the MachineInstr verifier happy, we'll pretend the callee does it for us.
3831 NumBytesForCalleeToPop = NumBytes;
3832 }
3833
3834 // Returns a flag for retval copy to use.
3835 if (!IsSibcall) {
3836 Chain = DAG.getCALLSEQ_END(Chain,
3837 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3838 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3839 true),
3840 InFlag, dl);
3841 InFlag = Chain.getValue(1);
3842 }
3843
3844 // Handle result values, copying them out of physregs into vregs that we
3845 // return.
3846 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3847 InVals, RegMask);
3848}
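// Illustrative example of the callee-pop accounting above (a sketch, assuming
// a 32-bit Linux target with the default C calling convention): for
//   struct S { int a[4]; };
//   struct S f(void);
// the sret callee pops the hidden 4-byte struct-return pointer itself, so the
// caller compensates with NumBytesForCalleeToPop = 4; a plain cdecl callee
// pops nothing and the caller's CALLSEQ_END restores the full argument area.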
3849
3850//===----------------------------------------------------------------------===//
3851// Fast Calling Convention (tail call) implementation
3852//===----------------------------------------------------------------------===//
3853
3854 // Like stdcall, the callee cleans up the arguments, except that ECX is
3855 // reserved for storing the tail-called function's address. Only 2 registers are
3856// free for argument passing (inreg). Tail call optimization is performed
3857// provided:
3858// * tailcallopt is enabled
3859// * caller/callee are fastcc
3860// On X86_64 architecture with GOT-style position independent code only local
3861// (within module) calls are supported at the moment.
3862 // To keep the stack aligned according to the platform ABI, the function
3863 // GetAlignedArgumentStackSize ensures that the argument delta is always a multiple
3864 // of the stack alignment. (Dynamic linkers need this - darwin's dyld for example.)
3865// If a tail called function callee has more arguments than the caller the
3866// caller needs to make sure that there is room to move the RETADDR to. This is
3867// achieved by reserving an area the size of the argument delta right after the
3868// original RETADDR, but before the saved framepointer or the spilled registers
3869// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3870// stack layout:
3871// arg1
3872// arg2
3873// RETADDR
3874// [ new RETADDR
3875// move area ]
3876// (possible EBP)
3877// ESI
3878// EDI
3879// local1 ..
3880
3881 /// Align the stack size, e.g. to 16n + 12, to satisfy a 16-byte alignment
3882 /// requirement.
3883unsigned
3884X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3885 SelectionDAG& DAG) const {
3886 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3887 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3888 unsigned StackAlignment = TFI.getStackAlignment();
3889 uint64_t AlignMask = StackAlignment - 1;
3890 int64_t Offset = StackSize;
3891 unsigned SlotSize = RegInfo->getSlotSize();
3892 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3893 // Number smaller than 12 so just add the difference.
3894 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3895 } else {
3896 // Mask out lower bits, add stackalignment once plus the 12 bytes.
3897 Offset = ((~AlignMask) & Offset) + StackAlignment +
3898 (StackAlignment-SlotSize);
3899 }
3900 return Offset;
3901}
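// Worked example of the computation above (a sketch, assuming the usual 32-bit
// values StackAlignment = 16 and SlotSize = 4, i.e. a threshold of 12):
//   StackSize = 20: 20 & 15 = 4  <= 12, so Offset = 20 + (12 - 4)        = 28 = 16*1 + 12
//   StackSize = 30: 30 & 15 = 14 >  12, so Offset = (30 & ~15) + 16 + 12 = 44 = 16*2 + 12
// Either way the result is of the form 16n + 12, leaving room for the pushed
// return address so the stack is 16-byte aligned at the call.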
3902
3903/// Return true if the given stack call argument is already available in the
3904/// same position (relatively) of the caller's incoming argument stack.
3905static
3906bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3907 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3908 const X86InstrInfo *TII, const CCValAssign &VA) {
3909 unsigned Bytes = Arg.getValueSizeInBits() / 8;
3910
3911 for (;;) {
3912 // Look through nodes that don't alter the bits of the incoming value.
3913 unsigned Op = Arg.getOpcode();
3914 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3915 Arg = Arg.getOperand(0);
3916 continue;
3917 }
3918 if (Op == ISD::TRUNCATE) {
3919 const SDValue &TruncInput = Arg.getOperand(0);
3920 if (TruncInput.getOpcode() == ISD::AssertZext &&
3921 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3922 Arg.getValueType()) {
3923 Arg = TruncInput.getOperand(0);
3924 continue;
3925 }
3926 }
3927 break;
3928 }
3929
3930 int FI = INT_MAX;
3931 if (Arg.getOpcode() == ISD::CopyFromReg) {
3932 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3933 if (!TargetRegisterInfo::isVirtualRegister(VR))
3934 return false;
3935 MachineInstr *Def = MRI->getVRegDef(VR);
3936 if (!Def)
3937 return false;
3938 if (!Flags.isByVal()) {
3939 if (!TII->isLoadFromStackSlot(*Def, FI))
3940 return false;
3941 } else {
3942 unsigned Opcode = Def->getOpcode();
3943 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3944 Opcode == X86::LEA64_32r) &&
3945 Def->getOperand(1).isFI()) {
3946 FI = Def->getOperand(1).getIndex();
3947 Bytes = Flags.getByValSize();
3948 } else
3949 return false;
3950 }
3951 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3952 if (Flags.isByVal())
3953 // ByVal argument is passed in as a pointer but it's now being
3954 // dereferenced. e.g.
3955 // define @foo(%struct.X* %A) {
3956 // tail call @bar(%struct.X* byval %A)
3957 // }
3958 return false;
3959 SDValue Ptr = Ld->getBasePtr();
3960 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3961 if (!FINode)
3962 return false;
3963 FI = FINode->getIndex();
3964 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3965 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3966 FI = FINode->getIndex();
3967 Bytes = Flags.getByValSize();
3968 } else
3969 return false;
3970
3971 assert(FI != INT_MAX);
3972 if (!MFI.isFixedObjectIndex(FI))
3973 return false;
3974
3975 if (Offset != MFI.getObjectOffset(FI))
3976 return false;
3977
3978 // If this is not byval, check that the argument stack object is immutable.
3979 // inalloca and argument copy elision can create mutable argument stack
3980 // objects. Byval objects can be mutated, but a byval call intends to pass the
3981 // mutated memory.
3982 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
3983 return false;
3984
3985 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
3986 // If the argument location is wider than the argument type, check that any
3987 // extension flags match.
3988 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
3989 Flags.isSExt() != MFI.isObjectSExt(FI)) {
3990 return false;
3991 }
3992 }
3993
3994 return Bytes == MFI.getObjectSize(FI);
3995}
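// Illustrative example (a sketch): for
//   int caller(int a, int b) { return callee(a, b); }
// on 32-bit x86, callee's outgoing stack arguments land at exactly the offsets
// where caller received a and b, so MatchingStackOffset returns true for both
// and the tail call can reuse the incoming slots without any stores.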
3996
3997/// Check whether the call is eligible for tail call optimization. Targets
3998/// that want to do tail call optimization should implement this function.
3999bool X86TargetLowering::IsEligibleForTailCallOptimization(
4000 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4001 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4002 const SmallVectorImpl<ISD::OutputArg> &Outs,
4003 const SmallVectorImpl<SDValue> &OutVals,
4004 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4005 if (!mayTailCallThisCC(CalleeCC))
4006 return false;
4007
4008 // If -tailcallopt is specified, make fastcc functions tail-callable.
4009 MachineFunction &MF = DAG.getMachineFunction();
4010 const Function *CallerF = MF.getFunction();
4011
4012 // If the function return type is x86_fp80 and the callee return type is not,
4013 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4014 // perform a tailcall optimization here.
4015 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4016 return false;
4017
4018 CallingConv::ID CallerCC = CallerF->getCallingConv();
4019 bool CCMatch = CallerCC == CalleeCC;
4020 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4021 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4022
4023 // Win64 functions have extra shadow space for argument homing. Don't do the
4024 // sibcall if the caller and callee have mismatched expectations for this
4025 // space.
4026 if (IsCalleeWin64 != IsCallerWin64)
4027 return false;
4028
4029 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4030 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4031 return true;
4032 return false;
4033 }
4034
4035 // Look for obvious safe cases to perform tail call optimization that do not
4036 // require ABI changes. This is what gcc calls sibcall.
4037
4038 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4039 // emit a special epilogue.
4040 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4041 if (RegInfo->needsStackRealignment(MF))
4042 return false;
4043
4044 // Also avoid sibcall optimization if either caller or callee uses struct
4045 // return semantics.
4046 if (isCalleeStructRet || isCallerStructRet)
4047 return false;
4048
4049 // Do not sibcall optimize vararg calls unless all arguments are passed via
4050 // registers.
4051 LLVMContext &C = *DAG.getContext();
4052 if (isVarArg && !Outs.empty()) {
4053 // Optimizing for varargs on Win64 is unlikely to be safe without
4054 // additional testing.
4055 if (IsCalleeWin64 || IsCallerWin64)
4056 return false;
4057
4058 SmallVector<CCValAssign, 16> ArgLocs;
4059 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4060
4061 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4062 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4063 if (!ArgLocs[i].isRegLoc())
4064 return false;
4065 }
4066
4067 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4068 // stack. Therefore, if it's not used by the call it is not safe to optimize
4069 // this into a sibcall.
4070 bool Unused = false;
4071 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4072 if (!Ins[i].Used) {
4073 Unused = true;
4074 break;
4075 }
4076 }
4077 if (Unused) {
4078 SmallVector<CCValAssign, 16> RVLocs;
4079 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4080 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4081 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4082 CCValAssign &VA = RVLocs[i];
4083 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4084 return false;
4085 }
4086 }
4087
4088 // Check that the call results are passed in the same way.
4089 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4090 RetCC_X86, RetCC_X86))
4091 return false;
4092 // The callee has to preserve all registers the caller needs to preserve.
4093 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4094 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4095 if (!CCMatch) {
4096 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4097 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4098 return false;
4099 }
4100
4101 unsigned StackArgsSize = 0;
4102
4103 // If the callee takes no arguments then go on to check the results of the
4104 // call.
4105 if (!Outs.empty()) {
4106 // Check if stack adjustment is needed. For now, do not do this if any
4107 // argument is passed on the stack.
4108 SmallVector<CCValAssign, 16> ArgLocs;
4109 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4110
4111 // Allocate shadow area for Win64
4112 if (IsCalleeWin64)
4113 CCInfo.AllocateStack(32, 8);
4114
4115 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4116 StackArgsSize = CCInfo.getNextStackOffset();
4117
4118 if (CCInfo.getNextStackOffset()) {
4119 // Check if the arguments are already laid out in the right way as
4120 // the caller's fixed stack objects.
4121 MachineFrameInfo &MFI = MF.getFrameInfo();
4122 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4123 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4124 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4125 CCValAssign &VA = ArgLocs[i];
4126 SDValue Arg = OutVals[i];
4127 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4128 if (VA.getLocInfo() == CCValAssign::Indirect)
4129 return false;
4130 if (!VA.isRegLoc()) {
4131 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4132 MFI, MRI, TII, VA))
4133 return false;
4134 }
4135 }
4136 }
4137
4138 bool PositionIndependent = isPositionIndependent();
4139 // If the tailcall address may be in a register, then make sure it's
4140 // possible to register allocate for it. In 32-bit, the call address can
4141 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4142 // callee-saved registers are restored. These happen to be the same
4143 // registers used to pass 'inreg' arguments so watch out for those.
4144 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4145 !isa<ExternalSymbolSDNode>(Callee)) ||
4146 PositionIndependent)) {
4147 unsigned NumInRegs = 0;
4148 // In PIC we need an extra register to formulate the address computation
4149 // for the callee.
4150 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4151
4152 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4153 CCValAssign &VA = ArgLocs[i];
4154 if (!VA.isRegLoc())
4155 continue;
4156 unsigned Reg = VA.getLocReg();
4157 switch (Reg) {
4158 default: break;
4159 case X86::EAX: case X86::EDX: case X86::ECX:
4160 if (++NumInRegs == MaxInRegs)
4161 return false;
4162 break;
4163 }
4164 }
4165 }
4166
4167 const MachineRegisterInfo &MRI = MF.getRegInfo();
4168 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4169 return false;
4170 }
4171
4172 bool CalleeWillPop =
4173 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4174 MF.getTarget().Options.GuaranteedTailCallOpt);
4175
4176 if (unsigned BytesToPop =
4177 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4178 // If we have bytes to pop, the callee must pop them.
4179 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4180 if (!CalleePopMatches)
4181 return false;
4182 } else if (CalleeWillPop && StackArgsSize > 0) {
4183 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4184 return false;
4185 }
4186
4187 return true;
4188}
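// Illustrative examples of the checks above (a sketch):
//  * a varargs sibcall is rejected as soon as any operand is assigned a stack
//    slot, e.g. passing 7 integer arguments to a (...) callee on x86-64;
//  * a 32-bit PIC indirect sibcall is rejected once 2 of EAX/ECX/EDX are taken
//    by inreg arguments, since no register would remain for the call target.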
4189
4190FastISel *
4191X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4192 const TargetLibraryInfo *libInfo) const {
4193 return X86::createFastISel(funcInfo, libInfo);
4194}
4195
4196//===----------------------------------------------------------------------===//
4197// Other Lowering Hooks
4198//===----------------------------------------------------------------------===//
4199
4200static bool MayFoldLoad(SDValue Op) {
4201 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4202}
4203
4204static bool MayFoldIntoStore(SDValue Op) {
4205 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4206}
4207
4208static bool MayFoldIntoZeroExtend(SDValue Op) {
4209 if (Op.hasOneUse()) {
4210 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4211 return (ISD::ZERO_EXTEND == Opcode);
4212 }
4213 return false;
4214}
4215
4216static bool isTargetShuffle(unsigned Opcode) {
4217 switch(Opcode) {
4218 default: return false;
4219 case X86ISD::BLENDI:
4220 case X86ISD::PSHUFB:
4221 case X86ISD::PSHUFD:
4222 case X86ISD::PSHUFHW:
4223 case X86ISD::PSHUFLW:
4224 case X86ISD::SHUFP:
4225 case X86ISD::INSERTPS:
4226 case X86ISD::EXTRQI:
4227 case X86ISD::INSERTQI:
4228 case X86ISD::PALIGNR:
4229 case X86ISD::VSHLDQ:
4230 case X86ISD::VSRLDQ:
4231 case X86ISD::MOVLHPS:
4232 case X86ISD::MOVHLPS:
4233 case X86ISD::MOVLPS:
4234 case X86ISD::MOVLPD:
4235 case X86ISD::MOVSHDUP:
4236 case X86ISD::MOVSLDUP:
4237 case X86ISD::MOVDDUP:
4238 case X86ISD::MOVSS:
4239 case X86ISD::MOVSD:
4240 case X86ISD::UNPCKL:
4241 case X86ISD::UNPCKH:
4242 case X86ISD::VBROADCAST:
4243 case X86ISD::VPERMILPI:
4244 case X86ISD::VPERMILPV:
4245 case X86ISD::VPERM2X128:
4246 case X86ISD::VPERMIL2:
4247 case X86ISD::VPERMI:
4248 case X86ISD::VPPERM:
4249 case X86ISD::VPERMV:
4250 case X86ISD::VPERMV3:
4251 case X86ISD::VPERMIV3:
4252 case X86ISD::VZEXT_MOVL:
4253 return true;
4254 }
4255}
4256
4257static bool isTargetShuffleVariableMask(unsigned Opcode) {
4258 switch (Opcode) {
4259 default: return false;
4260 // Target Shuffles.
4261 case X86ISD::PSHUFB:
4262 case X86ISD::VPERMILPV:
4263 case X86ISD::VPERMIL2:
4264 case X86ISD::VPPERM:
4265 case X86ISD::VPERMV:
4266 case X86ISD::VPERMV3:
4267 case X86ISD::VPERMIV3:
4268 return true;
4269 // 'Faux' Target Shuffles.
4270 case ISD::AND:
4271 case X86ISD::ANDNP:
4272 return true;
4273 }
4274}
4275
4276SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4277 MachineFunction &MF = DAG.getMachineFunction();
4278 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4279 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4280 int ReturnAddrIndex = FuncInfo->getRAIndex();
4281
4282 if (ReturnAddrIndex == 0) {
4283 // Set up a frame object for the return address.
4284 unsigned SlotSize = RegInfo->getSlotSize();
4285 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4286 -(int64_t)SlotSize,
4287 false);
4288 FuncInfo->setRAIndex(ReturnAddrIndex);
4289 }
4290
4291 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4292}
4293
4294bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4295 bool hasSymbolicDisplacement) {
4296 // Offset should fit into 32 bit immediate field.
4297 if (!isInt<32>(Offset))
4298 return false;
4299
4300 // If we don't have a symbolic displacement - we don't have any extra
4301 // restrictions.
4302 if (!hasSymbolicDisplacement)
4303 return true;
4304
4305 // FIXME: Some tweaks might be needed for medium code model.
4306 if (M != CodeModel::Small && M != CodeModel::Kernel)
4307 return false;
4308
4309 // For the small code model we assume that the latest object ends at least 16MB
4310 // before the 31-bit boundary. We may also accept pretty large negative constants
4311 // knowing that all objects are in the positive half of the address space.
4312 if (M == CodeModel::Small && Offset < 16*1024*1024)
4313 return true;
4314
4315 // For the kernel code model we know that all objects reside in the negative half
4316 // of the 32-bit address space. We may not accept negative offsets, since they may
4317 // be just off, and we may accept pretty large positive ones.
4318 if (M == CodeModel::Kernel && Offset >= 0)
4319 return true;
4320
4321 return false;
4322}
4323
4324/// Determines whether the callee is required to pop its own arguments.
4325/// Callee pop is necessary to support tail calls.
4326bool X86::isCalleePop(CallingConv::ID CallingConv,
4327 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4328 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4329 // can guarantee TCO.
4330 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4331 return true;
4332
4333 switch (CallingConv) {
4334 default:
4335 return false;
4336 case CallingConv::X86_StdCall:
4337 case CallingConv::X86_FastCall:
4338 case CallingConv::X86_ThisCall:
4339 case CallingConv::X86_VectorCall:
4340 return !is64Bit;
4341 }
4342}
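// Illustrative example (a sketch): a 32-bit X86_StdCall callee such as
//   int __stdcall f(int a, int b);
// returns with "ret 8" and pops its own 8 bytes of arguments, so isCalleePop
// returns true; the default C convention returns false and leaves the cleanup
// to the caller (unless GuaranteeTCO forces callee-pop for TCO-capable CCs).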
4343
4344/// \brief Return true if the condition is an unsigned comparison operation.
4345static bool isX86CCUnsigned(unsigned X86CC) {
4346 switch (X86CC) {
4347 default:
4348 llvm_unreachable("Invalid integer condition!");
4349 case X86::COND_E:
4350 case X86::COND_NE:
4351 case X86::COND_B:
4352 case X86::COND_A:
4353 case X86::COND_BE:
4354 case X86::COND_AE:
4355 return true;
4356 case X86::COND_G:
4357 case X86::COND_GE:
4358 case X86::COND_L:
4359 case X86::COND_LE:
4360 return false;
4361 }
4362}
4363
4364static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4365 switch (SetCCOpcode) {
4366 default: llvm_unreachable("Invalid integer condition!");
4367 case ISD::SETEQ: return X86::COND_E;
4368 case ISD::SETGT: return X86::COND_G;
4369 case ISD::SETGE: return X86::COND_GE;
4370 case ISD::SETLT: return X86::COND_L;
4371 case ISD::SETLE: return X86::COND_LE;
4372 case ISD::SETNE: return X86::COND_NE;
4373 case ISD::SETULT: return X86::COND_B;
4374 case ISD::SETUGT: return X86::COND_A;
4375 case ISD::SETULE: return X86::COND_BE;
4376 case ISD::SETUGE: return X86::COND_AE;
4377 }
4378}
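// Illustrative example (a sketch): (setcc ult a, b) maps to X86::COND_B, so a
// later branch on the flags becomes JB, while the signed (setcc lt a, b) maps
// to X86::COND_L and becomes JL.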
4379
4380/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4381/// condition code, returning the condition code and the LHS/RHS of the
4382/// comparison to make.
4383static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4384 bool isFP, SDValue &LHS, SDValue &RHS,
4385 SelectionDAG &DAG) {
4386 if (!isFP) {
4387 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4388 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4389 // X > -1 -> X == 0, jump !sign.
4390 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4391 return X86::COND_NS;
4392 }
4393 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4394 // X < 0 -> X == 0, jump on sign.
4395 return X86::COND_S;
4396 }
4397 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4398 // X < 1 -> X <= 0
4399 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4400 return X86::COND_LE;
4401 }
4402 }
4403
4404 return TranslateIntegerX86CC(SetCCOpcode);
4405 }
4406
4407 // First determine if it is required or is profitable to flip the operands.
4408
4409 // If LHS is a foldable load, but RHS is not, flip the condition.
4410 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4411 !ISD::isNON_EXTLoad(RHS.getNode())) {
4412 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4413 std::swap(LHS, RHS);
4414 }
4415
4416 switch (SetCCOpcode) {
4417 default: break;
4418 case ISD::SETOLT:
4419 case ISD::SETOLE:
4420 case ISD::SETUGT:
4421 case ISD::SETUGE:
4422 std::swap(LHS, RHS);
4423 break;
4424 }
4425
4426 // On a floating point condition, the flags are set as follows:
4427 // ZF PF CF op
4428 // 0 | 0 | 0 | X > Y
4429 // 0 | 0 | 1 | X < Y
4430 // 1 | 0 | 0 | X == Y
4431 // 1 | 1 | 1 | unordered
4432 switch (SetCCOpcode) {
4433 default: llvm_unreachable("Condcode should be pre-legalized away");
4434 case ISD::SETUEQ:
4435 case ISD::SETEQ: return X86::COND_E;
4436 case ISD::SETOLT: // flipped
4437 case ISD::SETOGT:
4438 case ISD::SETGT: return X86::COND_A;
4439 case ISD::SETOLE: // flipped
4440 case ISD::SETOGE:
4441 case ISD::SETGE: return X86::COND_AE;
4442 case ISD::SETUGT: // flipped
4443 case ISD::SETULT:
4444 case ISD::SETLT: return X86::COND_B;
4445 case ISD::SETUGE: // flipped
4446 case ISD::SETULE:
4447 case ISD::SETLE: return X86::COND_BE;
4448 case ISD::SETONE:
4449 case ISD::SETNE: return X86::COND_NE;
4450 case ISD::SETUO: return X86::COND_P;
4451 case ISD::SETO: return X86::COND_NP;
4452 case ISD::SETOEQ:
4453 case ISD::SETUNE: return X86::COND_INVALID;
4454 }
4455}
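// Illustrative example of the FP path above (a sketch): for (setolt x, y) the
// operands are swapped, so the comparison becomes CMP(y, x) and the returned
// code is X86::COND_A (ZF = 0 and CF = 0), which holds only when the operands
// are ordered and x < y; the unordered case sets all three flags and fails the
// "above" test, matching ordered-less-than semantics.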
4456
4457/// Is there a floating point cmov for the specific X86 condition code?
4458/// Current x86 isa includes the following FP cmov instructions:
4459 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4460static bool hasFPCMov(unsigned X86CC) {
4461 switch (X86CC) {
4462 default:
4463 return false;
4464 case X86::COND_B:
4465 case X86::COND_BE:
4466 case X86::COND_E:
4467 case X86::COND_P:
4468 case X86::COND_A:
4469 case X86::COND_AE:
4470 case X86::COND_NE:
4471 case X86::COND_NP:
4472 return true;
4473 }
4474}
4475
4476
4477bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4478 const CallInst &I,
4479 unsigned Intrinsic) const {
4480
4481 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4482 if (!IntrData)
4483 return false;
4484
4485 Info.opc = ISD::INTRINSIC_W_CHAIN;
4486 Info.readMem = false;
4487 Info.writeMem = false;
4488 Info.vol = false;
4489 Info.offset = 0;
4490
4491 switch (IntrData->Type) {
4492 case EXPAND_FROM_MEM: {
4493 Info.ptrVal = I.getArgOperand(0);
4494 Info.memVT = MVT::getVT(I.getType());
4495 Info.align = 1;
4496 Info.readMem = true;
4497 break;
4498 }
4499 case COMPRESS_TO_MEM: {
4500 Info.ptrVal = I.getArgOperand(0);
4501 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4502 Info.align = 1;
4503 Info.writeMem = true;
4504 break;
4505 }
4506 case TRUNCATE_TO_MEM_VI8:
4507 case TRUNCATE_TO_MEM_VI16:
4508 case TRUNCATE_TO_MEM_VI32: {
4509 Info.ptrVal = I.getArgOperand(0);
4510 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4511 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4512 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4513 ScalarVT = MVT::i8;
4514 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4515 ScalarVT = MVT::i16;
4516 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4517 ScalarVT = MVT::i32;
4518
4519 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4520 Info.align = 1;
4521 Info.writeMem = true;
4522 break;
4523 }
4524 default:
4525 return false;
4526 }
4527
4528 return true;
4529}
4530
4531/// Returns true if the target can instruction select the
4532/// specified FP immediate natively. If false, the legalizer will
4533/// materialize the FP immediate as a load from a constant pool.
4534bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4535 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4536 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4537 return true;
4538 }
4539 return false;
4540}
4541
4542bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4543 ISD::LoadExtType ExtTy,
4544 EVT NewVT) const {
4545 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4546 // relocations target a movq or addq instruction: don't let the load shrink.
4547 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4548 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4549 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4550 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4551 return true;
4552}
4553
4554/// \brief Returns true if it is beneficial to convert a load of a constant
4555/// to just the constant itself.
4556bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4557 Type *Ty) const {
4558 assert(Ty->isIntegerTy());
4559
4560 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4561 if (BitSize == 0 || BitSize > 64)
4562 return false;
4563 return true;
4564}
4565
4566bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
4567 // TODO: It might be a win to ease or lift this restriction, but the generic
4568 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
4569 if (VT.isVector() && Subtarget.hasAVX512())
4570 return false;
4571
4572 return true;
4573}
4574
4575bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
4576 unsigned Index) const {
4577 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4578 return false;
4579
4580 // Mask vectors support all subregister combinations and operations that
4581 // extract half of a vector.
4582 if (ResVT.getVectorElementType() == MVT::i1)
4583 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
4584 (Index == ResVT.getVectorNumElements()));
4585
4586 return (Index % ResVT.getVectorNumElements()) == 0;
4587}
4588
4589bool X86TargetLowering::isCheapToSpeculateCttz() const {
4590 // Speculate cttz only if we can directly use TZCNT.
4591 return Subtarget.hasBMI();
4592}
4593
4594bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4595 // Speculate ctlz only if we can directly use LZCNT.
4596 return Subtarget.hasLZCNT();
4597}
4598
4599bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
4600 const SelectionDAG &DAG) const {
4601 // Do not merge to float value size (128 bits) if no implicit
4602 // float attribute is set.
4603 bool NoFloat = DAG.getMachineFunction().getFunction()->hasFnAttribute(
4604 Attribute::NoImplicitFloat);
4605
4606 if (NoFloat) {
4607 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
4608 return (MemVT.getSizeInBits() <= MaxIntSize);
4609 }
4610 return true;
4611}
4612
4613bool X86TargetLowering::isCtlzFast() const {
4614 return Subtarget.hasFastLZCNT();
4615}
4616
4617bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4618 const Instruction &AndI) const {
4619 return true;
4620}
4621
4622bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4623 if (!Subtarget.hasBMI())
4624 return false;
4625
4626 // There are only 32-bit and 64-bit forms for 'andn'.
4627 EVT VT = Y.getValueType();
4628 if (VT != MVT::i32 && VT != MVT::i64)
4629 return false;
4630
4631 return true;
4632}
4633
4634MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4635 MVT VT = MVT::getIntegerVT(NumBits);
4636 if (isTypeLegal(VT))
4637 return VT;
4638
4639 // PMOVMSKB can handle this.
4640 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4641 return MVT::v16i8;
4642
4643 // VPMOVMSKB can handle this.
4644 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4645 return MVT::v32i8;
4646
4647 // TODO: Allow 64-bit type for 32-bit target.
4648 // TODO: 512-bit types should be allowed, but make sure that those
4649 // cases are handled in combineVectorSizedSetCCEquality().
4650
4651 return MVT::INVALID_SIMPLE_VALUE_TYPE;
4652}
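// Illustrative example (a sketch): a 16-byte memcmp()==0 style equality can be
// lowered through the v16i8 type returned above as roughly
//   movdqu (lhs), %xmm0
//   pcmpeqb (rhs), %xmm0
//   pmovmskb %xmm0, %eax
//   cmpl $0xFFFF, %eax
// i.e. one vector compare plus a scalar mask test instead of two 8-byte loads
// and compares.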
4653
4654/// Val is the undef sentinel value or equal to the specified value.
4655static bool isUndefOrEqual(int Val, int CmpVal) {
4656 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4657}
4658
4659/// Val is either the undef or zero sentinel value.
4660static bool isUndefOrZero(int Val) {
4661 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4662}
4663
4664/// Return true if every element in Mask, beginning
4665/// from position Pos and ending in Pos+Size is the undef sentinel value.
4666static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4667 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4668 if (Mask[i] != SM_SentinelUndef)
4669 return false;
4670 return true;
4671}
4672
4673/// Return true if Val is undef or if its value falls within the
4674 /// specified range [Low, Hi).
4675static bool isUndefOrInRange(int Val, int Low, int Hi) {
4676 return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
4677}
4678
4679/// Return true if every element in Mask is undef or if its value
4680 /// falls within the specified range [Low, Hi).
4681static bool isUndefOrInRange(ArrayRef<int> Mask,
4682 int Low, int Hi) {
4683 for (int M : Mask)
4684 if (!isUndefOrInRange(M, Low, Hi))
4685 return false;
4686 return true;
4687}
4688
4689/// Return true if Val is undef, zero or if its value falls within the
4690 /// specified range [Low, Hi).
4691static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4692 return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
4693}
4694
4695/// Return true if every element in Mask is undef, zero or if its value
4696 /// falls within the specified range [Low, Hi).
4697static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4698 for (int M : Mask)
4699 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4700 return false;
4701 return true;
4702}
4703
4704/// Return true if every element in Mask, beginning
4705/// from position Pos and ending in Pos+Size, falls within the specified
4706 /// sequential range [Low, Low+Size), or is undef.
4707static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4708 unsigned Pos, unsigned Size, int Low) {
4709 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4710 if (!isUndefOrEqual(Mask[i], Low))
4711 return false;
4712 return true;
4713}
4714
4715/// Return true if every element in Mask, beginning
4716/// from position Pos and ending in Pos+Size, falls within the specified
4717 /// sequential range [Low, Low+Size), or is undef or is zero.
4718static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4719 unsigned Size, int Low) {
4720 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4721 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4722 return false;
4723 return true;
4724}
4725
4726/// Return true if every element in Mask, beginning
4727/// from position Pos and ending in Pos+Size is undef or is zero.
4728static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4729 unsigned Size) {
4730 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4731 if (!isUndefOrZero(Mask[i]))
4732 return false;
4733 return true;
4734}
4735
4736/// \brief Helper function to test whether a shuffle mask could be
4737/// simplified by widening the elements being shuffled.
4738///
4739/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4740/// leaves it in an unspecified state.
4741///
4742/// NOTE: This must handle normal vector shuffle masks and *target* vector
4743/// shuffle masks. The latter have the special property of a '-2' representing
4744/// a zero-ed lane of a vector.
4745static bool canWidenShuffleElements(ArrayRef<int> Mask,
4746 SmallVectorImpl<int> &WidenedMask) {
4747 WidenedMask.assign(Mask.size() / 2, 0);
4748 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4749 int M0 = Mask[i];
4750 int M1 = Mask[i + 1];
4751
4752 // If both elements are undef, it's trivial.
4753 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4754 WidenedMask[i / 2] = SM_SentinelUndef;
4755 continue;
4756 }
4757
4758 // Check for an undef mask and a mask value properly aligned to fit with
4759 // a pair of values. If we find such a case, use the non-undef mask's value.
4760 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4761 WidenedMask[i / 2] = M1 / 2;
4762 continue;
4763 }
4764 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4765 WidenedMask[i / 2] = M0 / 2;
4766 continue;
4767 }
4768
4769 // When zeroing, we need to spread the zeroing across both lanes to widen.
4770 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4771 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4772 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4773 WidenedMask[i / 2] = SM_SentinelZero;
4774 continue;
4775 }
4776 return false;
4777 }
4778
4779 // Finally check if the two mask values are adjacent and aligned with
4780 // a pair.
4781 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4782 WidenedMask[i / 2] = M0 / 2;
4783 continue;
4784 }
4785
4786 // Otherwise we can't safely widen the elements used in this shuffle.
4787 return false;
4788 }
4789 assert(WidenedMask.size() == Mask.size() / 2 &&
4790 "Incorrect size of mask after widening the elements!");
4791
4792 return true;
4793}
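// Worked example (a sketch): the v4i32 mask <0, 1, 6, 7> widens to the v2i64
// mask <0, 3> because each pair is adjacent and starts on an even element,
// whereas <1, 2, 4, 5> fails on the first pair (element 1 is not even-aligned)
// and the function returns false.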
4794
4795/// Returns true if Elt is a constant zero or a floating point constant +0.0.
4796bool X86::isZeroNode(SDValue Elt) {
4797 return isNullConstant(Elt) || isNullFPConstant(Elt);
4798}
4799
4800// Build a vector of constants.
4801// Use an UNDEF node if MaskElt == -1.
4802// Split 64-bit constants in the 32-bit mode.
4803static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4804 const SDLoc &dl, bool IsMask = false) {
4805
4806 SmallVector<SDValue, 32> Ops;
4807 bool Split = false;
4808
4809 MVT ConstVecVT = VT;
4810 unsigned NumElts = VT.getVectorNumElements();
4811 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4812 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4813 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4814 Split = true;
4815 }
4816
4817 MVT EltVT = ConstVecVT.getVectorElementType();
4818 for (unsigned i = 0; i < NumElts; ++i) {
4819 bool IsUndef = Values[i] < 0 && IsMask;
4820 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4821 DAG.getConstant(Values[i], dl, EltVT);
4822 Ops.push_back(OpNode);
4823 if (Split)
4824 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4825 DAG.getConstant(0, dl, EltVT));
4826 }
4827 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4828 if (Split)
4829 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4830 return ConstsNode;
4831}
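// Illustrative example (a sketch): building the v2i64 mask constant <1, -1>
// with IsMask = true on a 32-bit target (where i64 is not legal) produces a
// v4i32 build_vector <1, 0, undef, undef> that is then bitcast back to v2i64,
// with the -1 mask element becoming an UNDEF lane.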
4832
4833static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
4834 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4835 assert(Bits.size() == Undefs.getBitWidth() &&
4836 "Unequal constant and undef arrays");
4837 SmallVector<SDValue, 32> Ops;
4838 bool Split = false;
4839
4840 MVT ConstVecVT = VT;
4841 unsigned NumElts = VT.getVectorNumElements();
4842 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4843 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4844 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4845 Split = true;
4846 }
4847
4848 MVT EltVT = ConstVecVT.getVectorElementType();
4849 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4850 if (Undefs[i]) {
4851 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4852 continue;
4853 }
4854 const APInt &V = Bits[i];
4855 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4856 if (Split) {
4857 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
4858 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
4859 } else if (EltVT == MVT::f32) {
4860 APFloat FV(APFloat::IEEEsingle(), V);
4861 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4862 } else if (EltVT == MVT::f64) {
4863 APFloat FV(APFloat::IEEEdouble(), V);
4864 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4865 } else {
4866 Ops.push_back(DAG.getConstant(V, dl, EltVT));
4867 }
4868 }
4869
4870 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4871 return DAG.getBitcast(VT, ConstsNode);
4872}
4873
4874/// Returns a vector of specified type with all zero elements.
4875static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4876 SelectionDAG &DAG, const SDLoc &dl) {
4877 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4878 VT.getVectorElementType() == MVT::i1) &&
4879 "Unexpected vector type");
4880
4881 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4882 // type. This ensures they get CSE'd. But if the integer type is not
4883 // available, use a floating-point +0.0 instead.
4884 SDValue Vec;
4885 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4886 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4887 } else if (VT.getVectorElementType() == MVT::i1) {
4888 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4889 "Unexpected vector type");
4890 assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4891 "Unexpected vector type");
4892 Vec = DAG.getConstant(0, dl, VT);
4893 } else {
4894 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4895 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4896 }
4897 return DAG.getBitcast(VT, Vec);
4898}
4899
4900static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4901 const SDLoc &dl, unsigned vectorWidth) {
4902 EVT VT = Vec.getValueType();
4903 EVT ElVT = VT.getVectorElementType();
4904 unsigned Factor = VT.getSizeInBits()/vectorWidth;
4905 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4906 VT.getVectorNumElements()/Factor);
4907
4908 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4909 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4910 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4911
4912 // This is the index of the first element of the vectorWidth-bit chunk
4913 // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
4914 IdxVal &= ~(ElemsPerChunk - 1);
4915
4916 // If the input is a buildvector just emit a smaller one.
4917 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4918 return DAG.getBuildVector(ResultVT, dl,
4919 Vec->ops().slice(IdxVal, ElemsPerChunk));
4920
4921 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4922 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4923}
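
A minimal sketch of the power-of-two index alignment used by extractSubVector and insertSubVector, in plain C++ (alignIndexToChunk is a hypothetical name, not an LLVM helper):

#include <cassert>
#include <cstdio>

static bool isPowerOf2(unsigned X) { return X && (X & (X - 1)) == 0; }

// Clear the low bits of Idx so it points at the first element of the chunk
// containing it, exactly like "IdxVal &= ~(ElemsPerChunk - 1)" above.
static unsigned alignIndexToChunk(unsigned Idx, unsigned ElemsPerChunk) {
  assert(isPowerOf2(ElemsPerChunk) && "Elements per chunk not power of 2");
  return Idx & ~(ElemsPerChunk - 1);
}

int main() {
  // A 128-bit chunk of f32 holds 4 elements, so element index 6 rounds down to 4.
  std::printf("%u\n", alignIndexToChunk(6, 4)); // prints 4
  return 0;
}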
4924
4925/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4926/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4927/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4928/// instructions or a simple subregister reference. Idx is an index in the
4929/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4930/// lowering EXTRACT_VECTOR_ELT operations easier.
4931static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4932 SelectionDAG &DAG, const SDLoc &dl) {
4933 assert((Vec.getValueType().is256BitVector() ||
4934         Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4935 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4936}
4937
4938/// Generate a DAG to grab 256-bits from a 512-bit vector.
4939static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4940 SelectionDAG &DAG, const SDLoc &dl) {
4941 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4942 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4943}
4944
4945static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4946 SelectionDAG &DAG, const SDLoc &dl,
4947 unsigned vectorWidth) {
4948 assert((vectorWidth == 128 || vectorWidth == 256) &&
4949        "Unsupported vector width");
4950 // Inserting UNDEF just returns Result.
4951 if (Vec.isUndef())
4952 return Result;
4953 EVT VT = Vec.getValueType();
4954 EVT ElVT = VT.getVectorElementType();
4955 EVT ResultVT = Result.getValueType();
4956
4957 // Insert the relevant vectorWidth bits.
4958 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4959 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4960
4961 // This is the index of the first element of the vectorWidth-bit chunk
4962 // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
4963 IdxVal &= ~(ElemsPerChunk - 1);
4964
4965 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4966 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
4967}
4968
4969/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4970/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4971/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4972/// simple superregister reference. Idx is an index in the 128 bits
4973/// we want. It need not be aligned to a 128-bit boundary. That makes
4974/// lowering INSERT_VECTOR_ELT operations easier.
4975static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4976 SelectionDAG &DAG, const SDLoc &dl) {
4977 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4978 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4979}
4980
4981static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4982 SelectionDAG &DAG, const SDLoc &dl) {
4983 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
4984 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
4985}
4986
4987// Return true if the instruction zeroes the unused upper part of the
4988// destination and accepts mask.
4989static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
4990 switch (Opcode) {
4991 default:
4992 return false;
4993 case X86ISD::TESTM:
4994 case X86ISD::TESTNM:
4995 case X86ISD::PCMPEQM:
4996 case X86ISD::PCMPGTM:
4997 case X86ISD::CMPM:
4998 case X86ISD::CMPMU:
4999 case X86ISD::CMPM_RND:
5000 return true;
5001 }
5002}
5003
5004/// Insert i1-subvector to i1-vector.
5005static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5006 const X86Subtarget &Subtarget) {
5007
5008 SDLoc dl(Op);
5009 SDValue Vec = Op.getOperand(0);
5010 SDValue SubVec = Op.getOperand(1);
5011 SDValue Idx = Op.getOperand(2);
5012
5013 if (!isa<ConstantSDNode>(Idx))
5014 return SDValue();
5015
5016 // Inserting undef is a nop. We can just return the original vector.
5017 if (SubVec.isUndef())
5018 return Vec;
5019
5020 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5021 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5022 return Op;
5023
5024 MVT OpVT = Op.getSimpleValueType();
5025 unsigned NumElems = OpVT.getVectorNumElements();
5026
5027 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5028
5029 // Extend to natively supported kshift.
5030 MVT WideOpVT = OpVT;
5031 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
5032 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5033
5034 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
5035 // if necessary.
5036 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
5037 // May need to promote to a legal type.
5038 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5039 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5040 SubVec, Idx);
5041 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5042 }
5043
5044 MVT SubVecVT = SubVec.getSimpleValueType();
5045 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5046
5047 assert(IdxVal + SubVecNumElems <= NumElems &&
5048        IdxVal % SubVecVT.getSizeInBits() == 0 &&
5049        "Unexpected index value in INSERT_SUBVECTOR");
5050
5051 SDValue Undef = DAG.getUNDEF(WideOpVT);
5052
5053 if (IdxVal == 0) {
5054 // Zero lower bits of the Vec
5055 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5056 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
5057 ZeroIdx);
5058 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5059 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5060 // Merge them together, SubVec should be zero extended.
5061 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5062 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5063 SubVec, ZeroIdx);
5064 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5065 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op,
5066 ZeroIdx);
5067 }
5068
5069 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5070 Undef, SubVec, ZeroIdx);
5071
5072 if (Vec.isUndef()) {
5073 assert(IdxVal != 0 && "Unexpected index");
5074 Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5075 DAG.getConstant(IdxVal, dl, MVT::i8));
5076 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5077 }
5078
5079 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5080 assert(IdxVal != 0 && "Unexpected index");
5081 NumElems = WideOpVT.getVectorNumElements();
5082 unsigned ShiftLeft = NumElems - SubVecNumElems;
5083 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5084 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5085 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5086 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
5087 DAG.getConstant(ShiftRight, dl, MVT::i8));
5088 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5089 }
5090
5091 // Simple case when we put subvector in the upper part
5092 if (IdxVal + SubVecNumElems == NumElems) {
5093 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5094 DAG.getConstant(IdxVal, dl, MVT::i8));
5095 if (SubVecNumElems * 2 == NumElems) {
5096 // Special case, use legal zero extending insert_subvector. This allows
5097 // isel to optimize when bits are known zero.
5098 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
5099 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5100 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5101 Vec, ZeroIdx);
5102 } else {
5103 // Otherwise use explicit shifts to zero the bits.
5104 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5105 Undef, Vec, ZeroIdx);
5106 NumElems = WideOpVT.getVectorNumElements();
5107 SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
5108 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5109 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5110 }
5111 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5112 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5113 }
5114
5115 // Inserting into the middle is more complicated.
5116
5117 NumElems = WideOpVT.getVectorNumElements();
5118
5119 // Widen the vector if needed.
5120 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5121 // Move the current value of the bits to be replaced to the lsbs.
5122 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5123 DAG.getConstant(IdxVal, dl, MVT::i8));
5124 // Xor with the new bit.
5125 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
5126 // Shift to MSB, filling bottom bits with 0.
5127 unsigned ShiftLeft = NumElems - SubVecNumElems;
5128 Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
5129 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5130 // Shift to the final position, filling upper bits with 0.
5131 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5132 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
5133 DAG.getConstant(ShiftRight, dl, MVT::i8));
5134 // Xor with original vector leaving the new value.
5135 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
5136 // Reduce to original width if needed.
5137 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5138}
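
The middle-insertion path above is easier to follow on a plain bitmask. A minimal standalone sketch, assuming the mask fits in 64 bits and using ordinary shifts in place of KSHIFTL/KSHIFTR (insertBitField is a hypothetical name):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Insert the low SubNumElems bits of SubVec into Vec at bit position Idx,
// following the same shift/xor sequence as insert1BitVector's general case.
static uint64_t insertBitField(uint64_t Vec, uint64_t SubVec, unsigned NumElems,
                               unsigned SubNumElems, unsigned Idx) {
  assert(NumElems <= 64 && Idx + SubNumElems <= NumElems && "out of range");
  uint64_t WidthMask = (NumElems == 64) ? ~0ULL : ((1ULL << NumElems) - 1);
  uint64_t SubMask = (SubNumElems == 64) ? ~0ULL : ((1ULL << SubNumElems) - 1);
  SubVec &= SubMask;                          // zero-extend the subvector bits
  uint64_t Op = (Vec & WidthMask) >> Idx;     // old bits down to the lsbs (KSHIFTR)
  Op ^= SubVec;                               // xor with the new bits
  unsigned ShiftLeft = NumElems - SubNumElems;
  Op = (Op << ShiftLeft) & WidthMask;         // to the msbs, zeroing below (KSHIFTL)
  unsigned ShiftRight = NumElems - SubNumElems - Idx;
  Op >>= ShiftRight;                          // to the final position, zeroing above
  return (Vec ^ Op) & WidthMask;              // xor restores Vec outside the field
}

int main() {
  // Replace bits [3,4] of 0b11111111 with 0b01 -> 0b11101111 (0xef).
  std::printf("0x%llx\n",
              (unsigned long long)insertBitField(0xff, 0x1, 8, 2, 3));
  return 0;
}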
5139
5140/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
5141/// instructions. This is used because creating CONCAT_VECTOR nodes of
5142/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5143/// large BUILD_VECTORS.
5144static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5145 unsigned NumElems, SelectionDAG &DAG,
5146 const SDLoc &dl) {
5147 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5148 return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
5149}
5150
5151static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5152 unsigned NumElems, SelectionDAG &DAG,
5153 const SDLoc &dl) {
5154 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5155 return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5156}
5157
5158/// Returns a vector of specified type with all bits set.
5159/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5160/// Then bitcast to their original type, ensuring they get CSE'd.
5161static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5162 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5163        "Expected a 128/256/512-bit vector type");
5164
5165 APInt Ones = APInt::getAllOnesValue(32);
5166 unsigned NumElts = VT.getSizeInBits() / 32;
5167 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5168 return DAG.getBitcast(VT, Vec);
5169}
5170
5171static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5172 SelectionDAG &DAG) {
5173 EVT InVT = In.getValueType();
5174 assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5175
5176 if (VT.is128BitVector() && InVT.is128BitVector())
5177 return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5178 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5179
5180 // For 256-bit vectors, we only need the lower (128-bit) input half.
5181 // For 512-bit vectors, we only need the lower input half or quarter.
5182 if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5183 int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5184 In = extractSubVector(In, 0, DAG, DL,
5185 std::max(128, (int)VT.getSizeInBits() / Scale));
5186 }
5187
5188 return DAG.getNode(Opc, DL, VT, In);
5189}
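
A small sketch of the arithmetic deciding how much of the input getExtendInVec actually reads in the 256/512-bit cases (plain C++; extendInputBits is a hypothetical helper name):

#include <algorithm>
#include <cstdio>

// Number of low input bits needed to extend elements from SrcEltBits to
// DstEltBits when producing a DstSizeInBits-wide result, matching
// "std::max(128, (int)VT.getSizeInBits() / Scale)" above.
static int extendInputBits(int DstSizeInBits, int DstEltBits, int SrcEltBits) {
  int Scale = DstEltBits / SrcEltBits;
  return std::max(128, DstSizeInBits / Scale);
}

int main() {
  std::printf("%d\n", extendInputBits(512, 64, 16)); // v8i64 from i16: 128 bits
  std::printf("%d\n", extendInputBits(512, 32, 16)); // v16i32 from i16: 256 bits
  return 0;
}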
5190
5191/// Returns a vector_shuffle node for an unpackl operation.
5192static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5193 SDValue V1, SDValue V2) {
5194 SmallVector<int, 8> Mask;
5195 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5196 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5197}
5198
5199/// Returns a vector_shuffle node for an unpackh operation.
5200static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5201 SDValue V1, SDValue V2) {
5202 SmallVector<int, 8> Mask;
5203 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5204 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5205}
5206
5207/// Return a vector_shuffle of the specified vector with a zero or undef vector.
5208/// This produces a shuffle where the low element of V2 is swizzled into the
5209/// zero/undef vector, landing at element Idx.
5210/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5211static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5212 bool IsZero,
5213 const X86Subtarget &Subtarget,
5214 SelectionDAG &DAG) {
5215 MVT VT = V2.getSimpleValueType();
5216 SDValue V1 = IsZero
5217 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5218 int NumElems = VT.getVectorNumElements();
5219 SmallVector<int, 16> MaskVec(NumElems);
5220 for (int i = 0; i != NumElems; ++i)
5221 // If this is the insertion idx, put the low elt of V2 here.
5222 MaskVec[i] = (i == Idx) ? NumElems : i;
5223 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5224}
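
A small sketch of the mask this helper builds (plain C++; zeroOrUndefInsertMask is a hypothetical name):

#include <cstdio>
#include <vector>

// Index NumElems refers to the low element of V2; every other slot keeps the
// corresponding element of the zero/undef vector V1.
static std::vector<int> zeroOrUndefInsertMask(int NumElems, int Idx) {
  std::vector<int> Mask(NumElems);
  for (int I = 0; I != NumElems; ++I)
    Mask[I] = (I == Idx) ? NumElems : I;
  return Mask;
}

int main() {
  for (int Idx : {0, 3}) {
    for (int M : zeroOrUndefInsertMask(4, Idx)) // 4 1 2 3 and 0 1 2 4
      std::printf("%d ", M);
    std::printf("\n");
  }
  return 0;
}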
5225
5226static SDValue peekThroughBitcasts(SDValue V) {
5227 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5228 V = V.getOperand(0);
5229 return V;
5230}
5231
5232static SDValue peekThroughOneUseBitcasts(SDValue V) {
5233 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5234 V.getOperand(0).hasOneUse())
5235 V = V.getOperand(0);
5236 return V;
5237}
5238
5239static const Constant *getTargetConstantFromNode(SDValue Op) {
5240 Op = peekThroughBitcasts(Op);
5241
5242 auto *Load = dyn_cast<LoadSDNode>(Op);
5243 if (!Load)
5244 return nullptr;
5245
5246 SDValue Ptr = Load->getBasePtr();
5247 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5248 Ptr->getOpcode() == X86ISD::WrapperRIP)
5249 Ptr = Ptr->getOperand(0);
5250
5251 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5252 if (!CNode || CNode->isMachineConstantPoolEntry())
5253 return nullptr;
5254
5255 return dyn_cast<Constant>(CNode->getConstVal());
5256}
5257
5258// Extract raw constant bits from constant pools.
5259static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5260 APInt &UndefElts,
5261 SmallVectorImpl<APInt> &EltBits,
5262 bool AllowWholeUndefs = true,
5263 bool AllowPartialUndefs = true) {
5264 assert(EltBits.empty() && "Expected an empty EltBits vector");
5265
5266 Op = peekThroughBitcasts(Op);
5267
5268 EVT VT = Op.getValueType();
5269 unsigned SizeInBits = VT.getSizeInBits();
5270 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5271 unsigned NumElts = SizeInBits / EltSizeInBits;
5272
5273 // Bitcast a source array of element bits to the target size.
5274 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5275 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5276 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5277 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5278        "Constant bit sizes don't match");
5279
5280 // Don't split if we don't allow undef bits.
5281 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5282 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5283 return false;
5284
5285 // If we're already the right size, don't bother bitcasting.
5286 if (NumSrcElts == NumElts) {
5287 UndefElts = UndefSrcElts;
5288 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5289 return true;
5290 }
5291
5292 // Extract all the undef/constant element data and pack into single bitsets.
5293 APInt UndefBits(SizeInBits, 0);
5294 APInt MaskBits(SizeInBits, 0);
5295
5296 for (unsigned i = 0; i != NumSrcElts; ++i) {
5297 unsigned BitOffset = i * SrcEltSizeInBits;
5298 if (UndefSrcElts[i])
5299 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5300 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5301 }
5302
5303 // Split the undef/constant single bitset data into the target elements.
5304 UndefElts = APInt(NumElts, 0);
5305 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5306
5307 for (unsigned i = 0; i != NumElts; ++i) {
5308 unsigned BitOffset = i * EltSizeInBits;
5309 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5310
5311 // Only treat an element as UNDEF if all bits are UNDEF.
5312 if (UndefEltBits.isAllOnesValue()) {
5313 if (!AllowWholeUndefs)
5314 return false;
5315 UndefElts.setBit(i);
5316 continue;
5317 }
5318
5319 // If only some bits are UNDEF then treat them as zero (or bail if not
5320 // supported).
5321 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5322 return false;
5323
5324 APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5325 EltBits[i] = Bits.getZExtValue();
5326 }
5327 return true;
5328 };
5329
5330 // Collect constant bits and insert into mask/undef bit masks.
5331 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5332 unsigned UndefBitIndex) {
5333 if (!Cst)
5334 return false;
5335 if (isa<UndefValue>(Cst)) {
5336 Undefs.setBit(UndefBitIndex);
5337 return true;
5338 }
5339 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5340 Mask = CInt->getValue();
5341 return true;
5342 }
5343 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5344 Mask = CFP->getValueAPF().bitcastToAPInt();
5345 return true;
5346 }
5347 return false;
5348 };
5349
5350 // Handle UNDEFs.
5351 if (Op.isUndef()) {
5352 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
5353 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5354 return CastBitData(UndefSrcElts, SrcEltBits);
5355 }
5356
5357 // Extract scalar constant bits.
5358 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5359 APInt UndefSrcElts = APInt::getNullValue(1);
5360 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5361 return CastBitData(UndefSrcElts, SrcEltBits);
5362 }
5363
5364 // Extract constant bits from build vector.
5365 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5366 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5367 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5368
5369 APInt UndefSrcElts(NumSrcElts, 0);
5370 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5371 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5372 const SDValue &Src = Op.getOperand(i);
5373 if (Src.isUndef()) {
5374 UndefSrcElts.setBit(i);
5375 continue;
5376 }
5377 auto *Cst = cast<ConstantSDNode>(Src);
5378 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5379 }
5380 return CastBitData(UndefSrcElts, SrcEltBits);
5381 }
5382
5383 // Extract constant bits from constant pool vector.
5384 if (auto *Cst = getTargetConstantFromNode(Op)) {
5385 Type *CstTy = Cst->getType();
5386 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5387 return false;
5388
5389 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5390 unsigned NumSrcElts = CstTy->getVectorNumElements();
5391
5392 APInt UndefSrcElts(NumSrcElts, 0);
5393 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5394 for (unsigned i = 0; i != NumSrcElts; ++i)
5395 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5396 UndefSrcElts, i))
5397 return false;
5398
5399 return CastBitData(UndefSrcElts, SrcEltBits);
5400 }
5401
5402 // Extract constant bits from a broadcasted constant pool scalar.
5403 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5404 EltSizeInBits <= VT.getScalarSizeInBits()) {
5405 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5406 unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
5407 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5408
5409 APInt UndefSrcElts(NumSrcElts, 0);
5410 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5411 if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
5412 if (UndefSrcElts[0])
5413 UndefSrcElts.setBits(0, NumSrcElts);
5414 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5415 return CastBitData(UndefSrcElts, SrcEltBits);
5416 }
5417 }
5418 }
5419
5420 // Extract a rematerialized scalar constant insertion.
5421 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5422 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5423 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5424 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5425 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5426
5427 APInt UndefSrcElts(NumSrcElts, 0);
5428 SmallVector<APInt, 64> SrcEltBits;
5429 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5430 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
5431 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5432 return CastBitData(UndefSrcElts, SrcEltBits);
5433 }
5434
5435 return false;
5436}
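
The CastBitData lambda above packs the source elements into one flat bitset and then re-splits it at the requested element size. A simplified standalone sketch, assuming the whole constant fits in 64 bits (the real code uses APInt; repackBits is a hypothetical name, and the undef tracking is omitted):

#include <cstdint>
#include <cstdio>
#include <vector>

static uint64_t lowBitsMask(unsigned Bits) {
  return (Bits >= 64) ? ~0ULL : ((1ULL << Bits) - 1);
}

// Re-interpret SrcEltBits-wide elements as DstEltBits-wide elements, keeping
// the overall little-endian bit layout. Total width must not exceed 64 bits.
static std::vector<uint64_t> repackBits(const std::vector<uint64_t> &SrcElts,
                                        unsigned SrcEltBits, unsigned DstEltBits) {
  uint64_t Bits = 0;
  for (unsigned I = 0; I != SrcElts.size(); ++I)
    Bits |= (SrcElts[I] & lowBitsMask(SrcEltBits)) << (I * SrcEltBits);

  unsigned TotalBits = SrcEltBits * (unsigned)SrcElts.size();
  std::vector<uint64_t> DstElts(TotalBits / DstEltBits);
  for (unsigned I = 0; I != DstElts.size(); ++I)
    DstElts[I] = (Bits >> (I * DstEltBits)) & lowBitsMask(DstEltBits);
  return DstElts;
}

int main() {
  // Two i16 constants 0x1234, 0xabcd seen as four i8 elements.
  for (uint64_t E : repackBits({0x1234, 0xabcd}, 16, 8))
    std::printf("0x%02llx ", (unsigned long long)E); // 0x34 0x12 0xcd 0xab
  std::printf("\n");
  return 0;
}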
5437
5438static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5439 unsigned MaskEltSizeInBits,
5440 SmallVectorImpl<uint64_t> &RawMask) {
5441 APInt UndefElts;
5442 SmallVector<APInt, 64> EltBits;
5443
5444 // Extract the raw target constant bits.
5445 // FIXME: We currently don't support UNDEF bits or mask entries.
5446 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5447 EltBits, /* AllowWholeUndefs */ false,
5448 /* AllowPartialUndefs */ false))
5449 return false;
5450
5451 // Insert the extracted elements into the mask.
5452 for (APInt Elt : EltBits)
5453 RawMask.push_back(Elt.getZExtValue());
5454
5455 return true;
5456}
5457
5458/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5459/// Note: This ignores saturation, so inputs must be checked first.
5460static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5461 bool Unary) {
5462 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5463 unsigned NumElts = VT.getVectorNumElements();
5464 unsigned NumLanes = VT.getSizeInBits() / 128;
5465 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5466 unsigned Offset = Unary ? 0 : NumElts;
5467
5468 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5469 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5470 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5471 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5472 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5473 }
5474}
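
A small standalone sketch of the pack mask for a single 128-bit lane (plain C++; packMaskForLane is a hypothetical name). The even-indexed elements of each bitcast input are the truncated values that survive PACKSS/PACKUS:

#include <cstdio>
#include <vector>

// NumElts is the element count of the packed destination type for one lane;
// indices >= NumElts refer to the second input (Unary reuses the first).
static std::vector<int> packMaskForLane(int NumElts, bool Unary) {
  std::vector<int> Mask;
  int Offset = Unary ? 0 : NumElts;
  for (int Elt = 0; Elt != NumElts; Elt += 2)
    Mask.push_back(Elt);
  for (int Elt = 0; Elt != NumElts; Elt += 2)
    Mask.push_back(Elt + Offset);
  return Mask;
}

int main() {
  // v16i8 result from two v8i16 inputs: 0 2 4 ... 14 16 18 ... 30
  for (int M : packMaskForLane(16, /*Unary=*/false))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}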
5475
5476/// Calculates the shuffle mask corresponding to the target-specific opcode.
5477/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5478/// operands in \p Ops, and returns true.
5479/// Sets \p IsUnary to true if only one source is used. Note that this will set
5480/// IsUnary for shuffles which use a single input multiple times, and in those
5481/// cases it will adjust the mask to only have indices within that single input.
5482/// It is an error to call this with non-empty Mask/Ops vectors.
5483static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5484 SmallVectorImpl<SDValue> &Ops,
5485 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5486 unsigned NumElems = VT.getVectorNumElements();
5487 SDValue ImmN;
5488
5489 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5490 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5491
5492 IsUnary = false;
5493 bool IsFakeUnary = false;
5494 switch(N->getOpcode()) {
5495 case X86ISD::BLENDI:
5496 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5497 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5498 ImmN = N->getOperand(N->getNumOperands()-1);
5499 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5500 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5501 break;
5502 case X86ISD::SHUFP:
5503 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5504 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5505 ImmN = N->getOperand(N->getNumOperands()-1);
5506 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5507 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5508 break;
5509 case X86ISD::INSERTPS:
5510 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5511 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5512 ImmN = N->getOperand(N->getNumOperands()-1);
5513 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5514 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5515 break;
5516 case X86ISD::EXTRQI:
5517 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5518 if (isa<ConstantSDNode>(N->getOperand(1)) &&
5519 isa<ConstantSDNode>(N->getOperand(2))) {
5520 int BitLen = N->getConstantOperandVal(1);
5521 int BitIdx = N->getConstantOperandVal(2);
5522 DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
5523 IsUnary = true;
5524 }
5525 break;
5526 case X86ISD::INSERTQI:
5527 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5528 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5529 if (isa<ConstantSDNode>(N->getOperand(2)) &&
5530 isa<ConstantSDNode>(N->getOperand(3))) {
5531 int BitLen = N->getConstantOperandVal(2);
5532 int BitIdx = N->getConstantOperandVal(3);
5533 DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
5534 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5535 }
5536 break;
5537 case X86ISD::UNPCKH:
5538 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5539 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5540 DecodeUNPCKHMask(VT, Mask);
5541 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5542 break;
5543 case X86ISD::UNPCKL:
5544 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5545 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5546 DecodeUNPCKLMask(VT, Mask);
5547 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5548 break;
5549 case X86ISD::MOVHLPS:
5550 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5551 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5552 DecodeMOVHLPSMask(NumElems, Mask);
5553 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5554 break;
5555 case X86ISD::MOVLHPS:
5556 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5557 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5558 DecodeMOVLHPSMask(NumElems, Mask);
5559 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5560 break;
5561 case X86ISD::PALIGNR:
5562 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5563 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5564 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5565 ImmN = N->getOperand(N->getNumOperands()-1);
5566 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5567 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5568 Ops.push_back(N->getOperand(1));
5569 Ops.push_back(N->getOperand(0));
5570 break;
5571 case X86ISD::VSHLDQ:
5572 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5573 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5574 ImmN = N->getOperand(N->getNumOperands() - 1);
5575 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5576 IsUnary = true;
5577 break;
5578 case X86ISD::VSRLDQ:
5579 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5580 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5581 ImmN = N->getOperand(N->getNumOperands() - 1);
5582 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5583 IsUnary = true;
5584 break;
5585 case X86ISD::PSHUFD:
5586 case X86ISD::VPERMILPI:
5587 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5588 ImmN = N->getOperand(N->getNumOperands()-1);
5589 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5590 IsUnary = true;
5591 break;
5592 case X86ISD::PSHUFHW:
5593 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5594 ImmN = N->getOperand(N->getNumOperands()-1);
5595 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5596 IsUnary = true;
5597 break;
5598 case X86ISD::PSHUFLW:
5599 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5600 ImmN = N->getOperand(N->getNumOperands()-1);
5601 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5602 IsUnary = true;
5603 break;
5604 case X86ISD::VZEXT_MOVL:
5605 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5606 DecodeZeroMoveLowMask(VT, Mask);
5607 IsUnary = true;
5608 break;
5609 case X86ISD::VBROADCAST: {
5610 SDValue N0 = N->getOperand(0);
5611 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5612 // add the pre-extracted value to the Ops vector.
5613 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5614 N0.getOperand(0).getValueType() == VT &&
5615 N0.getConstantOperandVal(1) == 0)
5616 Ops.push_back(N0.getOperand(0));
5617
5618 // We only decode broadcasts of same-sized vectors, unless the broadcast
5619 // came from an extract from the original width. If we found one, we
5620 // pushed it to the Ops vector above.
5621 if (N0.getValueType() == VT || !Ops.empty()) {
5622 DecodeVectorBroadcast(VT, Mask);
5623 IsUnary = true;
5624 break;
5625 }
5626 return false;
5627 }
5628 case X86ISD::VPERMILPV: {
5629 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5630 IsUnary = true;
5631 SDValue MaskNode = N->getOperand(1);
5632 unsigned MaskEltSize = VT.getScalarSizeInBits();
5633 SmallVector<uint64_t, 32> RawMask;
5634 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5635 DecodeVPERMILPMask(VT, RawMask, Mask);
5636 break;
5637 }
5638 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5639 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5640 break;
5641 }
5642 return false;
5643 }
5644 case X86ISD::PSHUFB: {
5645 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5646 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5647 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5648 IsUnary = true;
5649 SDValue MaskNode = N->getOperand(1);
5650 SmallVector<uint64_t, 32> RawMask;
5651 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5652 DecodePSHUFBMask(RawMask, Mask);
5653 break;
5654 }
5655 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5656 DecodePSHUFBMask(C, Mask);
5657 break;
5658 }
5659 return false;
5660 }
5661 case X86ISD::VPERMI:
5662 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5663 ImmN = N->getOperand(N->getNumOperands()-1);
5664 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5665 IsUnary = true;
5666 break;
5667 case X86ISD::MOVSS:
5668 case X86ISD::MOVSD:
5669 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5670 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5671 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5672 break;
5673 case X86ISD::VPERM2X128:
5674 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5675 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5676 ImmN = N->getOperand(N->getNumOperands()-1);
5677 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5678 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5679 break;
5680 case X86ISD::MOVSLDUP:
5681 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5682 DecodeMOVSLDUPMask(VT, Mask);
5683 IsUnary = true;
5684 break;
5685 case X86ISD::MOVSHDUP:
5686 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5687 DecodeMOVSHDUPMask(VT, Mask);
5688 IsUnary = true;
5689 break;
5690 case X86ISD::MOVDDUP:
5691 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5692 DecodeMOVDDUPMask(VT, Mask);
5693 IsUnary = true;
5694 break;
5695 case X86ISD::MOVLPD:
5696 case X86ISD::MOVLPS:
5697 // Not yet implemented
5698 return false;
5699 case X86ISD::VPERMIL2: {
5700 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5701 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5702 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5703 unsigned MaskEltSize = VT.getScalarSizeInBits();
5704 SDValue MaskNode = N->getOperand(2);
5705 SDValue CtrlNode = N->getOperand(3);
5706 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5707 unsigned CtrlImm = CtrlOp->getZExtValue();
5708 SmallVector<uint64_t, 32> RawMask;
5709 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5710 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5711 break;
5712 }
5713 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5714 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5715 break;
5716 }
5717 }
5718 return false;
5719 }
5720 case X86ISD::VPPERM: {
5721 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5722 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5723 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5724 SDValue MaskNode = N->getOperand(2);
5725 SmallVector<uint64_t, 32> RawMask;
5726 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5727 DecodeVPPERMMask(RawMask, Mask);
5728 break;
5729 }
5730 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5731 DecodeVPPERMMask(C, Mask);
5732 break;
5733 }
5734 return false;
5735 }
5736 case X86ISD::VPERMV: {
5737 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5738 IsUnary = true;
5739 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5740 Ops.push_back(N->getOperand(1));
5741 SDValue MaskNode = N->getOperand(0);
5742 SmallVector<uint64_t, 32> RawMask;
5743 unsigned MaskEltSize = VT.getScalarSizeInBits();
5744 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5745 DecodeVPERMVMask(RawMask, Mask);
5746 break;
5747 }
5748 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5749 DecodeVPERMVMask(C, MaskEltSize, Mask);
5750 break;
5751 }
5752 return false;
5753 }
5754 case X86ISD::VPERMV3: {
 5755    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
 5756    assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
5757 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5758 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5759 Ops.push_back(N->getOperand(0));
5760 Ops.push_back(N->getOperand(2));
5761 SDValue MaskNode = N->getOperand(1);
5762 unsigned MaskEltSize = VT.getScalarSizeInBits();
5763 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5764 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5765 break;
5766 }
5767 return false;
5768 }
5769 case X86ISD::VPERMIV3: {
 5770    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
 5771    assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
5772 IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5773 // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5774 Ops.push_back(N->getOperand(1));
5775 Ops.push_back(N->getOperand(2));
5776 SDValue MaskNode = N->getOperand(0);
5777 unsigned MaskEltSize = VT.getScalarSizeInBits();
5778 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5779 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5780 break;
5781 }
5782 return false;
5783 }
 5784  default: llvm_unreachable("unknown target shuffle node");
5785 }
5786
5787 // Empty mask indicates the decode failed.
5788 if (Mask.empty())
5789 return false;
5790
5791 // Check if we're getting a shuffle mask with zero'd elements.
5792 if (!AllowSentinelZero)
5793 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5794 return false;
5795
5796 // If we have a fake unary shuffle, the shuffle mask is spread across two
5797 // inputs that are actually the same node. Re-map the mask to always point
5798 // into the first input.
5799 if (IsFakeUnary)
5800 for (int &M : Mask)
5801 if (M >= (int)Mask.size())
5802 M -= Mask.size();
5803
5804 // If we didn't already add operands in the opcode-specific code, default to
5805 // adding 1 or 2 operands starting at 0.
5806 if (Ops.empty()) {
5807 Ops.push_back(N->getOperand(0));
5808 if (!IsUnary || IsFakeUnary)
5809 Ops.push_back(N->getOperand(1));
5810 }
5811
5812 return true;
5813}
5814
5815/// Check a target shuffle mask's inputs to see if we can set any values to
5816/// SM_SentinelZero - this is for elements that are known to be zero
5817/// (not just zeroable) from their inputs.
5818/// Returns true if the target shuffle mask was decoded.
5819static bool setTargetShuffleZeroElements(SDValue N,
5820 SmallVectorImpl<int> &Mask,
5821 SmallVectorImpl<SDValue> &Ops) {
5822 bool IsUnary;
5823 if (!isTargetShuffle(N.getOpcode()))
5824 return false;
5825
5826 MVT VT = N.getSimpleValueType();
5827 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5828 return false;
5829
5830 SDValue V1 = Ops[0];
5831 SDValue V2 = IsUnary ? V1 : Ops[1];
5832
5833 V1 = peekThroughBitcasts(V1);
5834 V2 = peekThroughBitcasts(V2);
5835
 5836  assert((VT.getSizeInBits() % Mask.size()) == 0 &&
 5837         "Illegal split of shuffle value type");
5838 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
5839
5840 // Extract known constant input data.
5841 APInt UndefSrcElts[2];
5842 SmallVector<APInt, 32> SrcEltBits[2];
5843 bool IsSrcConstant[2] = {
5844 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5845 SrcEltBits[0], true, false),
5846 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5847 SrcEltBits[1], true, false)};
5848
5849 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5850 int M = Mask[i];
5851
5852 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5853 if (M < 0)
5854 continue;
5855
5856 // Determine shuffle input and normalize the mask.
5857 unsigned SrcIdx = M / Size;
5858 SDValue V = M < Size ? V1 : V2;
5859 M %= Size;
5860
5861 // We are referencing an UNDEF input.
5862 if (V.isUndef()) {
5863 Mask[i] = SM_SentinelUndef;
5864 continue;
5865 }
5866
5867 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5868 // TODO: We currently only set UNDEF for integer types - floats use the same
5869 // registers as vectors and many of the scalar folded loads rely on the
5870 // SCALAR_TO_VECTOR pattern.
5871 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5872 (Size % V.getValueType().getVectorNumElements()) == 0) {
5873 int Scale = Size / V.getValueType().getVectorNumElements();
5874 int Idx = M / Scale;
5875 if (Idx != 0 && !VT.isFloatingPoint())
5876 Mask[i] = SM_SentinelUndef;
5877 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5878 Mask[i] = SM_SentinelZero;
5879 continue;
5880 }
5881
5882 // Attempt to extract from the source's constant bits.
5883 if (IsSrcConstant[SrcIdx]) {
5884 if (UndefSrcElts[SrcIdx][M])
5885 Mask[i] = SM_SentinelUndef;
5886 else if (SrcEltBits[SrcIdx][M] == 0)
5887 Mask[i] = SM_SentinelZero;
5888 }
5889 }
5890
 5891  assert(VT.getVectorNumElements() == Mask.size() &&
 5892         "Different mask size from vector size!");
5893 return true;
5894}
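To make the sentinel handling above easier to follow, here is a simplified, self-contained sketch (invented names, not the LLVM API) of the core rewrite setTargetShuffleZeroElements performs: mask entries whose source element is undef become SM_SentinelUndef, and entries whose source element is a known zero constant become SM_SentinelZero.

#include <vector>

// Simplified illustration only; SentinelUndef/SentinelZero stand in for
// SM_SentinelUndef/SM_SentinelZero.
constexpr int SentinelUndef = -1;
constexpr int SentinelZero = -2;

// SrcIsUndef/SrcIsZero describe the concatenated shuffle inputs (V1 then V2),
// one flag per element, mirroring the UndefSrcElts/SrcEltBits checks above.
void foldKnownZeroElements(std::vector<int> &Mask,
                           const std::vector<bool> &SrcIsUndef,
                           const std::vector<bool> &SrcIsZero) {
  for (int &M : Mask) {
    if (M < 0)
      continue;              // already decoded as a sentinel
    if (SrcIsUndef[M])
      M = SentinelUndef;     // element comes from an undef input
    else if (SrcIsZero[M])
      M = SentinelZero;      // element is known to be zero
  }
}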
5895
5896// Attempt to decode ops that could be represented as a shuffle mask.
 5897 // The decoded shuffle mask may contain a different number of elements than the
 5898 // destination value type.
5899static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5900 SmallVectorImpl<SDValue> &Ops,
5901 SelectionDAG &DAG) {
5902 Mask.clear();
5903 Ops.clear();
5904
5905 MVT VT = N.getSimpleValueType();
5906 unsigned NumElts = VT.getVectorNumElements();
5907 unsigned NumSizeInBits = VT.getSizeInBits();
5908 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
 5909  assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
 5910         "Expected byte aligned value types");
5911
5912 unsigned Opcode = N.getOpcode();
5913 switch (Opcode) {
5914 case ISD::AND:
5915 case X86ISD::ANDNP: {
5916 // Attempt to decode as a per-byte mask.
5917 APInt UndefElts;
5918 SmallVector<APInt, 32> EltBits;
5919 SDValue N0 = N.getOperand(0);
5920 SDValue N1 = N.getOperand(1);
5921 bool IsAndN = (X86ISD::ANDNP == Opcode);
5922 uint64_t ZeroMask = IsAndN ? 255 : 0;
5923 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
5924 return false;
5925 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5926 if (UndefElts[i]) {
5927 Mask.push_back(SM_SentinelUndef);
5928 continue;
5929 }
5930 uint64_t ByteBits = EltBits[i].getZExtValue();
5931 if (ByteBits != 0 && ByteBits != 255)
5932 return false;
5933 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5934 }
5935 Ops.push_back(IsAndN ? N1 : N0);
5936 return true;
5937 }
5938 case ISD::SCALAR_TO_VECTOR: {
5939 // Match against a scalar_to_vector of an extract from a vector,
5940 // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
5941 SDValue N0 = N.getOperand(0);
5942 SDValue SrcExtract;
5943
5944 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
5945 N0.getOperand(0).getValueType() == VT) ||
5946 (N0.getOpcode() == X86ISD::PEXTRW &&
5947 N0.getOperand(0).getValueType() == MVT::v8i16) ||
5948 (N0.getOpcode() == X86ISD::PEXTRB &&
5949 N0.getOperand(0).getValueType() == MVT::v16i8)) {
5950 SrcExtract = N0;
5951 }
5952
5953 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
5954 return false;
5955
5956 SDValue SrcVec = SrcExtract.getOperand(0);
5957 EVT SrcVT = SrcVec.getValueType();
5958 unsigned NumSrcElts = SrcVT.getVectorNumElements();
5959 unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
5960
5961 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
5962 if (NumSrcElts <= SrcIdx)
5963 return false;
5964
5965 Ops.push_back(SrcVec);
5966 Mask.push_back(SrcIdx);
5967 Mask.append(NumZeros, SM_SentinelZero);
5968 Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
5969 return true;
5970 }
5971 case X86ISD::PINSRB:
5972 case X86ISD::PINSRW: {
5973 SDValue InVec = N.getOperand(0);
5974 SDValue InScl = N.getOperand(1);
5975 uint64_t InIdx = N.getConstantOperandVal(2);
 5976    assert(InIdx < NumElts && "Illegal insertion index");
5977
5978 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
5979 if (X86::isZeroNode(InScl)) {
5980 Ops.push_back(InVec);
5981 for (unsigned i = 0; i != NumElts; ++i)
5982 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
5983 return true;
5984 }
5985
5986 // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
5987 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
5988 unsigned ExOp =
5989 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
5990 if (InScl.getOpcode() != ExOp)
5991 return false;
5992
5993 SDValue ExVec = InScl.getOperand(0);
5994 uint64_t ExIdx = InScl.getConstantOperandVal(1);
 5995    assert(ExIdx < NumElts && "Illegal extraction index");
5996 Ops.push_back(InVec);
5997 Ops.push_back(ExVec);
5998 for (unsigned i = 0; i != NumElts; ++i)
5999 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
6000 return true;
6001 }
6002 case X86ISD::PACKSS:
6003 case X86ISD::PACKUS: {
6004 SDValue N0 = N.getOperand(0);
6005 SDValue N1 = N.getOperand(1);
 6006    assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
 6007           N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
 6008           "Unexpected input value type");
6009
6010 // If we know input saturation won't happen we can treat this
6011 // as a truncation shuffle.
6012 if (Opcode == X86ISD::PACKSS) {
6013 if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
6014 (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
6015 return false;
6016 } else {
6017 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6018 if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
6019 (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
6020 return false;
6021 }
6022
6023 bool IsUnary = (N0 == N1);
6024
6025 Ops.push_back(N0);
6026 if (!IsUnary)
6027 Ops.push_back(N1);
6028
6029 createPackShuffleMask(VT, Mask, IsUnary);
6030 return true;
6031 }
6032 case X86ISD::VSHLI:
6033 case X86ISD::VSRLI: {
6034 uint64_t ShiftVal = N.getConstantOperandVal(1);
6035 // Out of range bit shifts are guaranteed to be zero.
6036 if (NumBitsPerElt <= ShiftVal) {
6037 Mask.append(NumElts, SM_SentinelZero);
6038 return true;
6039 }
6040
6041 // We can only decode 'whole byte' bit shifts as shuffles.
6042 if ((ShiftVal % 8) != 0)
6043 break;
6044
6045 uint64_t ByteShift = ShiftVal / 8;
6046 unsigned NumBytes = NumSizeInBits / 8;
6047 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6048 Ops.push_back(N.getOperand(0));
6049
6050 // Clear mask to all zeros and insert the shifted byte indices.
6051 Mask.append(NumBytes, SM_SentinelZero);
6052
6053 if (X86ISD::VSHLI == Opcode) {
6054 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6055 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6056 Mask[i + j] = i + j - ByteShift;
6057 } else {
6058 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6059 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6060 Mask[i + j - ByteShift] = i + j;
6061 }
6062 return true;
6063 }
6064 case ISD::ZERO_EXTEND_VECTOR_INREG:
6065 case X86ISD::VZEXT: {
6066 // TODO - add support for VPMOVZX with smaller input vector types.
6067 SDValue Src = N.getOperand(0);
6068 MVT SrcVT = Src.getSimpleValueType();
6069 if (NumSizeInBits != SrcVT.getSizeInBits())
6070 break;
6071 DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
6072 Ops.push_back(Src);
6073 return true;
6074 }
6075 }
6076
6077 return false;
6078}
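As a concrete illustration of the ISD::AND/ANDNP case at the top of getFauxShuffleMask, the standalone sketch below (invented names, not the LLVM API) decodes a constant 0x00/0xFF per-byte mask into shuffle indices: a byte ANDed with 0xFF keeps its index, a byte ANDed with 0x00 becomes a known-zero sentinel, and any other mask byte means the node cannot be treated as a shuffle.

#include <cstdint>
#include <optional>
#include <vector>

constexpr int SentinelZero = -2; // stands in for SM_SentinelZero

std::optional<std::vector<int>>
decodeByteAndMask(const std::vector<uint8_t> &MaskBytes) {
  std::vector<int> ShuffleMask;
  for (int i = 0, e = (int)MaskBytes.size(); i != e; ++i) {
    if (MaskBytes[i] != 0x00 && MaskBytes[i] != 0xFF)
      return std::nullopt;   // mixed bits: not expressible as a shuffle
    ShuffleMask.push_back(MaskBytes[i] == 0x00 ? SentinelZero : i);
  }
  return ShuffleMask;        // e.g. {FF,00,FF,00} -> {0, Zero, 2, Zero}
}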
6079
6080/// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
6081static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6082 SmallVectorImpl<int> &Mask) {
6083 int MaskWidth = Mask.size();
6084 SmallVector<SDValue, 16> UsedInputs;
6085 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6086 int lo = UsedInputs.size() * MaskWidth;
6087 int hi = lo + MaskWidth;
6088
6089 // Strip UNDEF input usage.
6090 if (Inputs[i].isUndef())
6091 for (int &M : Mask)
6092 if ((lo <= M) && (M < hi))
6093 M = SM_SentinelUndef;
6094
6095 // Check for unused inputs.
6096 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6097 UsedInputs.push_back(Inputs[i]);
6098 continue;
6099 }
6100 for (int &M : Mask)
6101 if (lo <= M)
6102 M -= MaskWidth;
6103 }
6104 Inputs = UsedInputs;
6105}
6106
6107/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6108/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
6109/// remaining input indices in case we now have a unary shuffle and adjust the
6110/// inputs accordingly.
6111/// Returns true if the target shuffle mask was decoded.
6112static bool resolveTargetShuffleInputs(SDValue Op,
6113 SmallVectorImpl<SDValue> &Inputs,
6114 SmallVectorImpl<int> &Mask,
6115 SelectionDAG &DAG) {
6116 if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6117 if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
6118 return false;
6119
6120 resolveTargetShuffleInputsAndMask(Inputs, Mask);
6121 return true;
6122}
6123
6124/// Returns the scalar element that will make up the ith
6125/// element of the result of the vector shuffle.
6126static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6127 unsigned Depth) {
6128 if (Depth == 6)
6129 return SDValue(); // Limit search depth.
6130
6131 SDValue V = SDValue(N, 0);
6132 EVT VT = V.getValueType();
6133 unsigned Opcode = V.getOpcode();
6134
6135 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6136 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6137 int Elt = SV->getMaskElt(Index);
6138
6139 if (Elt < 0)
6140 return DAG.getUNDEF(VT.getVectorElementType());
6141
6142 unsigned NumElems = VT.getVectorNumElements();
6143 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6144 : SV->getOperand(1);
6145 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6146 }
6147
6148 // Recurse into target specific vector shuffles to find scalars.
6149 if (isTargetShuffle(Opcode)) {
6150 MVT ShufVT = V.getSimpleValueType();
6151 MVT ShufSVT = ShufVT.getVectorElementType();
6152 int NumElems = (int)ShufVT.getVectorNumElements();
6153 SmallVector<int, 16> ShuffleMask;
6154 SmallVector<SDValue, 16> ShuffleOps;
6155 bool IsUnary;
6156
6157 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6158 return SDValue();
6159
6160 int Elt = ShuffleMask[Index];
6161 if (Elt == SM_SentinelZero)
6162 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6163 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6164 if (Elt == SM_SentinelUndef)
6165 return DAG.getUNDEF(ShufSVT);
6166
 6167    assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6168 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6169 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6170 Depth+1);
6171 }
6172
6173 // Actual nodes that may contain scalar elements
6174 if (Opcode == ISD::BITCAST) {
6175 V = V.getOperand(0);
6176 EVT SrcVT = V.getValueType();
6177 unsigned NumElems = VT.getVectorNumElements();
6178
6179 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6180 return SDValue();
6181 }
6182
6183 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6184 return (Index == 0) ? V.getOperand(0)
6185 : DAG.getUNDEF(VT.getVectorElementType());
6186
6187 if (V.getOpcode() == ISD::BUILD_VECTOR)
6188 return V.getOperand(Index);
6189
6190 return SDValue();
6191}
6192
6193// Use PINSRB/PINSRW/PINSRD to create a build vector.
6194static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
6195 unsigned NumNonZero, unsigned NumZero,
6196 SelectionDAG &DAG,
6197 const X86Subtarget &Subtarget) {
6198 MVT VT = Op.getSimpleValueType();
6199 unsigned NumElts = VT.getVectorNumElements();
 6200  assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
 6201          ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
 6202         "Illegal vector insertion");
6203
6204 SDLoc dl(Op);
6205 SDValue V;
6206 bool First = true;
6207
6208 for (unsigned i = 0; i < NumElts; ++i) {
6209 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6210 if (!IsNonZero)
6211 continue;
6212
 6213    // If the build vector contains zeros or our first insertion is not the
 6214    // first index, then insert into a zero vector to break any register
 6215    // dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6216 if (First) {
6217 First = false;
6218 if (NumZero || 0 != i)
6219 V = getZeroVector(VT, Subtarget, DAG, dl);
6220 else {
 6221        assert(0 == i && "Expected insertion into zero-index");
6222 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6223 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6224 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6225 V = DAG.getBitcast(VT, V);
6226 continue;
6227 }
6228 }
6229 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
6230 DAG.getIntPtrConstant(i, dl));
6231 }
6232
6233 return V;
6234}
6235
6236/// Custom lower build_vector of v16i8.
6237static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6238 unsigned NumNonZero, unsigned NumZero,
6239 SelectionDAG &DAG,
6240 const X86Subtarget &Subtarget) {
6241 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6242 return SDValue();
6243
6244 // SSE4.1 - use PINSRB to insert each byte directly.
6245 if (Subtarget.hasSSE41())
6246 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6247 Subtarget);
6248
6249 SDLoc dl(Op);
6250 SDValue V;
6251 bool First = true;
6252
6253 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6254 for (unsigned i = 0; i < 16; ++i) {
6255 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6256 if (ThisIsNonZero && First) {
6257 if (NumZero)
6258 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6259 else
6260 V = DAG.getUNDEF(MVT::v8i16);
6261 First = false;
6262 }
6263
6264 if ((i & 1) != 0) {
6265 // FIXME: Investigate extending to i32 instead of just i16.
 6266      // FIXME: Investigate combining the first 4 bytes as an i32 instead.
6267 SDValue ThisElt, LastElt;
6268 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6269 if (LastIsNonZero) {
6270 LastElt =
6271 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6272 }
6273 if (ThisIsNonZero) {
6274 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6275 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6276 DAG.getConstant(8, dl, MVT::i8));
6277 if (LastIsNonZero)
6278 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6279 } else
6280 ThisElt = LastElt;
6281
6282 if (ThisElt) {
6283 if (1 == i) {
6284 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6285 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6286 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6287 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6288 V = DAG.getBitcast(MVT::v8i16, V);
6289 } else {
6290 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6291 DAG.getIntPtrConstant(i / 2, dl));
6292 }
6293 }
6294 }
6295 }
6296
6297 return DAG.getBitcast(MVT::v16i8, V);
6298}
6299
6300/// Custom lower build_vector of v8i16.
6301static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6302 unsigned NumNonZero, unsigned NumZero,
6303 SelectionDAG &DAG,
6304 const X86Subtarget &Subtarget) {
6305 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6306 return SDValue();
6307
6308 // Use PINSRW to insert each byte directly.
6309 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6310 Subtarget);
6311}
6312
6313/// Custom lower build_vector of v4i32 or v4f32.
6314static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6315 const X86Subtarget &Subtarget) {
6316 // Find all zeroable elements.
6317 std::bitset<4> Zeroable;
6318 for (int i=0; i < 4; ++i) {
6319 SDValue Elt = Op->getOperand(i);
6320 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6321 }
 6322  assert(Zeroable.size() - Zeroable.count() > 1 &&
 6323         "We expect at least two non-zero elements!");
6324
6325 // We only know how to deal with build_vector nodes where elements are either
6326 // zeroable or extract_vector_elt with constant index.
6327 SDValue FirstNonZero;
6328 unsigned FirstNonZeroIdx;
6329 for (unsigned i=0; i < 4; ++i) {
6330 if (Zeroable[i])
6331 continue;
6332 SDValue Elt = Op->getOperand(i);
6333 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6334 !isa<ConstantSDNode>(Elt.getOperand(1)))
6335 return SDValue();
6336 // Make sure that this node is extracting from a 128-bit vector.
6337 MVT VT = Elt.getOperand(0).getSimpleValueType();
6338 if (!VT.is128BitVector())
6339 return SDValue();
6340 if (!FirstNonZero.getNode()) {
6341 FirstNonZero = Elt;
6342 FirstNonZeroIdx = i;
6343 }
6344 }
6345
 6346  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6347 SDValue V1 = FirstNonZero.getOperand(0);
6348 MVT VT = V1.getSimpleValueType();
6349
6350 // See if this build_vector can be lowered as a blend with zero.
6351 SDValue Elt;
6352 unsigned EltMaskIdx, EltIdx;
6353 int Mask[4];
6354 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6355 if (Zeroable[EltIdx]) {
6356 // The zero vector will be on the right hand side.
6357 Mask[EltIdx] = EltIdx+4;
6358 continue;
6359 }
6360
6361 Elt = Op->getOperand(EltIdx);
 6362    // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
6363 EltMaskIdx = Elt.getConstantOperandVal(1);
6364 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6365 break;
6366 Mask[EltIdx] = EltIdx;
6367 }
6368
6369 if (EltIdx == 4) {
6370 // Let the shuffle legalizer deal with blend operations.
6371 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6372 if (V1.getSimpleValueType() != VT)
6373 V1 = DAG.getBitcast(VT, V1);
6374 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6375 }
6376
 6377  // See if we can lower this build_vector to an INSERTPS.
6378 if (!Subtarget.hasSSE41())
6379 return SDValue();
6380
6381 SDValue V2 = Elt.getOperand(0);
6382 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6383 V1 = SDValue();
6384
6385 bool CanFold = true;
6386 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6387 if (Zeroable[i])
6388 continue;
6389
6390 SDValue Current = Op->getOperand(i);
6391 SDValue SrcVector = Current->getOperand(0);
6392 if (!V1.getNode())
6393 V1 = SrcVector;
6394 CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6395 }
6396
6397 if (!CanFold)
6398 return SDValue();
6399
 6400  assert(V1.getNode() && "Expected at least two non-zero elements!");
6401 if (V1.getSimpleValueType() != MVT::v4f32)
6402 V1 = DAG.getBitcast(MVT::v4f32, V1);
6403 if (V2.getSimpleValueType() != MVT::v4f32)
6404 V2 = DAG.getBitcast(MVT::v4f32, V2);
6405
6406 // Ok, we can emit an INSERTPS instruction.
6407 unsigned ZMask = Zeroable.to_ulong();
6408
6409 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
 6410  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6411 SDLoc DL(Op);
6412 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6413 DAG.getIntPtrConstant(InsertPSMask, DL));
6414 return DAG.getBitcast(VT, Result);
6415}
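The InsertPSMask computed above follows the INSERTPS immediate layout: bits [7:6] select the source lane, bits [5:4] the destination lane, and bits [3:0] zero out destination lanes. A small sketch (helper name invented for illustration) reproducing that encoding:

#include <cstdint>

// Assemble an INSERTPS immediate from its three fields.
constexpr uint8_t insertPSImm(unsigned SrcLane, unsigned DstLane,
                              unsigned ZeroLanes) {
  return static_cast<uint8_t>(((SrcLane & 0x3) << 6) | ((DstLane & 0x3) << 4) |
                              (ZeroLanes & 0xF));
}

// Source lane 2 into destination lane 0, with lanes 2 and 3 zeroed.
static_assert(insertPSImm(2, 0, 0b1100) == 0x8C,
              "unexpected INSERTPS immediate");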
6416
6417/// Return a vector logical shift node.
6418static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6419 SelectionDAG &DAG, const TargetLowering &TLI,
6420 const SDLoc &dl) {
 6421  assert(VT.is128BitVector() && "Unknown type for VShift");
6422 MVT ShVT = MVT::v16i8;
6423 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6424 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6425 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
 6426  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6427 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6428 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6429}
6430
6431static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6432 SelectionDAG &DAG) {
6433
6434 // Check if the scalar load can be widened into a vector load. And if
6435 // the address is "base + cst" see if the cst can be "absorbed" into
6436 // the shuffle mask.
6437 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6438 SDValue Ptr = LD->getBasePtr();
6439 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6440 return SDValue();
6441 EVT PVT = LD->getValueType(0);
6442 if (PVT != MVT::i32 && PVT != MVT::f32)
6443 return SDValue();
6444
6445 int FI = -1;
6446 int64_t Offset = 0;
6447 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6448 FI = FINode->getIndex();
6449 Offset = 0;
6450 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6451 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6452 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6453 Offset = Ptr.getConstantOperandVal(1);
6454 Ptr = Ptr.getOperand(0);
6455 } else {
6456 return SDValue();
6457 }
6458
6459 // FIXME: 256-bit vector instructions don't require a strict alignment,
6460 // improve this code to support it better.
6461 unsigned RequiredAlign = VT.getSizeInBits()/8;
6462 SDValue Chain = LD->getChain();
6463 // Make sure the stack object alignment is at least 16 or 32.
6464 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6465 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6466 if (MFI.isFixedObjectIndex(FI)) {
6467 // Can't change the alignment. FIXME: It's possible to compute
6468 // the exact stack offset and reference FI + adjust offset instead.
 6469        // If someone *really* cares about this, that's the way to implement it.
6470 return SDValue();
6471 } else {
6472 MFI.setObjectAlignment(FI, RequiredAlign);
6473 }
6474 }
6475
 6476    // (Offset % 16 or 32) must be a multiple of 4. The address is then
 6477    // Ptr + (Offset & ~15).
6478 if (Offset < 0)
6479 return SDValue();
6480 if ((Offset % RequiredAlign) & 3)
6481 return SDValue();
6482 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6483 if (StartOffset) {
6484 SDLoc DL(Ptr);
6485 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6486 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6487 }
6488
6489 int EltNo = (Offset - StartOffset) >> 2;
6490 unsigned NumElems = VT.getVectorNumElements();
6491
6492 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6493 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6494 LD->getPointerInfo().getWithOffset(StartOffset));
6495
6496 SmallVector<int, 8> Mask(NumElems, EltNo);
6497
6498 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6499 }
6500
6501 return SDValue();
6502}
6503
6504/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6505/// elements can be replaced by a single large load which has the same value as
6506/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6507///
6508/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6509static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6510 const SDLoc &DL, SelectionDAG &DAG,
6511 const X86Subtarget &Subtarget,
6512 bool isAfterLegalize) {
6513 unsigned NumElems = Elts.size();
6514
6515 int LastLoadedElt = -1;
6516 SmallBitVector LoadMask(NumElems, false);
6: Calling constructor for 'SmallBitVector'
10: Returning from constructor for 'SmallBitVector'
6517 SmallBitVector ZeroMask(NumElems, false);
6518 SmallBitVector UndefMask(NumElems, false);
6519
6520 // For each element in the initializer, see if we've found a load, zero or an
6521 // undef.
6522 for (unsigned i = 0; i < NumElems; ++i) {
11: Loop condition is true. Entering loop body
6523 SDValue Elt = peekThroughBitcasts(Elts[i]);
6524 if (!Elt.getNode())
12: Assuming the condition is false
13: Taking false branch
6525 return SDValue();
6526
6527 if (Elt.isUndef())
14: Taking false branch
6528 UndefMask[i] = true;
6529 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
15: Assuming the condition is false
16: Assuming the condition is false
17: Taking false branch
6530 ZeroMask[i] = true;
6531 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
18: Taking false branch
6532 LoadMask[i] = true;
6533 LastLoadedElt = i;
6534 // Each loaded element must be the correct fractional portion of the
6535 // requested vector load.
6536 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6537 return SDValue();
6538 } else
6539 return SDValue();
6540 }
 6541  assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
 6542         "Incomplete element masks");
6543
6544 // Handle Special Cases - all undef or undef/zero.
6545 if (UndefMask.count() == NumElems)
6546 return DAG.getUNDEF(VT);
6547
6548 // FIXME: Should we return this as a BUILD_VECTOR instead?
6549 if ((ZeroMask | UndefMask).count() == NumElems)
6550 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6551 : DAG.getConstantFP(0.0, DL, VT);
6552
6553 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6554 int FirstLoadedElt = LoadMask.find_first();
6555 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6556 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6557 EVT LDBaseVT = EltBase.getValueType();
6558
6559 // Consecutive loads can contain UNDEFS but not ZERO elements.
 6560  // Consecutive loads with UNDEF and ZERO elements require an
 6561  // additional shuffle stage to clear the ZERO elements.
6562 bool IsConsecutiveLoad = true;
6563 bool IsConsecutiveLoadWithZeros = true;
6564 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6565 if (LoadMask[i]) {
6566 SDValue Elt = peekThroughBitcasts(Elts[i]);
6567 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6568 if (!DAG.areNonVolatileConsecutiveLoads(
6569 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6570 i - FirstLoadedElt)) {
6571 IsConsecutiveLoad = false;
6572 IsConsecutiveLoadWithZeros = false;
6573 break;
6574 }
6575 } else if (ZeroMask[i]) {
6576 IsConsecutiveLoad = false;
6577 }
6578 }
6579
6580 SmallVector<LoadSDNode *, 8> Loads;
6581 for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
6582 if (LoadMask[i])
6583 Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
6584
6585 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6586 auto MMOFlags = LDBase->getMemOperand()->getFlags();
 6587    assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
 6588           "Cannot merge volatile loads.");
6589 SDValue NewLd =
6590 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6591 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6592 for (auto *LD : Loads)
6593 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
6594 return NewLd;
6595 };
6596
6597 // LOAD - all consecutive load/undefs (must start/end with a load).
6598 // If we have found an entire vector of loads and undefs, then return a large
6599 // load of the entire vector width starting at the base pointer.
6600 // If the vector contains zeros, then attempt to shuffle those elements.
6601 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6602 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
 6603    assert(LDBase && "Did not find base load for merging consecutive loads");
6604 EVT EltVT = LDBase->getValueType(0);
6605 // Ensure that the input vector size for the merged loads matches the
6606 // cumulative size of the input elements.
6607 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6608 return SDValue();
6609
6610 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6611 return SDValue();
6612
6613 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6614 // will lower to regular temporal loads and use the cache.
6615 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6616 VT.is256BitVector() && !Subtarget.hasInt256())
6617 return SDValue();
6618
6619 if (IsConsecutiveLoad)
6620 return CreateLoad(VT, LDBase);
6621
6622 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6623 // vector and a zero vector to clear out the zero elements.
6624 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6625 SmallVector<int, 4> ClearMask(NumElems, -1);
6626 for (unsigned i = 0; i < NumElems; ++i) {
6627 if (ZeroMask[i])
6628 ClearMask[i] = i + NumElems;
6629 else if (LoadMask[i])
6630 ClearMask[i] = i;
6631 }
6632 SDValue V = CreateLoad(VT, LDBase);
6633 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6634 : DAG.getConstantFP(0.0, DL, VT);
6635 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6636 }
6637 }
6638
6639 int LoadSize =
6640 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6641
6642 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6643 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6644 (LoadSize == 32 || LoadSize == 64) &&
6645 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6646 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6647 : MVT::getIntegerVT(LoadSize);
6648 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6649 if (TLI.isTypeLegal(VecVT)) {
6650 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6651 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6652 SDValue ResNode =
6653 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6654 LDBase->getPointerInfo(),
6655 LDBase->getAlignment(),
6656 false/*isVolatile*/, true/*ReadMem*/,
6657 false/*WriteMem*/);
6658 for (auto *LD : Loads)
6659 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
6660 return DAG.getBitcast(VT, ResNode);
6661 }
6662 }
6663
6664 return SDValue();
6665}
19: Potential leak of memory pointed to by 'LoadMask.X'
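Note on the warning above: SmallBitVector keeps its bits inline for small sizes and switches to a heap allocation (reached through the 'X' member) once NumElems exceeds the inline capacity. That storage is released by ~SmallBitVector when LoadMask goes out of scope at the closing brace flagged on line 6665; the analyzer models the allocating constructor (steps 6-10) but apparently not the destructor, so this report is most likely a false positive rather than a real leak. The sketch below is illustrative only (it is not part of the LLVM sources) and just shows the ownership pattern involved:

#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;

// The constructor may heap-allocate storage for the bits; the destructor
// frees it on every return path, including early returns.
bool anyLoadedElement(unsigned NumElems) {
  SmallBitVector LoadMask(NumElems, false); // may allocate when NumElems is large
  LoadMask.set(0);                          // mark element 0 as a load
  return LoadMask.any();                    // storage freed here by ~SmallBitVector
}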
6666
6667static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6668 unsigned SplatBitSize, LLVMContext &C) {
6669 unsigned ScalarSize = VT.getScalarSizeInBits();
6670 unsigned NumElm = SplatBitSize / ScalarSize;
6671
6672 SmallVector<Constant *, 32> ConstantVec;
6673 for (unsigned i = 0; i < NumElm; i++) {
6674 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6675 Constant *Const;
6676 if (VT.isFloatingPoint()) {
6677 if (ScalarSize == 32) {
6678 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6679 } else {
 6680        assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6681 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
6682 }
6683 } else
6684 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6685 ConstantVec.push_back(Const);
6686 }
6687 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6688}
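A brief usage sketch of the extractBits loop above (illustrative only, function name invented): splitting a 64-bit splat value into two 32-bit scalars, where element 0 takes the low bits.

#include "llvm/ADT/APInt.h"
using namespace llvm;

void splitSplatExample() {
  APInt Splat(64, 0x0000000100000002ULL);
  APInt Elt0 = Splat.extractBits(32, 0);  // low 32 bits  -> 2 (element 0)
  APInt Elt1 = Splat.extractBits(32, 32); // high 32 bits -> 1 (element 1)
  (void)Elt0;
  (void)Elt1;
}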
6689
6690static bool isUseOfShuffle(SDNode *N) {
6691 for (auto *U : N->uses()) {
6692 if (isTargetShuffle(U->getOpcode()))
6693 return true;
6694 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6695 return isUseOfShuffle(U);
6696 }
6697 return false;
6698}
6699
 6700 // Check if the current build_vector node is a zero extended vector.
 6701 // If so, return the value extended.
 6702 // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
 6703 // NumElt - return the number of zero extended identical values.
 6704 // EltType - return the type of the value including the zero extension.
6705static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
6706 unsigned &NumElt, MVT &EltType) {
6707 SDValue ExtValue = Op->getOperand(0);
6708 unsigned NumElts = Op->getNumOperands();
6709 unsigned Delta = NumElts;
6710
6711 for (unsigned i = 1; i < NumElts; i++) {
6712 if (Op->getOperand(i) == ExtValue) {
6713 Delta = i;
6714 break;
6715 }
6716 if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
6717 return SDValue();
6718 }
6719 if (!isPowerOf2_32(Delta) || Delta == 1)
6720 return SDValue();
6721
6722 for (unsigned i = Delta; i < NumElts; i++) {
6723 if (i % Delta == 0) {
6724 if (Op->getOperand(i) != ExtValue)
6725 return SDValue();
6726 } else if (!(isNullConstant(Op->getOperand(i)) ||
6727 Op->getOperand(i).isUndef()))
6728 return SDValue();
6729 }
6730 unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
6731 unsigned ExtVTSize = EltSize * Delta;
6732 EltType = MVT::getIntegerVT(ExtVTSize);
6733 NumElt = NumElts / Delta;
6734 return ExtValue;
6735}
6736
6737/// Attempt to use the vbroadcast instruction to generate a splat value
6738/// from a splat BUILD_VECTOR which uses:
6739/// a. A single scalar load, or a constant.
6740/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6741///
6742/// The VBROADCAST node is returned when a pattern is found,
6743/// or SDValue() otherwise.
6744static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
6745 const X86Subtarget &Subtarget,
6746 SelectionDAG &DAG) {
6747 // VBROADCAST requires AVX.
6748 // TODO: Splats could be generated for non-AVX CPUs using SSE
6749 // instructions, but there's less potential gain for only 128-bit vectors.
6750 if (!Subtarget.hasAVX())
6751 return SDValue();
6752
6753 MVT VT = BVOp->getSimpleValueType(0);
6754 SDLoc dl(BVOp);
6755
 6756  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
 6757         "Unsupported vector type for broadcast.");
6758
6759 BitVector UndefElements;
6760 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6761
6762 // Attempt to use VBROADCASTM
 6763  // From this pattern:
6764 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
6765 // b. t1 = (build_vector t0 t0)
6766 //
6767 // Create (VBROADCASTM v2i1 X)
6768 if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
6769 MVT EltType = VT.getScalarType();
6770 unsigned NumElts = VT.getVectorNumElements();
6771 SDValue BOperand;
6772 SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
6773 if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
6774 (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
6775 Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
6776 if (ZeroExtended)
6777 BOperand = ZeroExtended.getOperand(0);
6778 else
6779 BOperand = Ld.getOperand(0).getOperand(0);
6780 if (BOperand.getValueType().isVector() &&
6781 BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
6782 if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
6783 NumElts == 8)) || // for broadcastmb2q
6784 (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
6785 NumElts == 16))) { // for broadcastmw2d
6786 SDValue Brdcst =
6787 DAG.getNode(X86ISD::VBROADCASTM, dl,
6788 MVT::getVectorVT(EltType, NumElts), BOperand);
6789 return DAG.getBitcast(VT, Brdcst);
6790 }
6791 }
6792 }
6793 }
6794
6795 // We need a splat of a single value to use broadcast, and it doesn't
6796 // make any sense if the value is only in one element of the vector.
6797 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6798 APInt SplatValue, Undef;
6799 unsigned SplatBitSize;
6800 bool HasUndef;
6801 // Check if this is a repeated constant pattern suitable for broadcasting.
6802 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6803 SplatBitSize > VT.getScalarSizeInBits() &&
6804 SplatBitSize < VT.getSizeInBits()) {
6805 // Avoid replacing with broadcast when it's a use of a shuffle
6806 // instruction to preserve the present custom lowering of shuffles.
6807 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6808 return SDValue();
 6809      // Replace BUILD_VECTOR with a broadcast of the repeated constants.
6810 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6811 LLVMContext *Ctx = DAG.getContext();
6812 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6813 if (Subtarget.hasAVX()) {
6814 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6815 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6816 // Splatted value can fit in one INTEGER constant in constant pool.
6817 // Load the constant and broadcast it.
6818 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6819 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6820 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6821 SDValue CP = DAG.getConstantPool(C, PVT);
6822 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6823
6824 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6825 Ld = DAG.getLoad(
6826 CVT, dl, DAG.getEntryNode(), CP,
6827 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6828 Alignment);
6829 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6830 MVT::getVectorVT(CVT, Repeat), Ld);
6831 return DAG.getBitcast(VT, Brdcst);
6832 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6833 // Splatted value can fit in one FLOAT constant in constant pool.
6834 // Load the constant and broadcast it.
 6835          // AVX has support for 32 and 64 bit broadcasts for floats only.
 6836          // There is no 64-bit integer broadcast on a 32-bit subtarget.
6837 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6838 // Lower the splat via APFloat directly, to avoid any conversion.
6839 Constant *C =
6840 SplatBitSize == 32
6841 ? ConstantFP::get(*Ctx,
6842 APFloat(APFloat::IEEEsingle(), SplatValue))
6843 : ConstantFP::get(*Ctx,
6844 APFloat(APFloat::IEEEdouble(), SplatValue));
6845 SDValue CP = DAG.getConstantPool(C, PVT);
6846 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6847
6848 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6849 Ld = DAG.getLoad(
6850 CVT, dl, DAG.getEntryNode(), CP,
6851 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6852 Alignment);
6853 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6854 MVT::getVectorVT(CVT, Repeat), Ld);
6855 return DAG.getBitcast(VT, Brdcst);
6856 } else if (SplatBitSize > 64) {
6857 // Load the vector of constants and broadcast it.
6858 MVT CVT = VT.getScalarType();
6859 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6860 *Ctx);
6861 SDValue VCP = DAG.getConstantPool(VecC, PVT);
6862 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6863 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6864 Ld = DAG.getLoad(
6865 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6866 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6867 Alignment);
6868 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6869 return DAG.getBitcast(VT, Brdcst);
6870 }
6871 }
6872 }
6873 return SDValue();
6874 }
6875
6876 bool ConstSplatVal =
6877 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6878
6879 // Make sure that all of the users of a non-constant load are from the
6880 // BUILD_VECTOR node.
6881 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6882 return SDValue();
6883
6884 unsigned ScalarSize = Ld.getValueSizeInBits();
6885 bool IsGE256 = (VT.getSizeInBits() >= 256);
6886
6887 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6888 // instruction to save 8 or more bytes of constant pool data.
6889 // TODO: If multiple splats are generated to load the same constant,
6890 // it may be detrimental to overall size. There needs to be a way to detect
6891 // that condition to know if this is truly a size win.
6892 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
6893
6894 // Handle broadcasting a single constant scalar from the constant pool
6895 // into a vector.
6896 // On Sandybridge (no AVX2), it is still better to load a constant vector
6897 // from the constant pool and not to broadcast it from a scalar.
6898 // But override that restriction when optimizing for size.
6899 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6900 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6901 EVT CVT = Ld.getValueType();
 6902    assert(!CVT.isVector() && "Must not broadcast a vector type");
6903
6904 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6905 // For size optimization, also splat v2f64 and v2i64, and for size opt
6906 // with AVX2, also splat i8 and i16.
6907 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6908 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6909 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6910 const Constant *C = nullptr;
6911 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6912 C = CI->getConstantIntValue();
6913 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6914 C = CF->getConstantFPValue();
6915
6916 assert(C && "Invalid constant type");
6917
6918 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6919 SDValue CP =
6920 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6921 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6922 Ld = DAG.getLoad(
6923 CVT, dl, DAG.getEntryNode(), CP,
6924 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6925 Alignment);
6926
6927 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6928 }
6929 }
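// Illustrative example of the path above (a sketch, assuming AVX2): a
// build_vector that splats the f32 constant 1.0 is lowered to
//   (X86ISD::VBROADCAST (load <constant pool: 1.0f>))
// i.e. a 4-byte constant-pool entry plus a single vbroadcastss, instead of
// materializing the full-width vector constant.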
6930
6931 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6932
6933 // Handle AVX2 in-register broadcasts.
6934 if (!IsLoad && Subtarget.hasInt256() &&
6935 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6936 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6937
6938 // The scalar source must be a normal load.
6939 if (!IsLoad)
6940 return SDValue();
6941
6942 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6943 (Subtarget.hasVLX() && ScalarSize == 64))
6944 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6945
6946 // The integer check is needed for the 64-bit into 128-bit case, so it doesn't
6947 // match f64, since there is no vbroadcastsd for xmm registers.
6948 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
6949 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6950 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6951 }
6952
6953 // Unsupported broadcast.
6954 return SDValue();
6955}
6956
6957/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6958/// underlying vector and index.
6959///
6960/// Modifies \p ExtractedFromVec to the real vector and returns the real
6961/// index.
6962static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6963 SDValue ExtIdx) {
6964 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6965 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6966 return Idx;
6967
6968 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6969 // lowered this:
6970 // (extract_vector_elt (v8f32 %1), Constant<6>)
6971 // to:
6972 // (extract_vector_elt (vector_shuffle<2,u,u,u>
6973 // (extract_subvector (v8f32 %0), Constant<4>),
6974 // undef)
6975 // Constant<0>)
6976 // In this case the vector is the extract_subvector expression and the index
6977 // is 2, as specified by the shuffle.
6978 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6979 SDValue ShuffleVec = SVOp->getOperand(0);
6980 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6981 assert(ShuffleVecVT.getVectorElementType() ==
6982 ExtractedFromVec.getSimpleValueType().getVectorElementType());
6983
6984 int ShuffleIdx = SVOp->getMaskElt(Idx);
6985 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6986 ExtractedFromVec = ShuffleVec;
6987 return ShuffleIdx;
6988 }
6989 return Idx;
6990}
6991
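// Illustrative example for buildFromShuffleMostly below (a sketch with
// hypothetical values %A, %B and %x): the build_vector
//   (build_vector (extract_elt %A, 1), (extract_elt %A, 0), %x,
//                 (extract_elt %B, 3))
// is rebuilt as
//   (insert_vector_elt (vector_shuffle<1,0,u,7> %A, %B), %x, 2)
// provided INSERT_VECTOR_ELT is legal or custom for the vector type.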
6992static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6993 MVT VT = Op.getSimpleValueType();
6994
6995 // Skip if insert_vec_elt is not supported.
6996 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6997 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6998 return SDValue();
6999
7000 SDLoc DL(Op);
7001 unsigned NumElems = Op.getNumOperands();
7002
7003 SDValue VecIn1;
7004 SDValue VecIn2;
7005 SmallVector<unsigned, 4> InsertIndices;
7006 SmallVector<int, 8> Mask(NumElems, -1);
7007
7008 for (unsigned i = 0; i != NumElems; ++i) {
7009 unsigned Opc = Op.getOperand(i).getOpcode();
7010
7011 if (Opc == ISD::UNDEF)
7012 continue;
7013
7014 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7015 // Quit if more than 1 element needs inserting.
7016 if (InsertIndices.size() > 1)
7017 return SDValue();
7018
7019 InsertIndices.push_back(i);
7020 continue;
7021 }
7022
7023 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7024 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7025
7026 // Quit if non-constant index.
7027 if (!isa<ConstantSDNode>(ExtIdx))
7028 return SDValue();
7029 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7030
7031 // Quit if extracted from vector of different type.
7032 if (ExtractedFromVec.getValueType() != VT)
7033 return SDValue();
7034
7035 if (!VecIn1.getNode())
7036 VecIn1 = ExtractedFromVec;
7037 else if (VecIn1 != ExtractedFromVec) {
7038 if (!VecIn2.getNode())
7039 VecIn2 = ExtractedFromVec;
7040 else if (VecIn2 != ExtractedFromVec)
7041 // Quit if more than 2 vectors to shuffle
7042 return SDValue();
7043 }
7044
7045 if (ExtractedFromVec == VecIn1)
7046 Mask[i] = Idx;
7047 else if (ExtractedFromVec == VecIn2)
7048 Mask[i] = Idx + NumElems;
7049 }
7050
7051 if (!VecIn1.getNode())
7052 return SDValue();
7053
7054 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7055 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7056
7057 for (unsigned Idx : InsertIndices)
7058 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7059 DAG.getIntPtrConstant(Idx, DL));
7060
7061 return NV;
7062}
7063
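// ConvertI1VectorToInteger below packs a constant vXi1 build_vector into a
// single integer immediate, one bit per element. Worked example (a sketch):
// the v8i1 build_vector <1,0,1,0,1,1,0,0> becomes the i8 constant 0x35
// (bits 0, 2, 4 and 5 set), which callers may then bitcast back to the mask
// type.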
7064static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
7065 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
7066 Op.getScalarValueSizeInBits() == 1 &&
7067 "Can not convert non-constant vector");
7068 uint64_t Immediate = 0;
7069 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7070 SDValue In = Op.getOperand(idx);
7071 if (!In.isUndef())
7072 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7073 }
7074 SDLoc dl(Op);
7075 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
7076 return DAG.getConstant(Immediate, dl, VT);
7077}
7078// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7079SDValue
7080X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
7081
7082 MVT VT = Op.getSimpleValueType();
7083 assert((VT.getVectorElementType() == MVT::i1) &&
7084 "Unexpected type in LowerBUILD_VECTORvXi1!");
7085
7086 SDLoc dl(Op);
7087 if (ISD::isBuildVectorAllZeros(Op.getNode()))
7088 return Op;
7089
7090 if (ISD::isBuildVectorAllOnes(Op.getNode()))
7091 return Op;
7092
7093 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
7094 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7095 // Split the pieces.
7096 SDValue Lower =
7097 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
7098 SDValue Upper =
7099 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
7100 // We have to manually lower both halves so getNode doesn't try to
7101 // reassemble the build_vector.
7102 Lower = LowerBUILD_VECTORvXi1(Lower, DAG);
7103 Upper = LowerBUILD_VECTORvXi1(Upper, DAG);
7104 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
7105 }
7106 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
7107 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7108 return DAG.getBitcast(VT, Imm);
7109 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7110 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7111 DAG.getIntPtrConstant(0, dl));
7112 }
7113
7114 // Vector has one or more non-const elements
7115 uint64_t Immediate = 0;
7116 SmallVector<unsigned, 16> NonConstIdx;
7117 bool IsSplat = true;
7118 bool HasConstElts = false;
7119 int SplatIdx = -1;
7120 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7121 SDValue In = Op.getOperand(idx);
7122 if (In.isUndef())
7123 continue;
7124 if (!isa<ConstantSDNode>(In))
7125 NonConstIdx.push_back(idx);
7126 else {
7127 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7128 HasConstElts = true;
7129 }
7130 if (SplatIdx < 0)
7131 SplatIdx = idx;
7132 else if (In != Op.getOperand(SplatIdx))
7133 IsSplat = false;
7134 }
7135
7136 // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
7137 if (IsSplat)
7138 return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
7139 DAG.getConstant(1, dl, VT),
7140 DAG.getConstant(0, dl, VT));
7141
7142 // insert elements one by one
7143 SDValue DstVec;
7144 SDValue Imm;
7145 if (Immediate) {
7146 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7147 Imm = DAG.getConstant(Immediate, dl, ImmVT);
7148 }
7149 else if (HasConstElts)
7150 Imm = DAG.getConstant(0, dl, VT);
7151 else
7152 Imm = DAG.getUNDEF(VT);
7153 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7154 DstVec = DAG.getBitcast(VT, Imm);
7155 else {
7156 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7157 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7158 DAG.getIntPtrConstant(0, dl));
7159 }
7160
7161 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7162 unsigned InsertIdx = NonConstIdx[i];
7163 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7164 Op.getOperand(InsertIdx),
7165 DAG.getIntPtrConstant(InsertIdx, dl));
7166 }
7167 return DstVec;
7168}
7169
7170/// \brief Return true if \p N implements a horizontal binop, and return the
7171/// operands of that horizontal binop in V0 and V1.
7172///
7173/// This is a helper function of LowerToHorizontalOp().
7174/// This function checks that the build_vector \p N in input implements a
7175/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7176/// operation to match.
7177/// For example, if \p Opcode is equal to ISD::ADD, then this function
7178/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7179/// is equal to ISD::SUB, then this function checks if this is a horizontal
7180/// arithmetic sub.
7181///
7182/// This function only analyzes elements of \p N whose indices are
7183/// in range [BaseIdx, LastIdx).
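///
/// Illustrative example (a sketch with hypothetical inputs %A and %B): for a
/// v4f32 build_vector of the form
///   (fadd (extract_elt %A, 0), (extract_elt %A, 1)),
///   (fadd (extract_elt %A, 2), (extract_elt %A, 3)),
///   (fadd (extract_elt %B, 0), (extract_elt %B, 1)),
///   (fadd (extract_elt %B, 2), (extract_elt %B, 3))
/// called with Opcode == ISD::FADD, BaseIdx == 0 and LastIdx == 4, this
/// returns true with V0 = %A and V1 = %B, which the caller can then lower to
/// a single X86ISD::FHADD node.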
7184static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7185 SelectionDAG &DAG,
7186 unsigned BaseIdx, unsigned LastIdx,
7187 SDValue &V0, SDValue &V1) {
7188 EVT VT = N->getValueType(0);
7189
7190 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7191 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7192 "Invalid Vector in input!");
7193
7194 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7195 bool CanFold = true;
7196 unsigned ExpectedVExtractIdx = BaseIdx;
7197 unsigned NumElts = LastIdx - BaseIdx;
7198 V0 = DAG.getUNDEF(VT);
7199 V1 = DAG.getUNDEF(VT);
7200
7201 // Check if N implements a horizontal binop.
7202 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7203 SDValue Op = N->getOperand(i + BaseIdx);
7204
7205 // Skip UNDEFs.
7206 if (Op->isUndef()) {
7207 // Update the expected vector extract index.
7208 if (i * 2 == NumElts)
7209 ExpectedVExtractIdx = BaseIdx;
7210 ExpectedVExtractIdx += 2;
7211 continue;
7212 }
7213
7214 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7215
7216 if (!CanFold)
7217 break;
7218
7219 SDValue Op0 = Op.getOperand(0);
7220 SDValue Op1 = Op.getOperand(1);
7221
7222 // Try to match the following pattern:
7223 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7224 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7225 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7226 Op0.getOperand(0) == Op1.getOperand(0) &&
7227 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7228 isa<ConstantSDNode>(Op1.getOperand(1)));
7229 if (!CanFold)
7230 break;
7231
7232 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7233 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7234
7235 if (i * 2 < NumElts) {
7236 if (V0.isUndef()) {
7237 V0 = Op0.getOperand(0);
7238 if (V0.getValueType() != VT)
7239 return false;
7240 }
7241 } else {
7242 if (V1.isUndef()) {
7243 V1 = Op0.getOperand(0);
7244 if (V1.getValueType() != VT)
7245 return false;
7246 }
7247 if (i * 2 == NumElts)
7248 ExpectedVExtractIdx = BaseIdx;
7249 }
7250
7251 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7252 if (I0 == ExpectedVExtractIdx)
7253 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7254 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7255 // Try to match the following dag sequence:
7256 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7257 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7258 } else
7259 CanFold = false;
7260
7261 ExpectedVExtractIdx += 2;
7262 }
7263
7264 return CanFold;
7265}
7266
7267/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
7268/// a concat_vector.
7269///
7270/// This is a helper function of LowerToHorizontalOp().
7271/// This function expects two 256-bit vectors called V0 and V1.
7272/// At first, each vector is split into two separate 128-bit vectors.
7273/// Then, the resulting 128-bit vectors are used to implement two
7274/// horizontal binary operations.
7275///
7276/// The kind of horizontal binary operation is defined by \p X86Opcode.
7277///
7278/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7279/// the two new horizontal binop.
7280/// When Mode is set, the first horizontal binop dag node would take as input
7281/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7282/// horizontal binop dag node would take as input the lower 128-bit of V1
7283/// and the upper 128-bit of V1.
7284/// Example:
7285/// HADD V0_LO, V0_HI
7286/// HADD V1_LO, V1_HI
7287///
7288/// Otherwise, the first horizontal binop dag node takes as input the lower
7289/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7290/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7291/// Example:
7292/// HADD V0_LO, V1_LO
7293/// HADD V0_HI, V1_HI
7294///
7295/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7296/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7297/// the upper 128-bits of the result.
7298static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7299 const SDLoc &DL, SelectionDAG &DAG,
7300 unsigned X86Opcode, bool Mode,
7301 bool isUndefLO, bool isUndefHI) {
7302 MVT VT = V0.getSimpleValueType();
7303 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7304 "Invalid nodes in input!");
7305
7306 unsigned NumElts = VT.getVectorNumElements();
7307 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7308 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7309 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7310 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7311 MVT NewVT = V0_LO.getSimpleValueType();
7312
7313 SDValue LO = DAG.getUNDEF(NewVT);
7314 SDValue HI = DAG.getUNDEF(NewVT);
7315
7316 if (Mode) {
7317 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7318 if (!isUndefLO && !V0->isUndef())
7319 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7320 if (!isUndefHI && !V1->isUndef())
7321 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7322 } else {
7323 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7324 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7325 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7326
7327 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7328 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7329 }
7330
7331 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7332}
7333
7334/// Returns true iff \p BV builds a vector with the result equivalent to
7335/// the result of an ADDSUB operation.
7336/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
7337/// are written to the parameters \p Opnd0 and \p Opnd1.
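///
/// Illustrative example (a sketch with hypothetical inputs %A and %B): the
/// v4f32 build_vector
///   (fsub (extract_elt %A, 0), (extract_elt %B, 0)),
///   (fadd (extract_elt %A, 1), (extract_elt %B, 1)),
///   (fsub (extract_elt %A, 2), (extract_elt %B, 2)),
///   (fadd (extract_elt %A, 3), (extract_elt %B, 3))
/// is recognized as ADDSUB(%A, %B), so Opnd0 = %A and Opnd1 = %B; this
/// matches the lane behaviour of addsubps (subtract in even lanes, add in
/// odd lanes).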
7338static bool isAddSub(const BuildVectorSDNode *BV,
7339 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7340 SDValue &Opnd0, SDValue &Opnd1) {
7341
7342 MVT VT = BV->getSimpleValueType(0);
7343 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
7344 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
7345 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
7346 return false;
7347
7348 unsigned NumElts = VT.getVectorNumElements();
7349 SDValue InVec0 = DAG.getUNDEF(VT);
7350 SDValue InVec1 = DAG.getUNDEF(VT);
7351
7352 // Odd-numbered elements in the input build vector are obtained from
7353 // adding two integer/float elements.
7354 // Even-numbered elements in the input build vector are obtained from
7355 // subtracting two integer/float elements.
7356 unsigned ExpectedOpcode = ISD::FSUB;
7357 unsigned NextExpectedOpcode = ISD::FADD;
7358 bool AddFound = false;
7359 bool SubFound = false;
7360
7361 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7362 SDValue Op = BV->getOperand(i);
7363
7364 // Skip 'undef' values.
7365 unsigned Opcode = Op.getOpcode();
7366 if (Opcode == ISD::UNDEF) {
7367 std::swap(ExpectedOpcode, NextExpectedOpcode);
7368 continue;
7369 }
7370
7371 // Early exit if we found an unexpected opcode.
7372 if (Opcode != ExpectedOpcode)
7373 return false;
7374
7375 SDValue Op0 = Op.getOperand(0);
7376 SDValue Op1 = Op.getOperand(1);
7377
7378 // Try to match the following pattern:
7379 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7380 // Early exit if we cannot match that sequence.
7381 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7382 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7383 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7384 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7385 Op0.getOperand(1) != Op1.getOperand(1))
7386 return false;
7387
7388 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7389 if (I0 != i)
7390 return false;
7391
7392 // We found a valid add/sub node. Update the information accordingly.
7393 if (i & 1)
7394 AddFound = true;
7395 else
7396 SubFound = true;
7397
7398 // Update InVec0 and InVec1.
7399 if (InVec0.isUndef()) {
7400 InVec0 = Op0.getOperand(0);
7401 if (InVec0.getSimpleValueType() != VT)
7402 return false;
7403 }
7404 if (InVec1.isUndef()) {
7405 InVec1 = Op1.getOperand(0);
7406 if (InVec1.getSimpleValueType() != VT)
7407 return false;
7408 }
7409
7410 // Make sure that the operands of each add/sub node always
7411 // come from the same pair of vectors.
7412 if (InVec0 != Op0.getOperand(0)) {
7413 if (ExpectedOpcode == ISD::FSUB)
7414 return false;
7415
7416 // FADD is commutable. Try to commute the operands
7417 // and then test again.
7418 std::swap(Op0, Op1);
7419 if (InVec0 != Op0.getOperand(0))
7420 return false;
7421 }
7422
7423 if (InVec1 != Op1.getOperand(0))
7424 return false;
7425
7426 // Update the pair of expected opcodes.
7427 std::swap(ExpectedOpcode, NextExpectedOpcode);
7428 }
7429
7430 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7431 if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
7432 return false;
7433
7434 Opnd0 = InVec0;
7435 Opnd1 = InVec1;
7436 return true;
7437}
7438
7439/// Returns true if it is possible to fold MUL and an idiom that has already been
7440/// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
7441/// If (and only if) true is returned, the operands of FMADDSUB are written to
7442/// parameters \p Opnd0, \p Opnd1, \p Opnd2.
7443///
7444/// Prior to calling this function it should be known that there is some
7445/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7446/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7447/// before replacement of such SDNode with ADDSUB operation. Thus the number
7448/// of \p Opnd0 uses is expected to be equal to 2.
7449/// For example, this function may be called for the following IR:
7450/// %AB = fmul fast <2 x double> %A, %B
7451/// %Sub = fsub fast <2 x double> %AB, %C
7452/// %Add = fadd fast <2 x double> %AB, %C
7453/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7454/// <2 x i32> <i32 0, i32 3>
7455/// There is a def for %Addsub here, which potentially can be replaced by
7456/// X86ISD::ADDSUB operation:
7457/// %Addsub = X86ISD::ADDSUB %AB, %C
7458/// and such ADDSUB can further be replaced with FMADDSUB:
7459/// %Addsub = FMADDSUB %A, %B, %C.
7460///
7461/// The main reason why this method is called before the replacement of the
7462/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7463/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7464/// FMADDSUB is.
7465static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
7466 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
7467 if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
7468 !Subtarget.hasAnyFMA())
7469 return false;
7470
7471 // FIXME: These checks must match the similar ones in
7472 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7473 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7474 // or MUL + ADDSUB to FMADDSUB.
7475 const TargetOptions &Options = DAG.getTarget().Options;
7476 bool AllowFusion =
7477 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7478 if (!AllowFusion)
7479 return false;
7480
7481 Opnd2 = Opnd1;
7482 Opnd1 = Opnd0.getOperand(1);
7483 Opnd0 = Opnd0.getOperand(0);
7484
7485 return true;
7486}
7487
7488/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation
7489/// into an X86ISD::ADDSUB or X86ISD::FMADDSUB node.
7490static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7491 const X86Subtarget &Subtarget,
7492 SelectionDAG &DAG) {
7493 SDValue Opnd0, Opnd1;
7494 if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
7495 return SDValue();
7496
7497 MVT VT = BV->getSimpleValueType(0);
7498 SDLoc DL(BV);
7499
7500 // Try to generate X86ISD::FMADDSUB node here.
7501 SDValue Opnd2;
7502 // TODO: According to coverage reports, the FMADDSUB transform is not
7503 // triggered by any tests.
7504 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
7505 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7506
7507 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7508 // the ADDSUB idiom has been successfully recognized. There are no known
7509 // X86 targets with 512-bit ADDSUB instructions!
7510 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
7511 // recognition.
7512 if (VT.is512BitVector())
7513 return SDValue();
7514
7515 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7516}
7517
7518/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7519static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7520 const X86Subtarget &Subtarget,
7521 SelectionDAG &DAG) {
7522 MVT VT = BV->getSimpleValueType(0);
7523 unsigned NumElts = VT.getVectorNumElements();
7524 unsigned NumUndefsLO = 0;
7525 unsigned NumUndefsHI = 0;
7526 unsigned Half = NumElts/2;
7527
7528 // Count the number of UNDEF operands in the build_vector in input.
7529 for (unsigned i = 0, e = Half; i != e; ++i)
7530 if (BV->getOperand(i)->isUndef())
7531 NumUndefsLO++;
7532
7533 for (unsigned i = Half, e = NumElts; i != e; ++i)
7534 if (BV->getOperand(i)->isUndef())
7535 NumUndefsHI++;
7536
7537 // Early exit if this is either a build_vector of all UNDEFs or all the
7538 // operands but one are UNDEF.
7539 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7540 return SDValue();
7541
7542 SDLoc DL(BV);
7543 SDValue InVec0, InVec1;
7544 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7545 // Try to match an SSE3 float HADD/HSUB.
7546 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7547 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7548
7549 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7550 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7551 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7552 // Try to match an SSSE3 integer HADD/HSUB.
7553 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7554 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7555
7556 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7557 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7558 }
7559
7560 if (!Subtarget.hasAVX())
7561 return SDValue();
7562
7563 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7564 // Try to match an AVX horizontal add/sub of packed single/double
7565 // precision floating point values from 256-bit vectors.
7566 SDValue InVec2, InVec3;
7567 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7568 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7569 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7570 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7571 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7572
7573 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7574 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7575 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7576 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7577 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7578 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7579 // Try to match an AVX2 horizontal add/sub of signed integers.
7580 SDValue InVec2, InVec3;
7581 unsigned X86Opcode;
7582 bool CanFold = true;
7583
7584 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7585 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7586 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7587 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7588 X86Opcode = X86ISD::HADD;
7589 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7590 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7591 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7592 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7593 X86Opcode = X86ISD::HSUB;
7594 else
7595 CanFold = false;
7596
7597 if (CanFold) {
7598 // Fold this build_vector into a single horizontal add/sub.
7599 // Do this only if the target has AVX2.
7600 if (Subtarget.hasAVX2())
7601 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7602
7603 // Do not try to expand this build_vector into a pair of horizontal
7604 // add/sub if we can emit a pair of scalar add/sub.
7605 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7606 return SDValue();
7607
7608 // Convert this build_vector into a pair of horizontal binop followed by
7609 // a concat vector.
7610 bool isUndefLO = NumUndefsLO == Half;
7611 bool isUndefHI = NumUndefsHI == Half;
7612 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7613 isUndefLO, isUndefHI);
7614 }
7615 }
7616
7617 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7618 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7619 unsigned X86Opcode;
7620 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7621 X86Opcode = X86ISD::HADD;
7622 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7623 X86Opcode = X86ISD::HSUB;
7624 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7625 X86Opcode = X86ISD::FHADD;
7626 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7627 X86Opcode = X86ISD::FHSUB;
7628 else
7629 return SDValue();
7630
7631 // Don't try to expand this build_vector into a pair of horizontal add/sub
7632 // if we can simply emit a pair of scalar add/sub.
7633 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7634 return SDValue();
7635
7636 // Convert this build_vector into two horizontal add/sub followed by
7637 // a concat vector.
7638 bool isUndefLO = NumUndefsLO == Half;
7639 bool isUndefHI = NumUndefsHI == Half;
7640 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7641 isUndefLO, isUndefHI);
7642 }
7643
7644 return SDValue();
7645}
7646
7647/// If a BUILD_VECTOR's source elements all apply the same bit operation and
7648/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
7649/// just apply the bit operation to the vectors.
7650/// NOTE: It's not in our interest to start making a general purpose vectorizer
7651/// from this, but enough scalar bit operations are created by the later
7652/// legalization + scalarization stages to need basic support.
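///
/// Illustrative example (a sketch with hypothetical scalars %x0..%x3): the
/// v4i32 build_vector
///   (build_vector (and %x0, 1), (and %x1, 2), (and %x2, 4), (and %x3, 8))
/// is rebuilt as
///   (and (build_vector %x0, %x1, %x2, %x3), (build_vector 1, 2, 4, 8))
/// so only a single vector AND remains once the two build_vectors are
/// lowered.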
7653static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7654 SelectionDAG &DAG) {
7655 SDLoc DL(Op);
7656 MVT VT = Op->getSimpleValueType(0);
7657 unsigned NumElems = VT.getVectorNumElements();
7658 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7659
7660 // Check that all elements have the same opcode.
7661 // TODO: Should we allow UNDEFS and if so how many?
7662 unsigned Opcode = Op->getOperand(0).getOpcode();
7663 for (unsigned i = 1; i < NumElems; ++i)
7664 if (Opcode != Op->getOperand(i).getOpcode())
7665 return SDValue();
7666
7667 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7668 switch (Opcode) {
7669 default:
7670 return SDValue();
7671 case ISD::AND:
7672 case ISD::XOR:
7673 case ISD::OR:
7674 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7675 return SDValue();
7676 break;
7677 }
7678
7679 SmallVector<SDValue, 4> LHSElts, RHSElts;
7680 for (SDValue Elt : Op->ops()) {
7681 SDValue LHS = Elt.getOperand(0);
7682 SDValue RHS = Elt.getOperand(1);
7683
7684 // We expect the canonicalized RHS operand to be the constant.
7685 if (!isa<ConstantSDNode>(RHS))
7686 return SDValue();
7687 LHSElts.push_back(LHS);
7688 RHSElts.push_back(RHS);
7689 }
7690
7691 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7692 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7693 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7694}
7695
7696/// Create a vector constant without a load. SSE/AVX provide the bare minimum
7697/// functionality to do this, so it's all zeros, all ones, or some derivation
7698/// that is cheap to calculate.
7699static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7700 const X86Subtarget &Subtarget) {
7701 SDLoc DL(Op);
7702 MVT VT = Op.getSimpleValueType();
7703
7704 // Vectors containing all zeros can be matched by pxor and xorps.
7705 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7706 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7707 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7708 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7709 return Op;
7710
7711 return getZeroVector(VT, Subtarget, DAG, DL);
7712 }
7713
7714 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7715 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7716 // vpcmpeqd on 256-bit vectors.
7717 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7718 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7719 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7720 return Op;
7721
7722 return getOnesVector(VT, DAG, DL);
7723 }
7724
7725 return SDValue();
7726}
7727
7728// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
7729// recognized as a permutation of a vector by indices in a non-constant vector.
7730// (build_vector (extract_elt V, (extract_elt I, 0)),
7731// (extract_elt V, (extract_elt I, 1)),
7732// ...
7733// ->
7734// (vpermv I, V)
7735//
7736// TODO: Handle undefs
7737// TODO: Utilize pshufb and zero mask blending to support more efficient
7738// construction of vectors with constant-0 elements.
7739// TODO: Use smaller-element vectors of same width, and "interpolate" the indices,
7740// when no native operation available.
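//
// Illustrative example (a sketch): for v8i32 with AVX2 the chain above is
// lowered to
//   (v8i32 (X86ISD::VPERMV %I, %V))
// which selects to a single vpermd; for v16i8 an X86ISD::PSHUFB node is
// emitted instead.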
7741static SDValue
7742LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
7743 const X86Subtarget &Subtarget) {
7744 // Look for VPERMV and PSHUFB opportunities.
7745 MVT VT = V.getSimpleValueType();
7746 switch (VT.SimpleTy) {
7747 default:
7748 return SDValue();
7749 case MVT::v16i8:
7750 if (!Subtarget.hasSSE3())
7751 return SDValue();
7752 break;
7753 case MVT::v8f32:
7754 case MVT::v8i32:
7755 if (!Subtarget.hasAVX2())
7756 return SDValue();
7757 break;
7758 case MVT::v4i64:
7759 case MVT::v4f64:
7760 if (!Subtarget.hasVLX())
7761 return SDValue();
7762 break;
7763 case MVT::v16f32:
7764 case MVT::v8f64:
7765 case MVT::v16i32:
7766 case MVT::v8i64:
7767 if (!Subtarget.hasAVX512())
7768 return SDValue();
7769 break;
7770 case MVT::v32i16:
7771 if (!Subtarget.hasBWI())
7772 return SDValue();
7773 break;
7774 case MVT::v8i16:
7775 case MVT::v16i16:
7776 if (!Subtarget.hasVLX() || !Subtarget.hasBWI())
7777 return SDValue();
7778 break;
7779 case MVT::v64i8:
7780 if (!Subtarget.hasVBMI())
7781 return SDValue();
7782 break;
7783 case MVT::v32i8:
7784 if (!Subtarget.hasVLX() || !Subtarget.hasVBMI())
7785 return SDValue();
7786 break;
7787 }
7788 SDValue SrcVec, IndicesVec;
7789 // Check for a match of the permute source vector and permute index elements.
7790 // This is done by checking that the i-th build_vector operand is of the form:
7791 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
7792 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
7793 SDValue Op = V.getOperand(Idx);
7794 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7795 return SDValue();
7796
7797 // If this is the first extract encountered in V, set the source vector,
7798 // otherwise verify the extract is from the previously defined source
7799 // vector.
7800 if (!SrcVec)
7801 SrcVec = Op.getOperand(0);
7802 else if (SrcVec != Op.getOperand(0))
7803 return SDValue();
7804 SDValue ExtractedIndex = Op->getOperand(1);
7805 // Peek through extends.
7806 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
7807 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
7808 ExtractedIndex = ExtractedIndex.getOperand(0);
7809 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7810 return SDValue();
7811
7812 // If this is the first extract from the index vector candidate, set the
7813 // indices vector, otherwise verify the extract is from the previously
7814 // defined indices vector.
7815 if (!IndicesVec)
7816 IndicesVec = ExtractedIndex.getOperand(0);
7817 else if (IndicesVec != ExtractedIndex.getOperand(0))
7818 return SDValue();
7819
7820 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
7821 if (!PermIdx || PermIdx->getZExtValue() != Idx)
7822 return SDValue();
7823 }
7824 MVT IndicesVT = VT;
7825 if (VT.isFloatingPoint())
7826 IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
7827 VT.getVectorNumElements());
7828 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
7829 return DAG.getNode(VT == MVT::v16i8 ? X86ISD::PSHUFB : X86ISD::VPERMV,
7830 SDLoc(V), VT, IndicesVec, SrcVec);
7831}
7832
7833SDValue
7834X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7835 SDLoc dl(Op);
7836
7837 MVT VT = Op.getSimpleValueType();
7838 MVT ExtVT = VT.getVectorElementType();
7839 unsigned NumElems = Op.getNumOperands();
7840
7841 // Generate vectors for predicate vectors.
7842 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7843 return LowerBUILD_VECTORvXi1(Op, DAG);
7844
7845 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7846 return VectorConstant;
7847
7848 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7849 // TODO: Support FMSUBADD here if we ever get tests for the FMADDSUB
7850 // transform here.
7851 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
7852 return AddSub;
7853 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7854 return HorizontalOp;
7855 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
7856 return Broadcast;
7857 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7858 return BitOp;
7859
7860 unsigned EVTBits = ExtVT.getSizeInBits();
7861
7862 unsigned NumZero = 0;
7863 unsigned NumNonZero = 0;
7864 uint64_t NonZeros = 0;
7865 bool IsAllConstants = true;
7866 SmallSet<SDValue, 8> Values;
7867 unsigned NumConstants = NumElems;
7868 for (unsigned i = 0; i < NumElems; ++i) {
7869 SDValue Elt = Op.getOperand(i);
7870 if (Elt.isUndef())
7871 continue;
7872 Values.insert(Elt);
7873 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
7874 IsAllConstants = false;
7875 NumConstants--;
7876 }
7877 if (X86::isZeroNode(Elt))
7878 NumZero++;
7879 else {
7880 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7881 NonZeros |= ((uint64_t)1 << i);
7882 NumNonZero++;
7883 }
7884 }
7885
7886 // All undef vector. Return an UNDEF. All zero vectors were handled above.
7887 if (NumNonZero == 0)
7888 return DAG.getUNDEF(VT);
7889
7890 // If we are inserting one variable into a vector of non-zero constants, try
7891 // to avoid loading each constant element as a scalar. Load the constants as a
7892 // vector and then insert the variable scalar element. If insertion is not
7893 // supported, we assume that we will fall back to a shuffle to get the scalar
7894 // blended with the constants. Insertion into a zero vector is handled as a
7895 // special-case somewhere below here.
7896 LLVMContext &Context = *DAG.getContext();
7897 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
7898 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
7899 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
7900 // Create an all-constant vector. The variable element in the old
7901 // build vector is replaced by undef in the constant vector. Save the
7902 // variable scalar element and its index for use in the insertelement.
7903 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
7904 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
7905 SDValue VarElt;
7906 SDValue InsIndex;
7907 for (unsigned i = 0; i != NumElems; ++i) {
7908 SDValue Elt = Op.getOperand(i);
7909 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
7910 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
7911 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
7912 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
7913 else if (!Elt.isUndef()) {
7914 assert(!VarElt.getNode() && !InsIndex.getNode() &&
7915 "Expected one variable element in this vector");
7916 VarElt = Elt;
7917 InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
7918 }
7919 }
7920 Constant *CV = ConstantVector::get(ConstVecOps);
7921 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
7922
7923 // The constants we just created may not be legal (e.g., floating point). We
7924 // must lower the vector right here because we cannot guarantee that we'll
7925 // legalize it before loading it. This is also why we could not just create
7926 // a new build vector here. If the build vector contains illegal constants,
7927 // it could get split back up into a series of insert elements.
7928 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
7929 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
7930 MachineFunction &MF = DAG.getMachineFunction();
7931 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
7932 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
7933 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
7934 }
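// Illustrative example of the block above (a sketch with a hypothetical
// non-constant %x): (build_vector 1.0f, 2.0f, %x, 4.0f) becomes a single
// constant-pool load of <1.0, 2.0, undef, 4.0> followed by
// (insert_vector_elt %load, %x, 2), instead of a separate scalar load for
// each constant element.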
7935
7936 // Special case for a single non-zero, non-undef element.
7937 if (NumNonZero == 1) {
7938 unsigned Idx = countTrailingZeros(NonZeros);
7939 SDValue Item = Op.getOperand(Idx);
7940
7941 // If this is an insertion of an i64 value on x86-32, and if the top bits of
7942 // the value are obviously zero, truncate the value to i32 and do the
7943 // insertion that way. Only do this if the value is non-constant or if the
7944 // value is a constant being inserted into element 0. It is cheaper to do
7945 // a constant pool load than it is to do a movd + shuffle.
7946 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
7947 (!IsAllConstants || Idx == 0)) {
7948 if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
7949 // Handle SSE only.
7950 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
7951 MVT VecVT = MVT::v4i32;
7952
7953 // Truncate the value (which may itself be a constant) to i32, and
7954 // convert it to a vector with movd (S2V+shuffle to zero extend).
7955 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
7956 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
7957 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
7958 Item, Idx * 2, true, Subtarget, DAG));
7959 }
7960 }
7961
7962 // If we have a constant or non-constant insertion into the low element of
7963 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
7964 // the rest of the elements. This will be matched as movd/movq/movss/movsd
7965 // depending on what the source datatype is.
7966 if (Idx == 0) {
7967 if (NumZero == 0)
7968 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7969
7970 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
7971 (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
7972 assert((VT.is128BitVector() || VT.is256BitVector() ||
7973 VT.is512BitVector()) &&
7974 "Expected an SSE value type!");
7975 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7976 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7977 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7978 }
7979
7980 // We can't directly insert an i8 or i16 into a vector, so zero extend
7981 // it to i32 first.
7982 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7983 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7984 if (VT.getSizeInBits() >= 256) {
7985 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
7986 if (Subtarget.hasAVX()) {
7987 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
7988 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7989 } else {
7990 // Without AVX, we need to extend to a 128-bit vector and then
7991 // insert into the 256-bit vector.
7992 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7993 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
7994 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7995 }
7996 } else {
7997 assert(VT.is128BitVector() && "Expected an SSE value type!");
7998 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7999 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8000 }
8001 return DAG.getBitcast(VT, Item);
8002 }
8003 }
8004
8005 // Is it a vector logical left shift?
8006 if (NumElems == 2 && Idx == 1 &&
8007 X86::isZeroNode(Op.getOperand(0)) &&
8008 !X86::isZeroNode(Op.getOperand(1))) {
8009 unsigned NumBits = VT.getSizeInBits();
8010 return getVShift(true, VT,
8011 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8012 VT, Op.getOperand(1)),
8013 NumBits/2, DAG, *this, dl);
8014 }
8015
8016 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
8017 return SDValue();
8018
8019 // Otherwise, if this is a vector with i32 or f32 elements, and the element
8020 // is a non-constant being inserted into an element other than the low one,
8021 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
8022 // movd/movss) to move this into the low element, then shuffle it into
8023 // place.
8024 if (EVTBits == 32) {
8025 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8026 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
8027 }
8028 }
8029
8030 // Splat is obviously ok. Let legalizer expand it to a shuffle.
8031 if (Values.size() == 1) {
8032 if (EVTBits == 32) {
8033 // Instead of a shuffle like this:
8034 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
8035 // Check if it's possible to issue this instead.
8036 // shuffle (vload ptr), undef, <1, 1, 1, 1>
8037 unsigned Idx = countTrailingZeros(NonZeros);
8038 SDValue Item = Op.getOperand(Idx);
8039 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
8040 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
8041 }
8042 return SDValue();
8043 }
8044
8045 // A vector full of immediates; various special cases are already
8046 // handled, so this is best done with a single constant-pool load.
8047 if (IsAllConstants)
8048 return SDValue();
8049
8050 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
8051 return V;
8052
8053 // See if we can use a vector load to get all of the elements.
8054 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
8055 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
8056 if (SDValue LD =
8057 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
8058 return LD;
8059 }
8060
8061 // For AVX-length vectors, build the individual 128-bit pieces and use
8062 // shuffles to put them in place.
8063 if (VT.is256BitVector() || VT.is512BitVector()) {
8064 EVT HVT = EVT::getVectorVT(Context, ExtVT, NumElems/2);
8065
8066 // Build both the lower and upper subvector.
8067 SDValue Lower =
8068 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
8069 SDValue Upper = DAG.getBuildVector(
8070 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
8071
8072 // Recreate the wider vector with the lower and upper part.
8073 if (VT.is256BitVector())
8074 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
8075 return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
8076 }
8077
8078 // Let legalizer expand 2-wide build_vectors.
8079 if (EVTBits == 64) {
8080 if (NumNonZero == 1) {
8081 // One half is zero or undef.
8082 unsigned Idx = countTrailingZeros(NonZeros);
8083 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
8084 Op.getOperand(Idx));
8085 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
8086 }
8087 return SDValue();
8088 }
8089
8090 // If element VT is < 32 bits, convert it to inserts into a zero vector.
8091 if (EVTBits == 8 && NumElems == 16)
8092 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
8093 DAG, Subtarget))
8094 return V;
8095
8096 if (EVTBits == 16 && NumElems == 8)
8097 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
8098 DAG, Subtarget))
8099 return V;
8100
8101 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
8102 if (EVTBits == 32 && NumElems == 4)
8103 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
8104 return V;
8105
8106 // If element VT is == 32 bits, turn it into a number of shuffles.
8107 if (NumElems == 4 && NumZero > 0) {
8108 SmallVector<SDValue, 8> Ops(NumElems);
8109 for (unsigned i = 0; i < 4; ++i) {
8110 bool isZero = !(NonZeros & (1ULL << i));
8111 if (isZero)
8112 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
8113 else
8114 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8115 }
8116
8117 for (unsigned i = 0; i < 2; ++i) {
8118 switch ((NonZeros >> (i*2)) & 0x3) {
8119 default: llvm_unreachable("Unexpected NonZero count");
8120 case 0:
8121 Ops[i] = Ops[i*2]; // Must be a zero vector.
8122 break;
8123 case 1:
8124 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
8125 break;
8126 case 2:
8127 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8128 break;
8129 case 3:
8130 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8131 break;
8132 }
8133 }
8134
8135 bool Reverse1 = (NonZeros & 0x3) == 2;
8136 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
8137 int MaskVec[] = {
8138 Reverse1 ? 1 : 0,
8139 Reverse1 ? 0 : 1,
8140 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
8141 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
8142 };
8143 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
8144 }
8145
8146 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
8147
8148 // Check for a build vector from mostly shuffle plus few inserting.
8149 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
8150 return Sh;
8151
8152 // For SSE 4.1, use insertps to put the high elements into the low element.
8153 if (Subtarget.hasSSE41()) {
8154 SDValue Result;
8155 if (!Op.getOperand(0).isUndef())
8156 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
8157 else
8158 Result = DAG.getUNDEF(VT);
8159
8160 for (unsigned i = 1; i < NumElems; ++i) {
8161 if (Op.getOperand(i).isUndef()) continue;
8162 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
8163 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
8164 }
8165 return Result;
8166 }
8167
8168 // Otherwise, expand into a number of unpckl*, start by extending each of
8169 // our (non-undef) elements to the full vector width with the element in the
8170 // bottom slot of the vector (which generates no code for SSE).
8171 SmallVector<SDValue, 8> Ops(NumElems);
8172 for (unsigned i = 0; i < NumElems; ++i) {
8173 if (!Op.getOperand(i).isUndef())
8174 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8175 else
8176 Ops[i] = DAG.getUNDEF(VT);
8177 }
8178
8179 // Next, we iteratively mix elements, e.g. for v4f32:
8180 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
8181 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
8182 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
8183 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
8184 // Generate scaled UNPCKL shuffle mask.
8185 SmallVector<int, 16> Mask;
8186 for(unsigned i = 0; i != Scale; ++i)
8187 Mask.push_back(i);
8188 for (unsigned i = 0; i != Scale; ++i)
8189 Mask.push_back(NumElems+i);
8190 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
8191
8192 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
8193 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
8194 }
8195 return Ops[0];
8196}
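
// Illustrative sketch (hypothetical helper, not part of X86ISelLowering.cpp):
// the loop above emits log2(NumElems) rounds of scaled UNPCKL masks. For
// NumElems == 4 the generated masks are {0, 4, -1, -1} (Scale == 1) and
// {0, 1, 4, 5} (Scale == 2), matching the v4f32 steps in the comment. A plain
// C++ mirror of the mask construction, with -1 standing in for SM_SentinelUndef:
#include <cstdio>
#include <vector>

static void printScaledUnpacklMasks(unsigned NumElems) {
  for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
    std::vector<int> Mask;
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back(i);
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back(NumElems + i);
    Mask.resize(NumElems, -1); // pad the tail with undef sentinels
    for (int M : Mask)
      std::printf("%d ", M);
    std::printf("\n");
  }
}
// printScaledUnpacklMasks(4) prints "0 4 -1 -1" and "0 1 4 5".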
8197
8198// 256-bit AVX can use the vinsertf128 instruction
8199// to create 256-bit vectors from two other 128-bit ones.
8200static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
8201 SDLoc dl(Op);
8202 MVT ResVT = Op.getSimpleValueType();
8203
8204 assert((ResVT.is256BitVector() ||
8205         ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
8206
8207 SDValue V1 = Op.getOperand(0);
8208 SDValue V2 = Op.getOperand(1);
8209 unsigned NumElems = ResVT.getVectorNumElements();
8210 if (ResVT.is256BitVector())
8211 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
8212
8213 if (Op.getNumOperands() == 4) {
8214 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8215 ResVT.getVectorNumElements()/2);
8216 SDValue V3 = Op.getOperand(2);
8217 SDValue V4 = Op.getOperand(3);
8218 return concat256BitVectors(
8219 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
8220 concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
8221 NumElems, DAG, dl);
8222 }
8223 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
8224}
8225
8226// Return true if all the operands of the given CONCAT_VECTORS node are zeros
8227// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
8228static bool isExpandWithZeros(const SDValue &Op) {
8229 assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
8230        "Expand with zeros only possible in CONCAT_VECTORS nodes!");
8231
8232 for (unsigned i = 1; i < Op.getNumOperands(); i++)
8233 if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
8234 return false;
8235
8236 return true;
8237}
8238
8239 // Returns the promoted source value if the given node is a type promotion (by
8240 // concatenating i1 zeros) of the result of a node that already zeros all upper
8241 // bits of the k-register; otherwise returns an empty SDValue.
8242static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
8243 unsigned Opc = Op.getOpcode();
8244
8245 assert(Opc == ISD::CONCAT_VECTORS &&
8246        Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
8247        "Unexpected node to check for type promotion!");
8248
8249 // As long as we are concatenating zeros to the upper part of a previous node
8250 // result, climb up the tree until a node with different opcode is
8251 // encountered
8252 while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
8253 if (Opc == ISD::INSERT_SUBVECTOR) {
8254 if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
8255 Op.getConstantOperandVal(2) == 0)
8256 Op = Op.getOperand(1);
8257 else
8258 return SDValue();
8259 } else { // Opc == ISD::CONCAT_VECTORS
8260 if (isExpandWithZeros(Op))
8261 Op = Op.getOperand(0);
8262 else
8263 return SDValue();
8264 }
8265 Opc = Op.getOpcode();
8266 }
8267
8268 // Check if the first inserted node zeroes the upper bits, or an 'and' result
8269 // of a node that zeros the upper bits (its masked version).
8270 if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
8271 (Op.getOpcode() == ISD::AND &&
8272 (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
8273 isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
8274 return Op;
8275 }
8276
8277 return SDValue();
8278}
8279
8280static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
8281 const X86Subtarget &Subtarget,
8282 SelectionDAG & DAG) {
8283 SDLoc dl(Op);
8284 MVT ResVT = Op.getSimpleValueType();
8285 unsigned NumOperands = Op.getNumOperands();
8286
8287 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
8288        "Unexpected number of operands in CONCAT_VECTORS");
8289
8290 // If this node promotes - by concatenating zeroes - the type of the result
8291 // of a node whose instruction already zeroes all upper (irrelevant) bits of
8292 // the output register, mark it as legal and catch the pattern in instruction
8293 // selection to avoid emitting extra instructions (for zeroing upper bits).
8294 if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
8295 SDValue ZeroC = DAG.getIntPtrConstant(0, dl);
8296 SDValue AllZeros = getZeroVector(ResVT, Subtarget, DAG, dl);
8297 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
8298 ZeroC);
8299 }
8300
8301 unsigned NumZero = 0;
8302 unsigned NumNonZero = 0;
8303 uint64_t NonZeros = 0;
8304 for (unsigned i = 0; i != NumOperands; ++i) {
8305 SDValue SubVec = Op.getOperand(i);
8306 if (SubVec.isUndef())
8307 continue;
8308 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
8309 ++NumZero;
8310 else {
8311 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
8312 NonZeros |= (uint64_t)1 << i;
8313 ++NumNonZero;
8314 }
8315 }
8316
8317
8318 // If there are zero or one non-zeros we can handle this very simply.
8319 if (NumNonZero <= 1) {
8320 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
8321 : DAG.getUNDEF(ResVT);
8322 if (!NumNonZero)
8323 return Vec;
8324 unsigned Idx = countTrailingZeros(NonZeros);
8325 SDValue SubVec = Op.getOperand(Idx);
8326 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
8327 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
8328 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
8329 }
8330
8331 if (NumOperands > 2) {
8332 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8333 ResVT.getVectorNumElements()/2);
8334 ArrayRef<SDUse> Ops = Op->ops();
8335 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8336 Ops.slice(0, NumOperands/2));
8337 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8338 Ops.slice(NumOperands/2));
8339 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8340 }
8341
8342 assert(NumNonZero == 2 && "Simple cases not handled?");
8343
8344 if (ResVT.getVectorNumElements() >= 16)
8345 return Op; // The operation is legal with KUNPCK
8346
8347 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
8348 DAG.getUNDEF(ResVT), Op.getOperand(0),
8349 DAG.getIntPtrConstant(0, dl));
8350 unsigned NumElems = ResVT.getVectorNumElements();
8351 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
8352 DAG.getIntPtrConstant(NumElems/2, dl));
8353}
8354
8355static SDValue LowerCONCAT_VECTORS(SDValue Op,
8356 const X86Subtarget &Subtarget,
8357 SelectionDAG &DAG) {
8358 MVT VT = Op.getSimpleValueType();
8359 if (VT.getVectorElementType() == MVT::i1)
8360 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
8361
8362 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
8363        (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
8364                                 Op.getNumOperands() == 4)));
8365
8366 // AVX can use the vinsertf128 instruction to create 256-bit vectors
8367 // from two other 128-bit ones.
8368
8369 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
8370 return LowerAVXCONCAT_VECTORS(Op, DAG);
8371}
8372
8373//===----------------------------------------------------------------------===//
8374// Vector shuffle lowering
8375//
8376// This is an experimental code path for lowering vector shuffles on x86. It is
8377// designed to handle arbitrary vector shuffles and blends, gracefully
8378// degrading performance as necessary. It works hard to recognize idiomatic
8379// shuffles and lower them to optimal instruction patterns without leaving
8380// a framework that allows reasonably efficient handling of all vector shuffle
8381// patterns.
8382//===----------------------------------------------------------------------===//
8383
8384/// \brief Tiny helper function to identify a no-op mask.
8385///
8386/// This is a somewhat boring predicate function. It checks whether the mask
8387/// array input, which is assumed to be a single-input shuffle mask of the kind
8388/// used by the X86 shuffle instructions (not a fully general
8389/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
8390/// in-place shuffle are 'no-op's.
8391static bool isNoopShuffleMask(ArrayRef<int> Mask) {
8392 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8393 assert(Mask[i] >= -1 && "Out of bound mask element!");
8394 if (Mask[i] >= 0 && Mask[i] != i)
8395 return false;
8396 }
8397 return true;
8398}
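
// Worked example (hypothetical constexpr mirror, not part of this file):
// {-1, 1, -1, 3} is a no-op mask, {1, 0, 2, 3} is not.
constexpr bool isNoopMask4(int M0, int M1, int M2, int M3) {
  return (M0 < 0 || M0 == 0) && (M1 < 0 || M1 == 1) &&
         (M2 < 0 || M2 == 2) && (M3 < 0 || M3 == 3);
}
static_assert(isNoopMask4(-1, 1, -1, 3), "in-place/undef elements are no-ops");
static_assert(!isNoopMask4(1, 0, 2, 3), "swapping elements 0 and 1 is a real shuffle");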
8399
8400/// \brief Test whether there are elements crossing 128-bit lanes in this
8401/// shuffle mask.
8402///
8403/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
8404/// and we routinely test for these.
8405static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
8406 int LaneSize = 128 / VT.getScalarSizeInBits();
8407 int Size = Mask.size();
8408 for (int i = 0; i < Size; ++i)
8409 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
8410 return true;
8411 return false;
8412}
8413
8414/// \brief Test whether a shuffle mask is equivalent within each sub-lane.
8415///
8416/// This checks a shuffle mask to see if it is performing the same
8417/// lane-relative shuffle in each sub-lane. This trivially implies
8418/// that it is also not lane-crossing. It may however involve a blend from the
8419/// same lane of a second vector.
8420///
8421/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
8422/// non-trivial to compute in the face of undef lanes. The representation is
8423/// suitable for use with existing 128-bit shuffles as entries from the second
8424/// vector have been remapped to [LaneSize, 2*LaneSize).
8425static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
8426 ArrayRef<int> Mask,
8427 SmallVectorImpl<int> &RepeatedMask) {
8428 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8429 RepeatedMask.assign(LaneSize, -1);
8430 int Size = Mask.size();
8431 for (int i = 0; i < Size; ++i) {
8432 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8433 if (Mask[i] < 0)
8434 continue;
8435 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8436 // This entry crosses lanes, so there is no way to model this shuffle.
8437 return false;
8438
8439 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8440 // Adjust second vector indices to start at LaneSize instead of Size.
8441 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8442 : Mask[i] % LaneSize + LaneSize;
8443 if (RepeatedMask[i % LaneSize] < 0)
8444 // This is the first non-undef entry in this slot of a 128-bit lane.
8445 RepeatedMask[i % LaneSize] = LocalM;
8446 else if (RepeatedMask[i % LaneSize] != LocalM)
8447 // Found a mismatch with the repeated mask.
8448 return false;
8449 }
8450 return true;
8451}
8452
8453/// Test whether a shuffle mask is equivalent within each 128-bit lane.
8454static bool
8455is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8456 SmallVectorImpl<int> &RepeatedMask) {
8457 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8458}
8459
8460/// Test whether a shuffle mask is equivalent within each 256-bit lane.
8461static bool
8462is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8463 SmallVectorImpl<int> &RepeatedMask) {
8464 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8465}
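
// Worked example (hypothetical helper, not part of this file): for v8f32 a
// 128-bit lane holds 4 elements, so the two-input mask
// {0, 1, 8, 9, 4, 5, 12, 13} repeats the per-lane pattern {0, 1, 4, 5} in both
// lanes (second-input indices remapped into [4, 8)) and would be accepted,
// while {0, 1, 8, 9, 6, 7, 14, 15} would be rejected. A standalone mirror of
// the check for the 4-elements-per-lane case:
#include <vector>

static bool isRepeated4PerLaneMask(const std::vector<int> &Mask,
                                   std::vector<int> &Repeated) {
  const int LaneSize = 4, Size = (int)Mask.size();
  Repeated.assign(LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      return false; // crosses a 128-bit lane
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (Repeated[i % LaneSize] < 0)
      Repeated[i % LaneSize] = LocalM;
    else if (Repeated[i % LaneSize] != LocalM)
      return false; // slot repeats with a different element
  }
  return true;
}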
8466
8467/// Test whether a target shuffle mask is equivalent within each sub-lane.
8468/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
8469static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8470 ArrayRef<int> Mask,
8471 SmallVectorImpl<int> &RepeatedMask) {
8472 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8473 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8474 int Size = Mask.size();
8475 for (int i = 0; i < Size; ++i) {
8476 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8477 if (Mask[i] == SM_SentinelUndef)
8478 continue;
8479 if (Mask[i] == SM_SentinelZero) {
8480 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
8481 return false;
8482 RepeatedMask[i % LaneSize] = SM_SentinelZero;
8483 continue;
8484 }
8485 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8486 // This entry crosses lanes, so there is no way to model this shuffle.
8487 return false;
8488
8489 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8490 // Adjust second vector indices to start at LaneSize instead of Size.
8491 int LocalM =
8492 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
8493 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
8494 // This is the first non-undef entry in this slot of a 128-bit lane.
8495 RepeatedMask[i % LaneSize] = LocalM;
8496 else if (RepeatedMask[i % LaneSize] != LocalM)
8497 // Found a mismatch with the repeated mask.
8498 return false;
8499 }
8500 return true;
8501}
8502
8503/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
8504/// arguments.
8505///
8506/// This is a fast way to test a shuffle mask against a fixed pattern:
8507///
8508 /// if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
8509///
8510/// It returns true if the mask is exactly as wide as the argument list, and
8511/// each element of the mask is either -1 (signifying undef) or the value given
8512/// in the argument.
8513static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
8514 ArrayRef<int> ExpectedMask) {
8515 if (Mask.size() != ExpectedMask.size())
8516 return false;
8517
8518 int Size = Mask.size();
8519
8520 // If the values are build vectors, we can look through them to find
8521 // equivalent inputs that make the shuffles equivalent.
8522 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
8523 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
8524
8525 for (int i = 0; i < Size; ++i) {
8526 assert(Mask[i] >= -1 && "Out of bound mask element!");
8527 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
8528 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
8529 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
8530 if (!MaskBV || !ExpectedBV ||
8531 MaskBV->getOperand(Mask[i] % Size) !=
8532 ExpectedBV->getOperand(ExpectedMask[i] % Size))
8533 return false;
8534 }
8535 }
8536
8537 return true;
8538}
8539
8540/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
8541///
8542/// The masks must be exactly the same width.
8543///
8544/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
8545/// value in ExpectedMask is always accepted. Otherwise the indices must match.
8546///
8547/// SM_SentinelZero is accepted as a valid negative index but must match in both.
8548static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
8549 ArrayRef<int> ExpectedMask) {
8550 int Size = Mask.size();
8551 if (Size != (int)ExpectedMask.size())
8552 return false;
8553
8554 for (int i = 0; i < Size; ++i)
8555 if (Mask[i] == SM_SentinelUndef)
8556 continue;
8557 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
8558 return false;
8559 else if (Mask[i] != ExpectedMask[i])
8560 return false;
8561
8562 return true;
8563}
8564
8565// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
8566// mask.
8567static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
8568 const APInt &Zeroable) {
8569 int NumElts = Mask.size();
8570 assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
8571
8572 SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
8573 for (int i = 0; i != NumElts; ++i) {
8574 int M = Mask[i];
8575 if (M == SM_SentinelUndef)
8576 continue;
8577 assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
8578 TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
8579 }
8580 return TargetMask;
8581}
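
// Worked example (hypothetical helper, not part of this file): merging the DAG
// mask {0, 5, 2, 7} with a zeroable mask whose bits 0 and 2 are set produces
// the target mask {SM_SentinelZero, 5, SM_SentinelZero, 7}; undef entries stay
// SM_SentinelUndef. Using -1/-2 for the two sentinels:
#include <vector>

static std::vector<int> mergeZeroableExample(const std::vector<int> &Mask,
                                             unsigned ZeroableBits) {
  const int Undef = -1, Zero = -2; // SM_SentinelUndef / SM_SentinelZero
  std::vector<int> Target(Mask.size(), Undef);
  for (size_t i = 0; i != Mask.size(); ++i)
    if (Mask[i] != Undef)
      Target[i] = (ZeroableBits & (1u << i)) ? Zero : Mask[i];
  return Target;
}
// mergeZeroableExample({0, 5, 2, 7}, 0x5) == {-2, 5, -2, 7}.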
8582
8583// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
8584// instructions.
8585static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
8586 if (VT != MVT::v8i32 && VT != MVT::v8f32)
8587 return false;
8588
8589 SmallVector<int, 8> Unpcklwd;
8590 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
8591 /* Unary = */ false);
8592 SmallVector<int, 8> Unpckhwd;
8593 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
8594 /* Unary = */ false);
8595 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
8596 isTargetShuffleEquivalent(Mask, Unpckhwd));
8597 return IsUnpackwdMask;
8598}
8599
8600/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
8601///
8602/// This helper function produces an 8-bit shuffle immediate corresponding to
8603/// the ubiquitous shuffle encoding scheme used in x86 instructions for
8604/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
8605/// example.
8606///
8607/// NB: We rely heavily on "undef" masks preserving the input lane.
8608static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
8609 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
8610 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
8611 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
8612 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
8613 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
8614
8615 unsigned Imm = 0;
8616 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
8617 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
8618 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
8619 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
8620 return Imm;
8621}
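
// Worked example (hypothetical constexpr mirror, not part of this file): each
// 2-bit field of the immediate selects a source lane, so the reversing mask
// <3, 2, 1, 0> encodes as 0x1B and undef lanes default to "stay in place":
static constexpr unsigned packV4ImmExample(int M0, int M1, int M2, int M3) {
  return (unsigned)(((M0 < 0 ? 0 : M0) << 0) | ((M1 < 0 ? 1 : M1) << 2) |
                    ((M2 < 0 ? 2 : M2) << 4) | ((M3 < 0 ? 3 : M3) << 6));
}
static_assert(packV4ImmExample(3, 2, 1, 0) == 0x1B, "reversal immediate");
static_assert(packV4ImmExample(-1, -1, 2, 3) == 0xE4, "undef lanes keep their slot");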
8622
8623static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
8624 SelectionDAG &DAG) {
8625 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
8626}
8627
8628/// \brief Compute whether each element of a shuffle is zeroable.
8629///
8630/// A "zeroable" vector shuffle element is one which can be lowered to zero.
8631/// Either it is an undef element in the shuffle mask, the element of the input
8632/// referenced is undef, or the element of the input referenced is known to be
8633/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8634/// as many lanes with this technique as possible to simplify the remaining
8635/// shuffle.
8636static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
8637 SDValue V1, SDValue V2) {
8638 APInt Zeroable(Mask.size(), 0);
8639 V1 = peekThroughBitcasts(V1);
8640 V2 = peekThroughBitcasts(V2);
8641
8642 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8643 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8644
8645 int VectorSizeInBits = V1.getValueSizeInBits();
8646 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
8647 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8648
8649 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8650 int M = Mask[i];
8651 // Handle the easy cases.
8652 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8653 Zeroable.setBit(i);
8654 continue;
8655 }
8656
8657 // Determine shuffle input and normalize the mask.
8658 SDValue V = M < Size ? V1 : V2;
8659 M %= Size;
8660
8661 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8662 if (V.getOpcode() != ISD::BUILD_VECTOR)
8663 continue;
8664
8665 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
8666 // the (larger) source element must be UNDEF/ZERO.
8667 if ((Size % V.getNumOperands()) == 0) {
8668 int Scale = Size / V->getNumOperands();
8669 SDValue Op = V.getOperand(M / Scale);
8670 if (Op.isUndef() || X86::isZeroNode(Op))
8671 Zeroable.setBit(i);
8672 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8673 APInt Val = Cst->getAPIntValue();
8674 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8675 Val = Val.getLoBits(ScalarSizeInBits);
8676 if (Val == 0)
8677 Zeroable.setBit(i);
8678 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8679 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8680 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8681 Val = Val.getLoBits(ScalarSizeInBits);
8682 if (Val == 0)
8683 Zeroable.setBit(i);
8684 }
8685 continue;
8686 }
8687
8688 // If the BUILD_VECTOR has more elements, then all the (smaller) source
8689 // elements must be UNDEF or ZERO.
8690 if ((V.getNumOperands() % Size) == 0) {
8691 int Scale = V->getNumOperands() / Size;
8692 bool AllZeroable = true;
8693 for (int j = 0; j < Scale; ++j) {
8694 SDValue Op = V.getOperand((M * Scale) + j);
8695 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
8696 }
8697 if (AllZeroable)
8698 Zeroable.setBit(i);
8699 continue;
8700 }
8701 }
8702
8703 return Zeroable;
8704}
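
// Simplified sketch (hypothetical helper covering only the "easy cases" above,
// not the BUILD_VECTOR constant walk): with four elements, V2 known to be all
// zeros and mask {0, 4, -1, 1}, elements 1 (reads the zero vector) and 2
// (undef) are zeroable, i.e. the returned bit mask is 0x6.
#include <cstdint>
#include <vector>

static uint32_t easyZeroableBits(const std::vector<int> &Mask, bool V1IsZero,
                                 bool V2IsZero) {
  const int Size = (int)Mask.size();
  uint32_t Bits = 0;
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0 || (M < Size && V1IsZero) || (M >= Size && V2IsZero))
      Bits |= 1u << i; // undef mask entry or element of a known-zero input
  }
  return Bits;
}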
8705
8706 // The shuffle result has the form:
8707 //   0*, a[0], 0*, a[1], ..., 0*, a[n], n >= 0, where the a[] elements appear
8708 //   in ascending order. Each element of Zeroable corresponds to a particular
8709 //   element of Mask, as described in computeZeroableShuffleElements.
8710 //
8711 // The function looks for a sub-mask whose nonzero elements are in increasing
8712 // order. If such a sub-mask exists, the function returns true.
8713static bool isNonZeroElementsInOrder(const APInt &Zeroable,
8714 ArrayRef<int> Mask, const EVT &VectorType,
8715 bool &IsZeroSideLeft) {
8716 int NextElement = -1;
8717 // Check if the Mask's nonzero elements are in increasing order.
8718 for (int i = 0, e = Mask.size(); i < e; i++) {
8719 // Checks that the mask's zero elements are built from only zeros.
8720 assert(Mask[i] >= -1 && "Out of bound mask element!");
8721 if (Mask[i] < 0)
8722 return false;
8723 if (Zeroable[i])
8724 continue;
8725 // Find the lowest non zero element
8726 if (NextElement < 0) {
8727 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
8728 IsZeroSideLeft = NextElement != 0;
8729 }
8730 // Exit if the mask's non zero elements are not in increasing order.
8731 if (NextElement != Mask[i])
8732 return false;
8733 NextElement++;
8734 }
8735 return true;
8736}
8737
8738/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
8739static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8740 ArrayRef<int> Mask, SDValue V1,
8741 SDValue V2,
8742 const APInt &Zeroable,
8743 const X86Subtarget &Subtarget,
8744 SelectionDAG &DAG) {
8745 int Size = Mask.size();
8746 int LaneSize = 128 / VT.getScalarSizeInBits();
8747 const int NumBytes = VT.getSizeInBits() / 8;
8748 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
8749
8750 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8751        (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8752        (Subtarget.hasBWI() && VT.is512BitVector()));
8753
8754 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8755 // Sign bit set in i8 mask means zero element.
8756 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
8757
8758 SDValue V;
8759 for (int i = 0; i < NumBytes; ++i) {
8760 int M = Mask[i / NumEltBytes];
8761 if (M < 0) {
8762 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8763 continue;
8764 }
8765 if (Zeroable[i / NumEltBytes]) {
8766 PSHUFBMask[i] = ZeroMask;
8767 continue;
8768 }
8769
8770 // We can only use a single input of V1 or V2.
8771 SDValue SrcV = (M >= Size ? V2 : V1);
8772 if (V && V != SrcV)
8773 return SDValue();
8774 V = SrcV;
8775 M %= Size;
8776
8777 // PSHUFB can't cross lanes, ensure this doesn't happen.
8778 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
8779 return SDValue();
8780
8781 M = M % LaneSize;
8782 M = M * NumEltBytes + (i % NumEltBytes);
8783 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8784 }
8785 assert(V && "Failed to find a source input")(static_cast <bool> (V && "Failed to find a source input"
) ? void (0) : __assert_fail ("V && \"Failed to find a source input\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 8785, __extension__ __PRETTY_FUNCTION__))
;
8786
8787 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8788 return DAG.getBitcast(
8789 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8790 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
8791}
8792
8793static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
8794 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8795 const SDLoc &dl);
8796
8797 // X86 has a dedicated shuffle that can be lowered to VEXPAND
8798static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
8799 const APInt &Zeroable,
8800 ArrayRef<int> Mask, SDValue &V1,
8801 SDValue &V2, SelectionDAG &DAG,
8802 const X86Subtarget &Subtarget) {
8803 bool IsLeftZeroSide = true;
8804 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
8805 IsLeftZeroSide))
8806 return SDValue();
8807 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
8808 MVT IntegerType =
8809 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8810 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
8811 unsigned NumElts = VT.getVectorNumElements();
8812 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
8813        "Unexpected number of vector elements");
8814 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
8815 Subtarget, DAG, DL);
8816 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
8817 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
8818 return DAG.getSelect(DL, VT, VMask,
8819 DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
8820 ZeroVector);
8821}
8822
8823static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
8824 unsigned &UnpackOpcode, bool IsUnary,
8825 ArrayRef<int> TargetMask, SDLoc &DL,
8826 SelectionDAG &DAG,
8827 const X86Subtarget &Subtarget) {
8828 int NumElts = VT.getVectorNumElements();
8829
8830 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
8831 for (int i = 0; i != NumElts; i += 2) {
8832 int M1 = TargetMask[i + 0];
8833 int M2 = TargetMask[i + 1];
8834 Undef1 &= (SM_SentinelUndef == M1);
8835 Undef2 &= (SM_SentinelUndef == M2);
8836 Zero1 &= isUndefOrZero(M1);
8837 Zero2 &= isUndefOrZero(M2);
8838 }
8839 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
8840        "Zeroable shuffle detected");
8841
8842 // Attempt to match the target mask against the unpack lo/hi mask patterns.
8843 SmallVector<int, 64> Unpckl, Unpckh;
8844 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
8845 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8846 UnpackOpcode = X86ISD::UNPCKL;
8847 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8848 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8849 return true;
8850 }
8851
8852 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
8853 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8854 UnpackOpcode = X86ISD::UNPCKH;
8855 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8856 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8857 return true;
8858 }
8859
8860 // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
8861 if (IsUnary && (Zero1 || Zero2)) {
8862 // Don't bother if we can blend instead.
8863 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
8864 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
8865 return false;
8866
8867 bool MatchLo = true, MatchHi = true;
8868 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
8869 int M = TargetMask[i];
8870
8871 // Ignore if the input is known to be zero or the index is undef.
8872 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
8873 (M == SM_SentinelUndef))
8874 continue;
8875
8876 MatchLo &= (M == Unpckl[i]);
8877 MatchHi &= (M == Unpckh[i]);
8878 }
8879
8880 if (MatchLo || MatchHi) {
8881 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
8882 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8883 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8884 return true;
8885 }
8886 }
8887
8888 // If a binary shuffle, commute and try again.
8889 if (!IsUnary) {
8890 ShuffleVectorSDNode::commuteMask(Unpckl);
8891 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8892 UnpackOpcode = X86ISD::UNPCKL;
8893 std::swap(V1, V2);
8894 return true;
8895 }
8896
8897 ShuffleVectorSDNode::commuteMask(Unpckh);
8898 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8899 UnpackOpcode = X86ISD::UNPCKH;
8900 std::swap(V1, V2);
8901 return true;
8902 }
8903 }
8904
8905 return false;
8906}
8907
8908// X86 has dedicated unpack instructions that can handle specific blend
8909// operations: UNPCKH and UNPCKL.
8910static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
8911 ArrayRef<int> Mask, SDValue V1,
8912 SDValue V2, SelectionDAG &DAG) {
8913 SmallVector<int, 8> Unpckl;
8914 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
8915 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8916 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
8917
8918 SmallVector<int, 8> Unpckh;
8919 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
8920 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8921 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
8922
8923 // Commute and try again.
8924 ShuffleVectorSDNode::commuteMask(Unpckl);
8925 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8926 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
8927
8928 ShuffleVectorSDNode::commuteMask(Unpckh);
8929 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8930 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
8931
8932 return SDValue();
8933}
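
// Worked example (hypothetical helper, not part of this file): within a single
// 128-bit register, UNPCKL interleaves the low halves of the two inputs and
// UNPCKH the high halves, so the equivalent shuffle masks for v4i32 are
// {0, 4, 1, 5} and {2, 6, 3, 7}, and for v8i16 {0, 8, 1, 9, 2, 10, 3, 11} and
// {4, 12, 5, 13, 6, 14, 7, 15}:
#include <vector>

static std::vector<int> singleLaneUnpackMask(int NumElts, bool Lo) {
  std::vector<int> Mask;
  int Base = Lo ? 0 : NumElts / 2;
  for (int i = 0; i < NumElts / 2; ++i) {
    Mask.push_back(Base + i);           // element from the first input
    Mask.push_back(Base + i + NumElts); // matching element from the second
  }
  return Mask;
}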
8934
8935// X86 has dedicated pack instructions that can handle specific truncation
8936// operations: PACKSS and PACKUS.
8937static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
8938 SDValue &V2, unsigned &PackOpcode,
8939 ArrayRef<int> TargetMask,
8940 SelectionDAG &DAG,
8941 const X86Subtarget &Subtarget) {
8942 unsigned NumElts = VT.getVectorNumElements();
8943 unsigned BitSize = VT.getScalarSizeInBits();
8944 MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
8945 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
8946
8947 auto MatchPACK = [&](SDValue N1, SDValue N2) {
8948 SDValue VV1 = DAG.getBitcast(PackVT, N1);
8949 SDValue VV2 = DAG.getBitcast(PackVT, N2);
8950 if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
8951 (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
8952 V1 = VV1;
8953 V2 = VV2;
8954 SrcVT = PackVT;
8955 PackOpcode = X86ISD::PACKSS;
8956 return true;
8957 }
8958
8959 if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
8960 APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
8961 if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
8962 (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
8963 V1 = VV1;
8964 V2 = VV2;
8965 SrcVT = PackVT;
8966 PackOpcode = X86ISD::PACKUS;
8967 return true;
8968 }
8969 }
8970
8971 return false;
8972 };
8973
8974 // Try binary shuffle.
8975 SmallVector<int, 32> BinaryMask;
8976 createPackShuffleMask(VT, BinaryMask, false);
8977 if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
8978 if (MatchPACK(V1, V2))
8979 return true;
8980
8981 // Try unary shuffle.
8982 SmallVector<int, 32> UnaryMask;
8983 createPackShuffleMask(VT, UnaryMask, true);
8984 if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
8985 if (MatchPACK(V1, V1))
8986 return true;
8987
8988 return false;
8989}
8990
8991static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
8992 ArrayRef<int> Mask, SDValue V1,
8993 SDValue V2, SelectionDAG &DAG,
8994 const X86Subtarget &Subtarget) {
8995 MVT PackVT;
8996 unsigned PackOpcode;
8997 if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
8998 Subtarget))
8999 return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
9000 DAG.getBitcast(PackVT, V2));
9001
9002 return SDValue();
9003}
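
// Worked note (assumption-laden, not from the original source): viewed as a
// shuffle on the narrow result type, a 128-bit binary PACK keeps every
// even-indexed element of the two concatenated inputs, i.e. Mask[i] == 2 * i,
// which for a v16i8 result is {0, 2, 4, ..., 30}. PACKSS additionally needs
// every source value to be sign-representable in the narrow type (the
// ComputeNumSignBits check above), while PACKUS needs the discarded high half
// to be known zero (the MaskedValueIsZero check).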
9004
9005/// \brief Try to emit a bitmask instruction for a shuffle.
9006///
9007/// This handles cases where we can model a blend exactly as a bitmask due to
9008/// one of the inputs being zeroable.
9009static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
9010 SDValue V2, ArrayRef<int> Mask,
9011 const APInt &Zeroable,
9012 SelectionDAG &DAG) {
9013 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
9014 MVT EltVT = VT.getVectorElementType();
9015 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9016 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9017 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
9018 SDValue V;
9019 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9020 if (Zeroable[i])
9021 continue;
9022 if (Mask[i] % Size != i)
9023 return SDValue(); // Not a blend.
9024 if (!V)
9025 V = Mask[i] < Size ? V1 : V2;
9026 else if (V != (Mask[i] < Size ? V1 : V2))
9027 return SDValue(); // Can only let one input through the mask.
9028
9029 VMaskOps[i] = AllOnes;
9030 }
9031 if (!V)
9032 return SDValue(); // No non-zeroable elements!
9033
9034 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
9035 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
9036}
9037
9038/// \brief Try to emit a blend instruction for a shuffle using bit math.
9039///
9040/// This is used as a fallback approach when first class blend instructions are
9041/// unavailable. Currently it is only suitable for integer vectors, but could
9042/// be generalized for floating point vectors if desirable.
9043static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
9044 SDValue V2, ArrayRef<int> Mask,
9045 SelectionDAG &DAG) {
9046 assert(VT.isInteger() && "Only supports integer vector types!");
9047 MVT EltVT = VT.getVectorElementType();
9048 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9049 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9050 SmallVector<SDValue, 16> MaskOps;
9051 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9052 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
9053 return SDValue(); // Shuffled input!
9054 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
9055 }
9056
9057 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
9058 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
9059 // We have to cast V2 around.
9060 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
9061 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
9062 DAG.getBitcast(MaskVT, V1Mask),
9063 DAG.getBitcast(MaskVT, V2)));
9064 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
9065}
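
// Worked example (hypothetical helper, not part of this file): the fallback
// above is the classic bitwise select (V1 & M) | (~M & V2), with M all-ones in
// lanes taken from V1 (ANDNP supplies the ~M & V2 term). For one 32-bit lane:
#include <cstdint>

static uint32_t bitBlendLane(uint32_t A, uint32_t B, bool TakeA) {
  uint32_t M = TakeA ? 0xFFFFFFFFu : 0u;
  return (A & M) | (~M & B);
}
// bitBlendLane(x, y, true) == x and bitBlendLane(x, y, false) == y.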
9066
9067static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
9068 SDValue PreservedSrc,
9069 const X86Subtarget &Subtarget,
9070 SelectionDAG &DAG);
9071
9072static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
9073 MutableArrayRef<int> TargetMask,
9074 bool &ForceV1Zero, bool &ForceV2Zero,
9075 uint64_t &BlendMask) {
9076 bool V1IsZeroOrUndef =
9077 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
9078 bool V2IsZeroOrUndef =
9079 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
9080
9081 BlendMask = 0;
9082 ForceV1Zero = false, ForceV2Zero = false;
9083 assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
9084
9085 // Attempt to generate the binary blend mask. If an input is zero then
9086 // we can use any lane.
9087 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
9088 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
9089 int M = TargetMask[i];
9090 if (M == SM_SentinelUndef)
9091 continue;
9092 if (M == i)
9093 continue;
9094 if (M == i + Size) {
9095 BlendMask |= 1ull << i;
9096 continue;
9097 }
9098 if (M == SM_SentinelZero) {
9099 if (V1IsZeroOrUndef) {
9100 ForceV1Zero = true;
9101 TargetMask[i] = i;
9102 continue;
9103 }
9104 if (V2IsZeroOrUndef) {
9105 ForceV2Zero = true;
9106 BlendMask |= 1ull << i;
9107 TargetMask[i] = i + Size;
9108 continue;
9109 }
9110 }
9111 return false;
9112 }
9113 return true;
9114}
9115
9116static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
9117 int Scale) {
9118 uint64_t ScaledMask = 0;
9119 for (int i = 0; i != Size; ++i)
9120 if (BlendMask & (1ull << i))
9121 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
9122 return ScaledMask;
9123}
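
// Worked example (hypothetical constexpr mirror, not part of this file):
// scaling widens each blend-mask bit into Scale consecutive bits, so a v4i64
// blend mask 0b0101 scaled by 2 (to 32-bit dwords) becomes 0b00110011:
#include <cstdint>

static constexpr uint64_t scaleBlendMaskExample(uint64_t BlendMask, int Size,
                                                int Scale) {
  // Requires C++14 constexpr (loop in a constant expression).
  uint64_t Scaled = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      Scaled |= ((1ull << Scale) - 1) << (i * Scale);
  return Scaled;
}
static_assert(scaleBlendMaskExample(0x5, 4, 2) == 0x33, "0b0101 -> 0b00110011");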
9124
9125/// \brief Try to emit a blend instruction for a shuffle.
9126///
9127/// This doesn't do any checks for the availability of instructions for blending
9128/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
9129/// be matched in the backend with the type given. What it does check for is
9130/// that the shuffle mask is a blend, or convertible into a blend with zero.
9131static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
9132 SDValue V2, ArrayRef<int> Original,
9133 const APInt &Zeroable,
9134 const X86Subtarget &Subtarget,
9135 SelectionDAG &DAG) {
9136 SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
9137
9138 uint64_t BlendMask = 0;
9139 bool ForceV1Zero = false, ForceV2Zero = false;
9140 if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
9141 BlendMask))
9142 return SDValue();
9143
9144 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
9145 if (ForceV1Zero)
9146 V1 = getZeroVector(VT, Subtarget, DAG, DL);
9147 if (ForceV2Zero)
9148 V2 = getZeroVector(VT, Subtarget, DAG, DL);
9149
9150 switch (VT.SimpleTy) {
9151 case MVT::v2f64:
9152 case MVT::v4f32:
9153 case MVT::v4f64:
9154 case MVT::v8f32:
9155 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
9156 DAG.getConstant(BlendMask, DL, MVT::i8));
9157
9158 case MVT::v4i64:
9159 case MVT::v8i32:
9160 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9161 LLVM_FALLTHROUGH;
9162 case MVT::v2i64:
9163 case MVT::v4i32:
9164 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
9165 // that instruction.
9166 if (Subtarget.hasAVX2()) {
9167 // Scale the blend by the number of 32-bit dwords per element.
9168 int Scale = VT.getScalarSizeInBits() / 32;
9169 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9170 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
9171 V1 = DAG.getBitcast(BlendVT, V1);
9172 V2 = DAG.getBitcast(BlendVT, V2);
9173 return DAG.getBitcast(
9174 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
9175 DAG.getConstant(BlendMask, DL, MVT::i8)));
9176 }
9177 LLVM_FALLTHROUGH;
9178 case MVT::v8i16: {
9179 // For integer shuffles we need to expand the mask and cast the inputs to
9180 // v8i16s prior to blending.
9181 int Scale = 8 / VT.getVectorNumElements();
9182 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9183 V1 = DAG.getBitcast(MVT::v8i16, V1);
9184 V2 = DAG.getBitcast(MVT::v8i16, V2);
9185 return DAG.getBitcast(VT,
9186 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
9187 DAG.getConstant(BlendMask, DL, MVT::i8)));
9188 }
9189
9190 case MVT::v16i16: {
9191 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9192 SmallVector<int, 8> RepeatedMask;
9193 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
9194 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
9195 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
9196 BlendMask = 0;
9197 for (int i = 0; i < 8; ++i)
9198 if (RepeatedMask[i] >= 8)
9199 BlendMask |= 1ull << i;
9200 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
9201 DAG.getConstant(BlendMask, DL, MVT::i8));
9202 }
9203 LLVM_FALLTHROUGH;
9204 }
9205 case MVT::v16i8:
9206 case MVT::v32i8: {
9207 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
9208        "256-bit byte-blends require AVX2 support!");
9209
9210 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
9211 MVT IntegerType =
9212 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9213 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9214 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9215 }
9216
9217 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
9218 if (SDValue Masked =
9219 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
9220 return Masked;
9221
9222 // Scale the blend by the number of bytes per element.
9223 int Scale = VT.getScalarSizeInBits() / 8;
9224
9225 // This form of blend is always done on bytes. Compute the byte vector
9226 // type.
9227 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9228
9229 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
9230 // mix of LLVM's code generator and the x86 backend. We tell the code
9231 // generator that boolean values in the elements of an x86 vector register
9232 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
9233 // mapping a select to operand #1, and 'false' mapping to operand #2. The
9234 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
9235 // of the element (the remaining are ignored) and 0 in that high bit would
9236 // mean operand #1 while 1 in the high bit would mean operand #2. So while
9237 // the LLVM model for boolean values in vector elements gets the relevant
9238 // bit set, it is set backwards and over constrained relative to x86's
9239 // actual model.
9240 SmallVector<SDValue, 32> VSELECTMask;
9241 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9242 for (int j = 0; j < Scale; ++j)
9243 VSELECTMask.push_back(
9244 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
9245 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
9246 MVT::i8));
9247
9248 V1 = DAG.getBitcast(BlendVT, V1);
9249 V2 = DAG.getBitcast(BlendVT, V2);
9250 return DAG.getBitcast(
9251 VT,
9252 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
9253 V1, V2));
9254 }
9255 case MVT::v16f32:
9256 case MVT::v8f64:
9257 case MVT::v8i64:
9258 case MVT::v16i32:
9259 case MVT::v32i16:
9260 case MVT::v64i8: {
9261 MVT IntegerType =
9262 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9263 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9264 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9265 }
9266 default:
9267 llvm_unreachable("Not a supported integer vector type!");
9268 }
9269}
9270
9271/// \brief Try to lower as a blend of elements from two inputs followed by
9272/// a single-input permutation.
9273///
9274/// This matches the pattern where we can blend elements from two inputs and
9275/// then reduce the shuffle to a single-input permutation.
9276static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
9277 SDValue V1, SDValue V2,
9278 ArrayRef<int> Mask,
9279 SelectionDAG &DAG) {
9280 // We build up the blend mask while checking whether a blend is a viable way
9281 // to reduce the shuffle.
9282 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9283 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
9284
9285 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9286 if (Mask[i] < 0)
9287 continue;
9288
9289 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
9290
9291 if (BlendMask[Mask[i] % Size] < 0)
9292 BlendMask[Mask[i] % Size] = Mask[i];
9293 else if (BlendMask[Mask[i] % Size] != Mask[i])
9294 return SDValue(); // Can't blend in the needed input!
9295
9296 PermuteMask[i] = Mask[i] % Size;
9297 }
9298
9299 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9300 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
9301}
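// Illustrative sketch (editor's addition, not part of the original source):
// the blend/permute split above, redone with plain std::vector<int> so the
// index arithmetic is easy to follow; undef lanes are modelled as -1.
// For Mask = {1, 4, 3, 6} over two 4-element inputs this yields
// BlendMask = {4, 1, 6, 3} and PermuteMask = {1, 0, 3, 2}.
#include <cstdio>
#include <vector>

static bool splitBlendAndPermute(const std::vector<int> &Mask,
                                 std::vector<int> &BlendMask,
                                 std::vector<int> &PermuteMask) {
  int Size = (int)Mask.size();
  BlendMask.assign(Size, -1);
  PermuteMask.assign(Size, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int Slot = Mask[i] % Size;          // which output lane the blend fills
    if (BlendMask[Slot] < 0)
      BlendMask[Slot] = Mask[i];
    else if (BlendMask[Slot] != Mask[i])
      return false;                     // both inputs want the same slot
    PermuteMask[i] = Slot;              // single-input permute afterwards
  }
  return true;
}

int main() {
  std::vector<int> Blend, Permute;
  if (splitBlendAndPermute({1, 4, 3, 6}, Blend, Permute))
    for (int i = 0; i < 4; ++i)
      std::printf("blend[%d]=%d permute[%d]=%d\n", i, Blend[i], i, Permute[i]);
  return 0;
}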
9302
9303/// \brief Generic routine to decompose a shuffle and blend into independent
9304/// blends and permutes.
9305///
9306/// This matches the extremely common pattern for handling combined
9307/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
9308/// operations. It will try to pick the best arrangement of shuffles and
9309/// blends.
9310static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
9311 MVT VT, SDValue V1,
9312 SDValue V2,
9313 ArrayRef<int> Mask,
9314 SelectionDAG &DAG) {
9315 // Shuffle the input elements into the desired positions in V1 and V2 and
9316 // blend them together.
9317 SmallVector<int, 32> V1Mask(Mask.size(), -1);
9318 SmallVector<int, 32> V2Mask(Mask.size(), -1);
9319 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9320 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9321 if (Mask[i] >= 0 && Mask[i] < Size) {
9322 V1Mask[i] = Mask[i];
9323 BlendMask[i] = i;
9324 } else if (Mask[i] >= Size) {
9325 V2Mask[i] = Mask[i] - Size;
9326 BlendMask[i] = i + Size;
9327 }
9328
9329 // Try to lower with the simpler initial blend strategy unless one of the
9330 // input shuffles would be a no-op. We prefer to shuffle inputs as the
9331 // shuffle may be able to fold with a load or other benefit. However, when
9332 // we'll have to do 2x as many shuffles in order to achieve this, blending
9333 // first is a better strategy.
9334 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
9335 if (SDValue BlendPerm =
9336 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
9337 return BlendPerm;
9338
9339 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9340 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
9341 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9342}
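// Illustrative sketch (editor's addition): the shuffle-then-blend
// decomposition above on plain vectors, with undef lanes as -1. For
// Mask = {0, 5, 2, 7} (Size = 4) this produces V1Mask = {0,-1,2,-1},
// V2Mask = {-1,1,-1,3} and BlendMask = {0,5,2,7}: shuffle each input in
// place, then blend lane by lane.
#include <vector>

static void decomposeShuffleBlend(const std::vector<int> &Mask,
                                  std::vector<int> &V1Mask,
                                  std::vector<int> &V2Mask,
                                  std::vector<int> &BlendMask) {
  int Size = (int)Mask.size();
  V1Mask.assign(Size, -1);
  V2Mask.assign(Size, -1);
  BlendMask.assign(Size, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] >= 0 && Mask[i] < Size) {
      V1Mask[i] = Mask[i];        // element stays within the first input
      BlendMask[i] = i;           // blend picks lane i from shuffled V1
    } else if (Mask[i] >= Size) {
      V2Mask[i] = Mask[i] - Size; // rebase onto the second input
      BlendMask[i] = i + Size;    // blend picks lane i from shuffled V2
    }
  }
}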
9343
9344/// \brief Try to lower a vector shuffle as a rotation.
9345///
9346/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
9347static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
9348 ArrayRef<int> Mask) {
9349 int NumElts = Mask.size();
9350
9351 // We need to detect various ways of spelling a rotation:
9352 // [11, 12, 13, 14, 15, 0, 1, 2]
9353 // [-1, 12, 13, 14, -1, -1, 1, -1]
9354 // [-1, -1, -1, -1, -1, -1, 1, 2]
9355 // [ 3, 4, 5, 6, 7, 8, 9, 10]
9356 // [-1, 4, 5, 6, -1, -1, 9, -1]
9357 // [-1, 4, 5, 6, -1, -1, -1, -1]
9358 int Rotation = 0;
9359 SDValue Lo, Hi;
9360 for (int i = 0; i < NumElts; ++i) {
9361 int M = Mask[i];
9362 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
9363 "Unexpected mask index.");
9364 if (M < 0)
9365 continue;
9366
9367 // Determine where a rotated vector would have started.
9368 int StartIdx = i - (M % NumElts);
9369 if (StartIdx == 0)
9370 // The identity rotation isn't interesting, stop.
9371 return -1;
9372
9373 // If we found the tail of a vector, the rotation must be the missing
9374 // front (-StartIdx). If we found the head of a vector, the rotation is
9375 // the distance from its start to the end (NumElts - StartIdx).
9376 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
9377
9378 if (Rotation == 0)
9379 Rotation = CandidateRotation;
9380 else if (Rotation != CandidateRotation)
9381 // The rotations don't match, so we can't match this mask.
9382 return -1;
9383
9384 // Compute which value this mask is pointing at.
9385 SDValue MaskV = M < NumElts ? V1 : V2;
9386
9387 // Compute which of the two target values this index should be assigned
9388 // to. This reflects whether the high elements are remaining or the low
9389 // elements are remaining.
9390 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
9391
9392 // Either set up this value if we've not encountered it before, or check
9393 // that it remains consistent.
9394 if (!TargetV)
9395 TargetV = MaskV;
9396 else if (TargetV != MaskV)
9397 // This may be a rotation, but it pulls from the inputs in some
9398 // unsupported interleaving.
9399 return -1;
9400 }
9401
9402 // Check that we successfully analyzed the mask, and normalize the results.
9403 assert(Rotation != 0 && "Failed to locate a viable rotation!");
9404 assert((Lo || Hi) && "Failed to find a rotated input vector!");
9405 if (!Lo)
9406 Lo = Hi;
9407 else if (!Hi)
9408 Hi = Lo;
9409
9410 V1 = Lo;
9411 V2 = Hi;
9412
9413 return Rotation;
9414}
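// Illustrative sketch (editor's addition): just the rotation-amount detection
// from matchVectorShuffleAsRotate, without the Lo/Hi source tracking. For the
// mask {11,12,13,14,15,0,1,2} with NumElts = 8 it returns 3, i.e. the result
// is the concatenation of the two inputs rotated by three elements.
#include <vector>

static int matchRotationAmount(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  int Rotation = 0;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                               // undef lanes match anything
    int StartIdx = i - (M % NumElts);         // where a rotated input started
    if (StartIdx == 0)
      return -1;                              // identity rotation, not useful
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1;                              // inconsistent rotation amounts
  }
  return Rotation;                            // 0 if the mask was all-undef
}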
9415
9416/// \brief Try to lower a vector shuffle as a byte rotation.
9417///
9418/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
9419/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
9420/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
9421 /// try to generically lower a vector shuffle through such a pattern. It
9422/// does not check for the profitability of lowering either as PALIGNR or
9423/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
9424/// This matches shuffle vectors that look like:
9425///
9426/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
9427///
9428/// Essentially it concatenates V1 and V2, shifts right by some number of
9429/// elements, and takes the low elements as the result. Note that while this is
9430/// specified as a *right shift* because x86 is little-endian, it is a *left
9431/// rotate* of the vector lanes.
9432static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
9433 ArrayRef<int> Mask) {
9434 // Don't accept any shuffles with zero elements.
9435 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
9436 return -1;
9437
9438 // PALIGNR works on 128-bit lanes.
9439 SmallVector<int, 16> RepeatedMask;
9440 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
9441 return -1;
9442
9443 int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
9444 if (Rotation <= 0)
9445 return -1;
9446
9447 // PALIGNR rotates bytes, so we need to scale the
9448 // rotation based on how many bytes are in the vector lane.
9449 int NumElts = RepeatedMask.size();
9450 int Scale = 16 / NumElts;
9451 return Rotation * Scale;
9452}
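// Illustrative sketch (editor's addition): how an element rotation is scaled
// to the PALIGNR byte count returned above. A repeated 128-bit lane of
// 8 x i16 elements has 16 / 8 = 2 bytes per element, so an element rotation
// of 3 becomes a byte rotation of 6.
static int elementRotationToBytes(int ElementRotation, int NumEltsPerLane) {
  int BytesPerElt = 16 / NumEltsPerLane; // a 128-bit lane is 16 bytes
  return ElementRotation * BytesPerElt;  // e.g. 3 * 2 == 6 for v8i16
}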
9453
9454static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
9455 SDValue V1, SDValue V2,
9456 ArrayRef<int> Mask,
9457 const X86Subtarget &Subtarget,
9458 SelectionDAG &DAG) {
9459 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
9460
9461 SDValue Lo = V1, Hi = V2;
9462 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
9463 if (ByteRotation <= 0)
9464 return SDValue();
9465
9466 // Cast the inputs to i8 vector of correct length to match PALIGNR or
9467 // PSLLDQ/PSRLDQ.
9468 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9469 Lo = DAG.getBitcast(ByteVT, Lo);
9470 Hi = DAG.getBitcast(ByteVT, Hi);
9471
9472 // SSSE3 targets can use the palignr instruction.
9473 if (Subtarget.hasSSSE3()) {
9474 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
9475 "512-bit PALIGNR requires BWI instructions");
9476 return DAG.getBitcast(
9477 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
9478 DAG.getConstant(ByteRotation, DL, MVT::i8)));
9479 }
9480
9481 assert(VT.is128BitVector() &&
9482 "Rotate-based lowering only supports 128-bit lowering!");
9483 assert(Mask.size() <= 16 &&
9484 "Can shuffle at most 16 bytes in a 128-bit vector!");
9485 assert(ByteVT == MVT::v16i8 &&
9486 "SSE2 rotate lowering only needed for v16i8!");
9487
9488 // Default SSE2 implementation
9489 int LoByteShift = 16 - ByteRotation;
9490 int HiByteShift = ByteRotation;
9491
9492 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
9493 DAG.getConstant(LoByteShift, DL, MVT::i8));
9494 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
9495 DAG.getConstant(HiByteShift, DL, MVT::i8));
9496 return DAG.getBitcast(VT,
9497 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
9498}
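// Illustrative sketch (editor's addition): the pre-SSSE3 fallback above,
// simulated on plain byte arrays. PSLLDQ/PSRLDQ shift whole bytes towards
// higher/lower addresses (little-endian), and OR-ing the two shifted halves
// reproduces the PALIGNR(Lo, Hi, Rotation) result used on SSSE3 targets.
#include <array>
#include <cstdint>

static std::array<uint8_t, 16> rotateBytesSSE2(std::array<uint8_t, 16> Lo,
                                               std::array<uint8_t, 16> Hi,
                                               int Rotation) {
  std::array<uint8_t, 16> Result{};
  int LoByteShift = 16 - Rotation;  // VSHLDQ amount applied to Lo
  int HiByteShift = Rotation;       // VSRLDQ amount applied to Hi
  for (int i = 0; i < 16; ++i) {
    uint8_t L = (i >= LoByteShift) ? Lo[i - LoByteShift] : 0;
    uint8_t H = (i + HiByteShift < 16) ? Hi[i + HiByteShift] : 0;
    Result[i] = L | H;              // the two contributions never overlap
  }
  return Result;
}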
9499
9500/// \brief Try to lower a vector shuffle as a dword/qword rotation.
9501///
9502 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
9503 /// rotation of the concatenation of two vectors; this routine will
9504 /// try to generically lower a vector shuffle through such a pattern.
9505///
9506/// Essentially it concatenates V1 and V2, shifts right by some number of
9507/// elements, and takes the low elements as the result. Note that while this is
9508/// specified as a *right shift* because x86 is little-endian, it is a *left
9509/// rotate* of the vector lanes.
9510static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
9511 SDValue V1, SDValue V2,
9512 ArrayRef<int> Mask,
9513 const X86Subtarget &Subtarget,
9514 SelectionDAG &DAG) {
9515 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
9516 "Only 32-bit and 64-bit elements are supported!");
9517
9518 // 128/256-bit vectors are only supported with VLX.
9519 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
9520 && "VLX required for 128/256-bit vectors");
9521
9522 SDValue Lo = V1, Hi = V2;
9523 int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
9524 if (Rotation <= 0)
9525 return SDValue();
9526
9527 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
9528 DAG.getConstant(Rotation, DL, MVT::i8));
9529}
9530
9531/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
9532///
9533/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
9534/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
9535/// matches elements from one of the input vectors shuffled to the left or
9536/// right with zeroable elements 'shifted in'. It handles both the strictly
9537/// bit-wise element shifts and the byte shift across an entire 128-bit double
9538/// quad word lane.
9539///
9540/// PSHL : (little-endian) left bit shift.
9541/// [ zz, 0, zz, 2 ]
9542/// [ -1, 4, zz, -1 ]
9543/// PSRL : (little-endian) right bit shift.
9544/// [ 1, zz, 3, zz]
9545/// [ -1, -1, 7, zz]
9546/// PSLLDQ : (little-endian) left byte shift
9547/// [ zz, 0, 1, 2, 3, 4, 5, 6]
9548/// [ zz, zz, -1, -1, 2, 3, 4, -1]
9549/// [ zz, zz, zz, zz, zz, zz, -1, 1]
9550/// PSRLDQ : (little-endian) right byte shift
9551/// [ 5, 6, 7, zz, zz, zz, zz, zz]
9552/// [ -1, 5, 6, 7, zz, zz, zz, zz]
9553/// [ 1, 2, -1, -1, -1, -1, zz, zz]
9554static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
9555 unsigned ScalarSizeInBits,
9556 ArrayRef<int> Mask, int MaskOffset,
9557 const APInt &Zeroable,
9558 const X86Subtarget &Subtarget) {
9559 int Size = Mask.size();
9560 unsigned SizeInBits = Size * ScalarSizeInBits;
9561
9562 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
9563 for (int i = 0; i < Size; i += Scale)
9564 for (int j = 0; j < Shift; ++j)
9565 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
9566 return false;
9567
9568 return true;
9569 };
9570
9571 auto MatchShift = [&](int Shift, int Scale, bool Left) {
9572 for (int i = 0; i != Size; i += Scale) {
9573 unsigned Pos = Left ? i + Shift : i;
9574 unsigned Low = Left ? i : i + Shift;
9575 unsigned Len = Scale - Shift;
9576 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
9577 return -1;
9578 }
9579
9580 int ShiftEltBits = ScalarSizeInBits * Scale;
9581 bool ByteShift = ShiftEltBits > 64;
9582 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
9583 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
9584 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
9585
9586 // Normalize the scale for byte shifts to still produce an i64 element
9587 // type.
9588 Scale = ByteShift ? Scale / 2 : Scale;
9589
9590 // We need to round trip through the appropriate type for the shift.
9591 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
9592 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
9593 : MVT::getVectorVT(ShiftSVT, Size / Scale);
9594 return (int)ShiftAmt;
9595 };
9596
9597 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
9598 // keep doubling the size of the integer elements up to that. We can
9599 // then shift the elements of the integer vector by whole multiples of
9600 // their width within the elements of the larger integer vector. Test each
9601 // multiple to see if we can find a match with the moved element indices
9602 // and that the shifted in elements are all zeroable.
9603 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
9604 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
9605 for (int Shift = 1; Shift != Scale; ++Shift)
9606 for (bool Left : {true, false})
9607 if (CheckZeros(Shift, Scale, Left)) {
9608 int ShiftAmt = MatchShift(Shift, Scale, Left);
9609 if (0 < ShiftAmt)
9610 return ShiftAmt;
9611 }
9612
9613 // no match
9614 return -1;
9615}
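// Illustrative sketch (editor's addition): the CheckZeros/MatchShift test
// above for a single left-shift candidate (Scale, Shift), on plain vectors
// with a MaskOffset of 0. For a v4i32 mask {-1, 0, -1, 2} whose lanes 0 and 2
// are zeroable, Scale = 2 and Shift = 1 match, which corresponds to treating
// the vector as v2i64 and doing a VSHLI by 1 * 32 = 32 bits.
#include <vector>

static bool matchesLeftShift(const std::vector<int> &Mask,
                             const std::vector<bool> &Zeroable,
                             int Scale, int Shift) {
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; i += Scale) {
    // The low 'Shift' lanes of every Scale-wide group must be zeroable.
    for (int j = 0; j < Shift; ++j)
      if (!Zeroable[i + j])
        return false;
    // The remaining lanes must be the sequential elements shifted up.
    for (int j = 0; j < Scale - Shift; ++j) {
      int M = Mask[i + Shift + j];
      if (M >= 0 && M != i + j)
        return false;
    }
  }
  return true;
}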
9616
9617static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
9618 SDValue V2, ArrayRef<int> Mask,
9619 const APInt &Zeroable,
9620 const X86Subtarget &Subtarget,
9621 SelectionDAG &DAG) {
9622 int Size = Mask.size();
9623 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9624
9625 MVT ShiftVT;
9626 SDValue V = V1;
9627 unsigned Opcode;
9628
9629 // Try to match shuffle against V1 shift.
9630 int ShiftAmt = matchVectorShuffleAsShift(
9631 ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
9632
9633 // If V1 failed, try to match shuffle against V2 shift.
9634 if (ShiftAmt < 0) {
9635 ShiftAmt =
9636 matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
9637 Mask, Size, Zeroable, Subtarget);
9638 V = V2;
9639 }
9640
9641 if (ShiftAmt < 0)
9642 return SDValue();
9643
9644 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
9645 "Illegal integer vector type");
9646 V = DAG.getBitcast(ShiftVT, V);
9647 V = DAG.getNode(Opcode, DL, ShiftVT, V,
9648 DAG.getConstant(ShiftAmt, DL, MVT::i8));
9649 return DAG.getBitcast(VT, V);
9650}
9651
9652// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
9653// Remainder of lower half result is zero and upper half is all undef.
9654static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
9655 ArrayRef<int> Mask, uint64_t &BitLen,
9656 uint64_t &BitIdx, const APInt &Zeroable) {
9657 int Size = Mask.size();
9658 int HalfSize = Size / 2;
9659 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9660 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
9661
9662 // Upper half must be undefined.
9663 if (!isUndefInRange(Mask, HalfSize, HalfSize))
9664 return false;
9665
9666 // Determine the extraction length from the part of the
9667 // lower half that isn't zeroable.
9668 int Len = HalfSize;
9669 for (; Len > 0; --Len)
9670 if (!Zeroable[Len - 1])
9671 break;
9672 assert(Len > 0 && "Zeroable shuffle mask");
9673
9674 // Attempt to match first Len sequential elements from the lower half.
9675 SDValue Src;
9676 int Idx = -1;
9677 for (int i = 0; i != Len; ++i) {
9678 int M = Mask[i];
9679 if (M == SM_SentinelUndef)
9680 continue;
9681 SDValue &V = (M < Size ? V1 : V2);
9682 M = M % Size;
9683
9684 // The extracted elements must start at a valid index and all mask
9685 // elements must be in the lower half.
9686 if (i > M || M >= HalfSize)
9687 return false;
9688
9689 if (Idx < 0 || (Src == V && Idx == (M - i))) {
9690 Src = V;
9691 Idx = M - i;
9692 continue;
9693 }
9694 return false;
9695 }
9696
9697 if (!Src || Idx < 0)
9698 return false;
9699
9700 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
9701 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9702 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9703 V1 = Src;
9704 return true;
9705}
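// Illustrative sketch (editor's addition): the BitLen/BitIdx arithmetic at
// the end of the EXTRQ match. For a v8i16 shuffle {2, 3, zz, zz, undef x4}
// the match finds Len = 2 contiguous elements starting at Idx = 2, so the
// immediate fields are BitLen = 2 * 16 = 32 and BitIdx = 2 * 16 = 32, each
// masked to 6 bits as the EXTRQ immediate expects.
static void extrqImmediates(int Len, int Idx, int ScalarSizeInBits,
                            unsigned &BitLen, unsigned &BitIdx) {
  BitLen = (unsigned)(Len * ScalarSizeInBits) & 0x3f; // field width in bits
  BitIdx = (unsigned)(Idx * ScalarSizeInBits) & 0x3f; // field offset in bits
}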
9706
9707// INSERTQ: Extract lowest Len elements from lower half of second source and
9708// insert over first source, starting at Idx.
9709// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
9710static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
9711 ArrayRef<int> Mask, uint64_t &BitLen,
9712 uint64_t &BitIdx) {
9713 int Size = Mask.size();
9714 int HalfSize = Size / 2;
9715 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size")(static_cast <bool> (Size == (int)VT.getVectorNumElements
() && "Unexpected mask size") ? void (0) : __assert_fail
("Size == (int)VT.getVectorNumElements() && \"Unexpected mask size\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 9715, __extension__ __PRETTY_FUNCTION__))
;
9716
9717 // Upper half must be undefined.
9718 if (!isUndefInRange(Mask, HalfSize, HalfSize))
9719 return false;
9720
9721 for (int Idx = 0; Idx != HalfSize; ++Idx) {
9722 SDValue Base;
9723
9724 // Attempt to match first source from mask before insertion point.
9725 if (isUndefInRange(Mask, 0, Idx)) {
9726 /* EMPTY */
9727 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
9728 Base = V1;
9729 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
9730 Base = V2;
9731 } else {
9732 continue;
9733 }
9734
9735 // Extend the extraction length looking to match both the insertion of
9736 // the second source and the remaining elements of the first.
9737 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
9738 SDValue Insert;
9739 int Len = Hi - Idx;
9740
9741 // Match insertion.
9742 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
9743 Insert = V1;
9744 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
9745 Insert = V2;
9746 } else {
9747 continue;
9748 }
9749
9750 // Match the remaining elements of the lower half.
9751 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
9752 /* EMPTY */
9753 } else if ((!Base || (Base == V1)) &&
9754 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
9755 Base = V1;
9756 } else if ((!Base || (Base == V2)) &&
9757 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
9758 Size + Hi)) {
9759 Base = V2;
9760 } else {
9761 continue;
9762 }
9763
9764 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9765 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9766 V1 = Base;
9767 V2 = Insert;
9768 return true;
9769 }
9770 }
9771
9772 return false;
9773}
9774
9775/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
9776static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
9777 SDValue V2, ArrayRef<int> Mask,
9778 const APInt &Zeroable,
9779 SelectionDAG &DAG) {
9780 uint64_t BitLen, BitIdx;
9781 if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
9782 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
9783 DAG.getConstant(BitLen, DL, MVT::i8),
9784 DAG.getConstant(BitIdx, DL, MVT::i8));
9785
9786 if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
9787 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
9788 V2 ? V2 : DAG.getUNDEF(VT),
9789 DAG.getConstant(BitLen, DL, MVT::i8),
9790 DAG.getConstant(BitIdx, DL, MVT::i8));
9791
9792 return SDValue();
9793}
9794
9795/// \brief Lower a vector shuffle as a zero or any extension.
9796///
9797/// Given a specific number of elements, element bit width, and extension
9798/// stride, produce either a zero or any extension based on the available
9799/// features of the subtarget. The extended elements are consecutive and
9800 /// begin at an (optionally offset) element index in the input; to
9801 /// avoid excess shuffling the offset must either be in the bottom lane
9802/// or at the start of a higher lane. All extended elements must be from
9803/// the same lane.
9804static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9805 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
9806 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9807 assert(Scale > 1 && "Need a scale to extend.");
9808 int EltBits = VT.getScalarSizeInBits();
9809 int NumElements = VT.getVectorNumElements();
9810 int NumEltsPerLane = 128 / EltBits;
9811 int OffsetLane = Offset / NumEltsPerLane;
9812 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
9813 "Only 8, 16, and 32 bit elements can be extended.");
9814 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
9815 assert(0 <= Offset && "Extension offset must be positive.");
9816 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
9817 "Extension offset must be in the first lane or start an upper lane.");
9818
9819 // Check that an index is in same lane as the base offset.
9820 auto SafeOffset = [&](int Idx) {
9821 return OffsetLane == (Idx / NumEltsPerLane);
9822 };
9823
9824 // Shift along an input so that the offset base moves to the first element.
9825 auto ShuffleOffset = [&](SDValue V) {
9826 if (!Offset)
9827 return V;
9828
9829 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9830 for (int i = 0; i * Scale < NumElements; ++i) {
9831 int SrcIdx = i + Offset;
9832 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
9833 }
9834 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
9835 };
9836
9837 // Found a valid zext mask! Try various lowering strategies based on the
9838 // input type and available ISA extensions.
9839 if (Subtarget.hasSSE41()) {
9840 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
9841 // PUNPCK will catch this in a later shuffle match.
9842 if (Offset && Scale == 2 && VT.is128BitVector())
9843 return SDValue();
9844 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
9845 NumElements / Scale);
9846 InputV = ShuffleOffset(InputV);
9847 InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
9848 return DAG.getBitcast(VT, InputV);
9849 }
9850
9851 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
9852
9853 // For any extends we can cheat for larger element sizes and use shuffle
9854 // instructions that can fold with a load and/or copy.
9855 if (AnyExt && EltBits == 32) {
9856 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
9857 -1};
9858 return DAG.getBitcast(
9859 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9860 DAG.getBitcast(MVT::v4i32, InputV),
9861 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9862 }
9863 if (AnyExt && EltBits == 16 && Scale > 2) {
9864 int PSHUFDMask[4] = {Offset / 2, -1,
9865 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
9866 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9867 DAG.getBitcast(MVT::v4i32, InputV),
9868 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9869 int PSHUFWMask[4] = {1, -1, -1, -1};
9870 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
9871 return DAG.getBitcast(
9872 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
9873 DAG.getBitcast(MVT::v8i16, InputV),
9874 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
9875 }
9876
9877 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
9878 // to 64-bits.
9879 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
9880 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
9881 assert(VT.is128BitVector() && "Unexpected vector width!");
9882
9883 int LoIdx = Offset * EltBits;
9884 SDValue Lo = DAG.getBitcast(
9885 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9886 DAG.getConstant(EltBits, DL, MVT::i8),
9887 DAG.getConstant(LoIdx, DL, MVT::i8)));
9888
9889 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
9890 !SafeOffset(Offset + 1))
9891 return DAG.getBitcast(VT, Lo);
9892
9893 int HiIdx = (Offset + 1) * EltBits;
9894 SDValue Hi = DAG.getBitcast(
9895 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9896 DAG.getConstant(EltBits, DL, MVT::i8),
9897 DAG.getConstant(HiIdx, DL, MVT::i8)));
9898 return DAG.getBitcast(VT,
9899 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
9900 }
9901
9902 // If this would require more than 2 unpack instructions to expand, use
9903 // pshufb when available. We can only use more than 2 unpack instructions
9904 // when zero extending i8 elements which also makes it easier to use pshufb.
9905 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
9906 assert(NumElements == 16 && "Unexpected byte vector width!");
9907 SDValue PSHUFBMask[16];
9908 for (int i = 0; i < 16; ++i) {
9909 int Idx = Offset + (i / Scale);
9910 PSHUFBMask[i] = DAG.getConstant(
9911 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
9912 }
9913 InputV = DAG.getBitcast(MVT::v16i8, InputV);
9914 return DAG.getBitcast(
9915 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
9916 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
9917 }
9918
9919 // If we are extending from an offset, ensure we start on a boundary that
9920 // we can unpack from.
9921 int AlignToUnpack = Offset % (NumElements / Scale);
9922 if (AlignToUnpack) {
9923 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9924 for (int i = AlignToUnpack; i < NumElements; ++i)
9925 ShMask[i - AlignToUnpack] = i;
9926 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
9927 Offset -= AlignToUnpack;
9928 }
9929
9930 // Otherwise emit a sequence of unpacks.
9931 do {
9932 unsigned UnpackLoHi = X86ISD::UNPCKL;
9933 if (Offset >= (NumElements / 2)) {
9934 UnpackLoHi = X86ISD::UNPCKH;
9935 Offset -= (NumElements / 2);
9936 }
9937
9938 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
9939 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
9940 : getZeroVector(InputVT, Subtarget, DAG, DL);
9941 InputV = DAG.getBitcast(InputVT, InputV);
9942 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
9943 Scale /= 2;
9944 EltBits *= 2;
9945 NumElements /= 2;
9946 } while (Scale > 1);
9947 return DAG.getBitcast(VT, InputV);
9948}
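// Illustrative sketch (editor's addition): the final unpack loop above,
// modelled on a plain byte array for the Offset == 0, zero-extend case. Each
// round interleaves the low half of the current elements with zero elements
// (UNPCKL against a zero vector), doubling the element width and halving the
// element count, so zero-extending i8 to i32 (Scale = 4) takes two rounds:
// i8 -> i16 -> i32.
#include <cstdint>
#include <vector>

static std::vector<uint8_t> zextByUnpacks(std::vector<uint8_t> Bytes,
                                          int Scale) {
  int EltBytes = 1;
  while (Scale > 1) {
    std::vector<uint8_t> Widened(Bytes.size(), 0);
    int NumElts = (int)Bytes.size() / EltBytes;
    // Interleave the low half of the elements with zero elements.
    for (int i = 0; i < NumElts / 2; ++i)
      for (int b = 0; b < EltBytes; ++b)
        Widened[(2 * i) * EltBytes + b] = Bytes[i * EltBytes + b];
    Bytes = Widened;
    EltBytes *= 2;  // element width doubles each round
    Scale /= 2;     // remaining extension factor halves
  }
  return Bytes;
}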
9949
9950/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
9951///
9952/// This routine will try to do everything in its power to cleverly lower
9953/// a shuffle which happens to match the pattern of a zero extend. It doesn't
9954/// check for the profitability of this lowering, it tries to aggressively
9955/// match this pattern. It will use all of the micro-architectural details it
9956/// can to emit an efficient lowering. It handles both blends with all-zero
9957/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
9958/// masking out later).
9959///
9960/// The reason we have dedicated lowering for zext-style shuffles is that they
9961/// are both incredibly common and often quite performance sensitive.
9962static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
9963 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9964 const APInt &Zeroable, const X86Subtarget &Subtarget,
9965 SelectionDAG &DAG) {
9966 int Bits = VT.getSizeInBits();
9967 int NumLanes = Bits / 128;
9968 int NumElements = VT.getVectorNumElements();
9969 int NumEltsPerLane = NumElements / NumLanes;
9970 assert(VT.getScalarSizeInBits() <= 32 &&
9971 "Exceeds 32-bit integer zero extension limit");
9972 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
9973
9974 // Define a helper function to check a particular ext-scale and lower to it if
9975 // valid.
9976 auto Lower = [&](int Scale) -> SDValue {
9977 SDValue InputV;
9978 bool AnyExt = true;
9979 int Offset = 0;
9980 int Matches = 0;
9981 for (int i = 0; i < NumElements; ++i) {
9982 int M = Mask[i];
9983 if (M < 0)
9984 continue; // Valid anywhere but doesn't tell us anything.
9985 if (i % Scale != 0) {
9986 // Each of the extended elements need to be zeroable.
9987 if (!Zeroable[i])
9988 return SDValue();
9989
9990 // We no longer are in the anyext case.
9991 AnyExt = false;
9992 continue;
9993 }
9994
9995 // Each of the base elements needs to be consecutive indices into the
9996 // same input vector.
9997 SDValue V = M < NumElements ? V1 : V2;
9998 M = M % NumElements;
9999 if (!InputV) {
10000 InputV = V;
10001 Offset = M - (i / Scale);
10002 } else if (InputV != V)
10003 return SDValue(); // Flip-flopping inputs.
10004
10005 // Offset must start in the lowest 128-bit lane or at the start of an
10006 // upper lane.
10007 // FIXME: Is it ever worth allowing a negative base offset?
10008 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
10009 (Offset % NumEltsPerLane) == 0))
10010 return SDValue();
10011
10012 // If we are offsetting, all referenced entries must come from the same
10013 // lane.
10014 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
10015 return SDValue();
10016
10017 if ((M % NumElements) != (Offset + (i / Scale)))
10018 return SDValue(); // Non-consecutive strided elements.
10019 Matches++;
10020 }
10021
10022 // If we fail to find an input, we have a zero-shuffle which should always
10023 // have already been handled.
10024 // FIXME: Maybe handle this here in case during blending we end up with one?
10025 if (!InputV)
10026 return SDValue();
10027
10028 // If we are offsetting, don't extend if we only match a single input, we
10029 // can always do better by using a basic PSHUF or PUNPCK.
10030 if (Offset != 0 && Matches < 2)
10031 return SDValue();
10032
10033 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
10034 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
10035 };
10036
10037 // The widest scale possible for extending is to a 64-bit integer.
10038 assert(Bits % 64 == 0 &&
10039 "The number of bits in a vector must be divisible by 64 on x86!");
10040 int NumExtElements = Bits / 64;
10041
10042 // Each iteration, try extending the elements half as much, but into twice as
10043 // many elements.
10044 for (; NumExtElements < NumElements; NumExtElements *= 2) {
10045 assert(NumElements % NumExtElements == 0 &&
10046 "The input vector size must be divisible by the extended size.");
10047 if (SDValue V = Lower(NumElements / NumExtElements))
10048 return V;
10049 }
10050
10051 // General extends failed, but 128-bit vectors may be able to use MOVQ.
10052 if (Bits != 128)
10053 return SDValue();
10054
10055 // Returns one of the source operands if the shuffle can be reduced to a
10056 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
10057 auto CanZExtLowHalf = [&]() {
10058 for (int i = NumElements / 2; i != NumElements; ++i)
10059 if (!Zeroable[i])
10060 return SDValue();
10061 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
10062 return V1;
10063 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
10064 return V2;
10065 return SDValue();
10066 };
10067
10068 if (SDValue V = CanZExtLowHalf()) {
10069 V = DAG.getBitcast(MVT::v2i64, V);
10070 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
10071 return DAG.getBitcast(VT, V);
10072 }
10073
10074 // No viable ext lowering found.
10075 return SDValue();
10076}
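// Illustrative sketch (editor's addition): the per-scale validity test inside
// the Lower lambda above, restricted to a single input and a zero base
// offset. A v8i16 mask {0,-1,1,-1,2,-1,3,-1} whose odd lanes are undef or
// zeroable passes for Scale = 2, i.e. the shuffle is a zero-extension of the
// low four elements to 32 bits.
#include <vector>

static bool isZExtMask(const std::vector<int> &Mask,
                       const std::vector<bool> &Zeroable, int Scale) {
  int NumElements = (int)Mask.size();
  for (int i = 0; i < NumElements; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                 // undef is fine anywhere
    if (i % Scale != 0) {
      if (!Zeroable[i])
        return false;           // stretched-in lanes must be zero
      continue;
    }
    if (M != i / Scale)
      return false;             // base lanes must be consecutive elements
  }
  return true;
}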
10077
10078/// \brief Try to get a scalar value for a specific element of a vector.
10079///
10080/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
10081static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
10082 SelectionDAG &DAG) {
10083 MVT VT = V.getSimpleValueType();
10084 MVT EltVT = VT.getVectorElementType();
10085 V = peekThroughBitcasts(V);
10086
10087 // If the bitcasts shift the element size, we can't extract an equivalent
10088 // element from it.
10089 MVT NewVT = V.getSimpleValueType();
10090 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
10091 return SDValue();
10092
10093 if (V.getOpcode() == ISD::BUILD_VECTOR ||
10094 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
10095 // Ensure the scalar operand is the same size as the destination.
10096 // FIXME: Add support for scalar truncation where possible.
10097 SDValue S = V.getOperand(Idx);
10098 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
10099 return DAG.getBitcast(EltVT, S);
10100 }
10101
10102 return SDValue();
10103}
10104
10105/// \brief Helper to test for a load that can be folded with x86 shuffles.
10106///
10107/// This is particularly important because the set of instructions varies
10108/// significantly based on whether the operand is a load or not.
10109static bool isShuffleFoldableLoad(SDValue V) {
10110 V = peekThroughBitcasts(V);
10111 return ISD::isNON_EXTLoad(V.getNode());
10112}
10113
10114/// \brief Try to lower insertion of a single element into a zero vector.
10115///
10116 /// This is a common pattern for which we have especially efficient patterns to lower
10117/// across all subtarget feature sets.
10118static SDValue lowerVectorShuffleAsElementInsertion(
10119 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10120 const APInt &Zeroable, const X86Subtarget &Subtarget,
10121 SelectionDAG &DAG) {
10122 MVT ExtVT = VT;
10123 MVT EltVT = VT.getVectorElementType();
10124
10125 int V2Index =
10126 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
10127 Mask.begin();
10128 bool IsV1Zeroable = true;
10129 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10130 if (i != V2Index && !Zeroable[i]) {
10131 IsV1Zeroable = false;
10132 break;
10133 }
10134
10135 // Check for a single input from a SCALAR_TO_VECTOR node.
10136 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
10137 // all the smarts here sunk into that routine. However, the current
10138 // lowering of BUILD_VECTOR makes that nearly impossible until the old
10139 // vector shuffle lowering is dead.
10140 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
10141 DAG);
10142 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
10143 // We need to zext the scalar if it is smaller than an i32.
10144 V2S = DAG.getBitcast(EltVT, V2S);
10145 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
10146 // Using zext to expand a narrow element won't work for non-zero
10147 // insertions.
10148 if (!IsV1Zeroable)
10149 return SDValue();
10150
10151 // Zero-extend directly to i32.
10152 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
10153 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
10154 }
10155 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
10156 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
10157 EltVT == MVT::i16) {
10158 // Either not inserting from the low element of the input or the input
10159 // element size is too small to use VZEXT_MOVL to clear the high bits.
10160 return SDValue();
10161 }
10162
10163 if (!IsV1Zeroable) {
10164 // If V1 can't be treated as a zero vector we have fewer options to lower
10165 // this. We can't support integer vectors or non-zero targets cheaply, and
10166 // the V1 elements can't be permuted in any way.
10167 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
10168 if (!VT.isFloatingPoint() || V2Index != 0)
10169 return SDValue();
10170 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
10171 V1Mask[V2Index] = -1;
10172 if (!isNoopShuffleMask(V1Mask))
10173 return SDValue();
10174 if (!VT.is128BitVector())
10175 return SDValue();
10176
10177 // Otherwise, use MOVSD or MOVSS.
10178 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
10179 "Only two types of floating point element types to handle!");
10180 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
10181 ExtVT, V1, V2);
10182 }
10183
10184 // This lowering only works for the low element with floating point vectors.
10185 if (VT.isFloatingPoint() && V2Index != 0)
10186 return SDValue();
10187
10188 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
10189 if (ExtVT != VT)
10190 V2 = DAG.getBitcast(VT, V2);
10191
10192 if (V2Index != 0) {
10193 // If we have 4 or fewer lanes we can cheaply shuffle the element into
10194 // the desired position. Otherwise it is more efficient to do a vector
10195 // shift left. We know that we can do a vector shift left because all
10196 // the inputs are zero.
10197 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
10198 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
10199 V2Shuffle[V2Index] = 0;
10200 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
10201 } else {
10202 V2 = DAG.getBitcast(MVT::v16i8, V2);
10203 V2 = DAG.getNode(
10204 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
10205 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
10206 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
10207 DAG.getDataLayout(), VT)));
10208 V2 = DAG.getBitcast(VT, V2);
10209 }
10210 }
10211 return V2;
10212}
10213
10214/// Try to lower broadcast of a single - truncated - integer element,
10215/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
10216///
10217/// This assumes we have AVX2.
10218static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
10219 SDValue V0, int BroadcastIdx,
10220 const X86Subtarget &Subtarget,
10221 SelectionDAG &DAG) {
10222 assert(Subtarget.hasAVX2() &&
10223 "We can only lower integer broadcasts with AVX2!");
10224
10225 EVT EltVT = VT.getVectorElementType();
10226 EVT V0VT = V0.getValueType();
10227
10228 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
10229 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
10230
10231 EVT V0EltVT = V0VT.getVectorElementType();
10232 if (!V0EltVT.isInteger())
10233 return SDValue();
10234
10235 const unsigned EltSize = EltVT.getSizeInBits();
10236 const unsigned V0EltSize = V0EltVT.getSizeInBits();
10237
10238 // This is only a truncation if the original element type is larger.
10239 if (V0EltSize <= EltSize)
10240 return SDValue();
10241
10242 assert(((V0EltSize % EltSize) == 0) &&
10243 "Scalar type sizes must all be powers of 2 on x86!");
10244
10245 const unsigned V0Opc = V0.getOpcode();
10246 const unsigned Scale = V0EltSize / EltSize;
10247 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
10248
10249 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
10250 V0Opc != ISD::BUILD_VECTOR)
10251 return SDValue();
10252
10253 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
10254
10255 // If we're extracting non-least-significant bits, shift so we can truncate.
10256 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
10257 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
10258 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
10259 if (const int OffsetIdx = BroadcastIdx % Scale)
10260 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
10261 DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
10262
10263 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
10264 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
10265}
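// ---- Editor's sketch (not part of the annotated file) ---------------------
// A minimal standalone restatement of the index arithmetic above: the
// shuffle's BroadcastIdx is mapped to an element of the wider source vector
// plus a bit offset that becomes the SRL amount. Names here are hypothetical.
struct TruncBroadcastPlan {
  unsigned SrcElt;   // element of the wide source vector to read
  unsigned ShiftAmt; // bits to shift right before truncating
};
static TruncBroadcastPlan planTruncBroadcast(unsigned BroadcastIdx,
                                             unsigned EltSizeBits,
                                             unsigned SrcEltSizeBits) {
  // Each wide source element covers Scale narrow elements.
  unsigned Scale = SrcEltSizeBits / EltSizeBits;
  return {BroadcastIdx / Scale, (BroadcastIdx % Scale) * EltSizeBits};
}
// Example: broadcasting v8i16 element 5 from a v2i64 source reads i64 element
// 1 and shifts right by 16 bits before truncating:
// planTruncBroadcast(5, 16, 64) == {1, 16}.
// ---- end of editor's sketch ------------------------------------------------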
10266
10267/// \brief Try to lower broadcast of a single element.
10268///
10269/// For convenience, this code also bundles all of the subtarget feature set
10270/// filtering. While a little annoying to re-dispatch on type here, there isn't
10271/// a convenient way to factor it out.
10272static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
10273 SDValue V1, SDValue V2,
10274 ArrayRef<int> Mask,
10275 const X86Subtarget &Subtarget,
10276 SelectionDAG &DAG) {
10277 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
10278 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
10279 (Subtarget.hasAVX2() && VT.isInteger())))
10280 return SDValue();
10281
10282 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
10283 // we can only broadcast from a register with AVX2.
10284 unsigned NumElts = Mask.size();
10285 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
10286 ? X86ISD::MOVDDUP
10287 : X86ISD::VBROADCAST;
10288 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
10289
10290 // Check that the mask is a broadcast.
10291 int BroadcastIdx = -1;
10292 for (int i = 0; i != (int)NumElts; ++i) {
10293 SmallVector<int, 8> BroadcastMask(NumElts, i);
10294 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
10295 BroadcastIdx = i;
10296 break;
10297 }
10298 }
10299
10300 if (BroadcastIdx < 0)
10301 return SDValue();
10302 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
10303 "a sorted mask where the broadcast "
10304 "comes from V1.");
10305
10306 // Go up the chain of (vector) values to find a scalar load that we can
10307 // combine with the broadcast.
10308 SDValue V = V1;
10309 for (;;) {
10310 switch (V.getOpcode()) {
10311 case ISD::BITCAST: {
10312 SDValue VSrc = V.getOperand(0);
10313 MVT SrcVT = VSrc.getSimpleValueType();
10314 if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
10315 break;
10316 V = VSrc;
10317 continue;
10318 }
10319 case ISD::CONCAT_VECTORS: {
10320 int OperandSize = Mask.size() / V.getNumOperands();
10321 V = V.getOperand(BroadcastIdx / OperandSize);
10322 BroadcastIdx %= OperandSize;
10323 continue;
10324 }
10325 case ISD::INSERT_SUBVECTOR: {
10326 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
10327 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
10328 if (!ConstantIdx)
10329 break;
10330
10331 int BeginIdx = (int)ConstantIdx->getZExtValue();
10332 int EndIdx =
10333 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
10334 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
10335 BroadcastIdx -= BeginIdx;
10336 V = VInner;
10337 } else {
10338 V = VOuter;
10339 }
10340 continue;
10341 }
10342 }
10343 break;
10344 }
10345
10346 // Check if this is a broadcast of a scalar. We special case lowering
10347 // for scalars so that we can more effectively fold with loads.
10348 // First, look through bitcast: if the original value has a larger element
10349 // type than the shuffle, the broadcast element is in essence truncated.
10350 // Make that explicit to ease folding.
10351 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
10352 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
10353 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
10354 return TruncBroadcast;
10355
10356 MVT BroadcastVT = VT;
10357
10358 // Peek through any bitcast (only useful for loads).
10359 SDValue BC = peekThroughBitcasts(V);
10360
10361 // Also check the simpler case, where we can directly reuse the scalar.
10362 if (V.getOpcode() == ISD::BUILD_VECTOR ||
10363 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
10364 V = V.getOperand(BroadcastIdx);
10365
10366 // If we can't broadcast from a register, check that the input is a load.
10367 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
10368 return SDValue();
10369 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
10370 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10371 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
10372 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
10373 Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
10374 ? X86ISD::MOVDDUP
10375 : Opcode;
10376 }
10377
10378 // If we are broadcasting a load that is only used by the shuffle
10379 // then we can reduce the vector load to the broadcasted scalar load.
10380 LoadSDNode *Ld = cast<LoadSDNode>(BC);
10381 SDValue BaseAddr = Ld->getOperand(1);
10382 EVT SVT = BroadcastVT.getScalarType();
10383 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
10384 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
10385 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
10386 DAG.getMachineFunction().getMachineMemOperand(
10387 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
10388 DAG.makeEquivalentMemoryOrdering(Ld, V);
10389 } else if (!BroadcastFromReg) {
10390 // We can't broadcast from a vector register.
10391 return SDValue();
10392 } else if (BroadcastIdx != 0) {
10393 // We can only broadcast from the zero-element of a vector register,
10394 // but it can be advantageous to broadcast from the zero-element of a
10395 // subvector.
10396 if (!VT.is256BitVector() && !VT.is512BitVector())
10397 return SDValue();
10398
10399 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
10400 if (VT == MVT::v4f64 || VT == MVT::v4i64)
10401 return SDValue();
10402
10403 // Only broadcast the zero-element of a 128-bit subvector.
10404 unsigned EltSize = VT.getScalarSizeInBits();
10405 if (((BroadcastIdx * EltSize) % 128) != 0)
10406 return SDValue();
10407
10408 // The shuffle input might have been a bitcast we looked through; look at
10409 // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
10410 // later bitcast it to BroadcastVT.
10411 MVT SrcVT = V.getSimpleValueType();
10412 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10413 "Unexpected vector element size");
10414 assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
10415 "Unexpected vector size");
10416
10417 MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
10418 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
10419 DAG.getIntPtrConstant(BroadcastIdx, DL));
10420 }
10421
10422 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
10423 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
10424 DAG.getBitcast(MVT::f64, V));
10425
10426 // Bitcast back to the same scalar type as BroadcastVT.
10427 MVT SrcVT = V.getSimpleValueType();
10428 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
10429 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10430 "Unexpected vector element size");
10431 if (SrcVT.isVector()) {
10432 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10433 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
10434 } else {
10435 SrcVT = BroadcastVT.getScalarType();
10436 }
10437 V = DAG.getBitcast(SrcVT, V);
10438 }
10439
10440 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10441 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
10442 V = DAG.getBitcast(MVT::f64, V);
10443 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
10444 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
10445 }
10446
10447 // We only support broadcasting from 128-bit vectors to minimize the
10448 // number of patterns we need to deal with in isel. So extract down to
10449 // 128-bits.
10450 if (SrcVT.getSizeInBits() > 128)
10451 V = extract128BitVector(V, 0, DAG, DL);
10452
10453 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
10454}
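// ---- Editor's sketch (not part of the annotated file) ---------------------
// The loop above asks, for each candidate index i, whether the incoming mask
// is equivalent to a mask repeating i in every lane; undef lanes (-1) match
// anything. A minimal standalone version of that test, with a hypothetical
// name and without the operand folding that isShuffleEquivalent also does:
#include <vector>
static int findBroadcastIndex(const std::vector<int> &Mask) {
  int Idx = -1;
  for (int M : Mask) {
    if (M < 0)
      continue;    // undef lane, compatible with any broadcast index
    if (Idx < 0)
      Idx = M;     // first defined lane fixes the candidate index
    else if (M != Idx)
      return -1;   // two different sources: not a broadcast
  }
  return Idx;      // also -1 if every lane was undef
}
// findBroadcastIndex({3, -1, 3, 3}) == 3; findBroadcastIndex({0, 1, 0, 1}) == -1.
// ---- end of editor's sketch ------------------------------------------------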
10455
10456// Check for whether we can use INSERTPS to perform the shuffle. We only use
10457// INSERTPS when the V1 elements are already in the correct locations
10458// because otherwise we can just always use two SHUFPS instructions which
10459// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
10460// perform INSERTPS if a single V1 element is out of place and all V2
10461// elements are zeroable.
10462static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
10463 unsigned &InsertPSMask,
10464 const APInt &Zeroable,
10465 ArrayRef<int> Mask,
10466 SelectionDAG &DAG) {
10467 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
10468 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
10469 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10470
10471 // Attempt to match INSERTPS with one element from VA or VB being
10472 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
10473 // are updated.
10474 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
10475 ArrayRef<int> CandidateMask) {
10476 unsigned ZMask = 0;
10477 int VADstIndex = -1;
10478 int VBDstIndex = -1;
10479 bool VAUsedInPlace = false;
10480
10481 for (int i = 0; i < 4; ++i) {
10482 // Synthesize a zero mask from the zeroable elements (includes undefs).
10483 if (Zeroable[i]) {
10484 ZMask |= 1 << i;
10485 continue;
10486 }
10487
10488 // Flag if we use any VA inputs in place.
10489 if (i == CandidateMask[i]) {
10490 VAUsedInPlace = true;
10491 continue;
10492 }
10493
10494 // We can only insert a single non-zeroable element.
10495 if (VADstIndex >= 0 || VBDstIndex >= 0)
10496 return false;
10497
10498 if (CandidateMask[i] < 4) {
10499 // VA input out of place for insertion.
10500 VADstIndex = i;
10501 } else {
10502 // VB input for insertion.
10503 VBDstIndex = i;
10504 }
10505 }
10506
10507 // Don't bother if we have no (non-zeroable) element for insertion.
10508 if (VADstIndex < 0 && VBDstIndex < 0)
10509 return false;
10510
10511 // Determine element insertion src/dst indices. The src index is from the
10512 // start of the inserted vector, not the start of the concatenated vector.
10513 unsigned VBSrcIndex = 0;
10514 if (VADstIndex >= 0) {
10515 // If we have a VA input out of place, we use VA as the V2 element
10516 // insertion and don't use the original V2 at all.
10517 VBSrcIndex = CandidateMask[VADstIndex];
10518 VBDstIndex = VADstIndex;
10519 VB = VA;
10520 } else {
10521 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
10522 }
10523
10524 // If no V1 inputs are used in place, then the result is created only from
10525 // the zero mask and the V2 insertion - so remove V1 dependency.
10526 if (!VAUsedInPlace)
10527 VA = DAG.getUNDEF(MVT::v4f32);
10528
10529 // Update V1, V2 and InsertPSMask accordingly.
10530 V1 = VA;
10531 V2 = VB;
10532
10533 // Insert the V2 element into the desired position.
10534 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
10535 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
10536 return true;
10537 };
10538
10539 if (matchAsInsertPS(V1, V2, Mask))
10540 return true;
10541
10542 // Commute and try again.
10543 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
10544 ShuffleVectorSDNode::commuteMask(CommutedMask);
10545 if (matchAsInsertPS(V2, V1, CommutedMask))
10546 return true;
10547
10548 return false;
10549}
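// ---- Editor's sketch (not part of the annotated file) ---------------------
// The 8-bit INSERTPS immediate assembled in the lambda above packs three
// fields: bits [7:6] select the source element (VBSrcIndex), bits [5:4] the
// destination lane (VBDstIndex), and bits [3:0] the zero mask (ZMask). A
// standalone helper with a hypothetical name that mirrors that packing:
#include <cassert>
static unsigned makeInsertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                unsigned ZMask) {
  assert(SrcIdx < 4 && DstIdx < 4 && ZMask < 16 && "field out of range");
  return (SrcIdx << 6) | (DstIdx << 4) | ZMask;
}
// Example: insert element 2 of the second operand into lane 1 of the first
// and zero lane 3: makeInsertPSImm(2, 1, 0x8) == 0x98.
// ---- end of editor's sketch ------------------------------------------------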
10550
10551static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
10552 SDValue V2, ArrayRef<int> Mask,
10553 const APInt &Zeroable,
10554 SelectionDAG &DAG) {
10555 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10556 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10557
10558 // Attempt to match the insertps pattern.
10559 unsigned InsertPSMask;
10560 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
10561 return SDValue();
10562
10563 // Insert the V2 element into the desired position.
10564 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
10565 DAG.getConstant(InsertPSMask, DL, MVT::i8));
10566}
10567
10568/// \brief Try to lower a shuffle as a permute of the inputs followed by an
10569/// UNPCK instruction.
10570///
10571/// This specifically targets cases where we end up with alternating between
10572/// the two inputs, and so can permute them into something that feeds a single
10573/// UNPCK instruction. Note that this routine only targets integer vectors
10574/// because for floating point vectors we have a generalized SHUFPS lowering
10575/// strategy that handles everything that doesn't *exactly* match an unpack,
10576/// making this clever lowering unnecessary.
10577static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10578 SDValue V1, SDValue V2,
10579 ArrayRef<int> Mask,
10580 SelectionDAG &DAG) {
10581 assert(!VT.isFloatingPoint() &&
10582 "This routine only supports integer vectors.");
10583 assert(VT.is128BitVector() &&
10584 "This routine only works on 128-bit vectors.");
10585 assert(!V2.isUndef() &&
10586 "This routine should only be used when blending two inputs.");
10587 assert(Mask.size() >= 2 && "Single element masks are invalid.");
10588
10589 int Size = Mask.size();
10590
10591 int NumLoInputs =
10592 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10593 int NumHiInputs =
10594 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10595
10596 bool UnpackLo = NumLoInputs >= NumHiInputs;
10597
10598 auto TryUnpack = [&](int ScalarSize, int Scale) {
10599 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10600 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10601
10602 for (int i = 0; i < Size; ++i) {
10603 if (Mask[i] < 0)
10604 continue;
10605
10606 // Each element of the unpack contains Scale elements from this mask.
10607 int UnpackIdx = i / Scale;
10608
10609 // We only handle the case where V1 feeds the first slots of the unpack.
10610 // We rely on canonicalization to ensure this is the case.
10611 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10612 return SDValue();
10613
10614 // Setup the mask for this input. The indexing is tricky as we have to
10615 // handle the unpack stride.
10616 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10617 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10618 Mask[i] % Size;
10619 }
10620
10621 // If we will have to shuffle both inputs to use the unpack, check whether
10622 // we can just unpack first and shuffle the result. If so, skip this unpack.
10623 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10624 !isNoopShuffleMask(V2Mask))
10625 return SDValue();
10626
10627 // Shuffle the inputs into place.
10628 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10629 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10630
10631 // Cast the inputs to the type we will use to unpack them.
10632 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10633 V1 = DAG.getBitcast(UnpackVT, V1);
10634 V2 = DAG.getBitcast(UnpackVT, V2);
10635
10636 // Unpack the inputs and cast the result back to the desired type.
10637 return DAG.getBitcast(
10638 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10639 UnpackVT, V1, V2));
10640 };
10641
10642 // We try each unpack from the largest to the smallest to try and find one
10643 // that fits this mask.
10644 int OrigScalarSize = VT.getScalarSizeInBits();
10645 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10646 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10647 return Unpack;
10648
10649 // If none of the unpack-rooted lowerings worked (or were profitable) try an
10650 // initial unpack.
10651 if (NumLoInputs == 0 || NumHiInputs == 0) {
10652 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10653 "We have to have *some* inputs!");
10654 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
10655
10656 // FIXME: We could consider the total complexity of the permute of each
10657 // possible unpacking. Or at the least we should consider how many
10658 // half-crossings are created.
10659 // FIXME: We could consider commuting the unpacks.
10660
10661 SmallVector<int, 32> PermMask((unsigned)Size, -1);
10662 for (int i = 0; i < Size; ++i) {
10663 if (Mask[i] < 0)
10664 continue;
10665
10666 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
10667
10668 PermMask[i] =
10669 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
10670 }
10671 return DAG.getVectorShuffle(
10672 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
10673 DL, VT, V1, V2),
10674 DAG.getUNDEF(VT), PermMask);
10675 }
10676
10677 return SDValue();
10678}
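// ---- Editor's sketch (not part of the annotated file) ---------------------
// For reference, the shuffle that a 128-bit UNPCKL/UNPCKH itself performs on
// two N-element vectors (with V2 elements numbered N..2N-1); the permutes
// built above aim to make the requested mask line up with one of these.
// The helper name is hypothetical.
#include <vector>
static std::vector<int> unpackMask(unsigned NumElts, bool Lo) {
  std::vector<int> Mask;
  unsigned Base = Lo ? 0 : NumElts / 2;      // low or high half of each input
  for (unsigned i = 0; i != NumElts / 2; ++i) {
    Mask.push_back(int(Base + i));           // element from V1
    Mask.push_back(int(Base + i + NumElts)); // matching element from V2
  }
  return Mask;
}
// unpackMask(4, /*Lo=*/true)  == {0, 4, 1, 5}   (UNPCKL-style interleave)
// unpackMask(4, /*Lo=*/false) == {2, 6, 3, 7}   (UNPCKH-style interleave)
// ---- end of editor's sketch ------------------------------------------------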
10679
10680/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
10681///
10682/// This is the basis function for the 2-lane 64-bit shuffles as we have full
10683/// support for floating point shuffles but not integer shuffles. These
10684/// instructions will incur a domain crossing penalty on some chips though so
10685/// it is better to avoid lowering through this for integer vectors where
10686/// possible.
10687static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10688 const APInt &Zeroable,
10689 SDValue V1, SDValue V2,
10690 const X86Subtarget &Subtarget,
10691 SelectionDAG &DAG) {
10692 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10693 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10694 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10695
10696 if (V2.isUndef()) {
10697 // Check for being able to broadcast a single element.
10698 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10699 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
10700 return Broadcast;
10701
10702 // Straight shuffle of a single input vector. Simulate this by using the
10703 // single input as both of the "inputs" to this instruction.
10704 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
10705
10706 if (Subtarget.hasAVX()) {
10707 // If we have AVX, we can use VPERMILPS which will allow folding a load
10708 // into the shuffle.
10709 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
10710 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10711 }
10712
10713 return DAG.getNode(
10714 X86ISD::SHUFP, DL, MVT::v2f64,
10715 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10716 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10717 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10718 }
10719 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
10720 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
10721
10722 // If we have a single input, insert that into V1 if we can do so cheaply.
10723 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
10724 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10725 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10726 return Insertion;
10727 // Try inverting the insertion since for v2 masks it is easy to do and we
10728 // can't reliably sort the mask one way or the other.
10729 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
10730 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
10731 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10732 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10733 return Insertion;
10734 }
10735
10736 // Try to use one of the special instruction patterns to handle two common
10737 // blend patterns if a zero-blend above didn't work.
10738 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
10739 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
10740 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
10741 // We can either use a special instruction to load over the low double or
10742 // to move just the low double.
10743 return DAG.getNode(
10744 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
10745 DL, MVT::v2f64, V2,
10746 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
10747
10748 if (Subtarget.hasSSE41())
10749 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
10750 Zeroable, Subtarget, DAG))
10751 return Blend;
10752
10753 // Use dedicated unpack instructions for masks that match their pattern.
10754 if (SDValue V =
10755 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
10756 return V;
10757
10758 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
10759 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
10760 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10761}
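// ---- Editor's sketch (not part of the annotated file) ---------------------
// In the final two-input SHUFPD above, the immediate has one bit per result
// lane: bit 0 picks element 0 or 1 of V1 for lane 0 and bit 1 picks element 0
// or 1 of V2 for lane 1 (the mask is already canonicalized so lane 0 comes
// from V1 and lane 1 from V2). A restatement with a hypothetical name:
static unsigned makeSHUFPDImm(int Mask0, int Mask1) {
  // Mask0 is 0 or 1 (an element of V1); Mask1 is 2 or 3 (an element of V2).
  return unsigned(Mask0 == 1) | (unsigned((Mask1 - 2) == 1) << 1);
}
// Example: the shuffle <1, 2> (high element of V1, then low element of V2)
// yields makeSHUFPDImm(1, 2) == 0x1.
// ---- end of editor's sketch ------------------------------------------------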
10762
10763/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
10764///
10765/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
10766/// the integer unit to minimize domain crossing penalties. However, for blends
10767/// it falls back to the floating point shuffle operation with appropriate bit
10768/// casting.
10769static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10770 const APInt &Zeroable,
10771 SDValue V1, SDValue V2,
10772 const X86Subtarget &Subtarget,
10773 SelectionDAG &DAG) {
10774 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10775 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10776 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10777
10778 if (V2.isUndef()) {
10779 // Check for being able to broadcast a single element.
10780 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10781 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10782 return Broadcast;
10783
10784 // Straight shuffle of a single input vector. For everything from SSE2
10785 // onward this has a single fast instruction with no scary immediates.
10786 // We have to map the mask as it is actually a v4i32 shuffle instruction.
10787 V1 = DAG.getBitcast(MVT::v4i32, V1);
10788 int WidenedMask[4] = {
10789 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
10790 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
10791 return DAG.getBitcast(
10792 MVT::v2i64,
10793 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10794 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
10795 }
10796 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
10797 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
10798 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
10799 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
10800
10801 // Try to use shift instructions.
10802 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
10803 Zeroable, Subtarget, DAG))
10804 return Shift;
10805
10806 // When loading a scalar and then shuffling it into a vector we can often do
10807 // the insertion cheaply.
10808 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10809 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10810 return Insertion;
10811 // Try inverting the insertion since for v2 masks it is easy to do and we
10812 // can't reliably sort the mask one way or the other.
10813 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
10814 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10815 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10816 return Insertion;
10817
10818 // We have different paths for blend lowering, but they all must use the
10819 // *exact* same predicate.
10820 bool IsBlendSupported = Subtarget.hasSSE41();
10821 if (IsBlendSupported)
10822 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
10823 Zeroable, Subtarget, DAG))
10824 return Blend;
10825
10826 // Use dedicated unpack instructions for masks that match their pattern.
10827 if (SDValue V =
10828 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
10829 return V;
10830
10831 // Try to use byte rotation instructions.
10832 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10833 if (Subtarget.hasSSSE3()) {
10834 if (Subtarget.hasVLX())
10835 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
10836 Mask, Subtarget, DAG))
10837 return Rotate;
10838
10839 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10840 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10841 return Rotate;
10842 }
10843
10844 // If we have direct support for blends, we should lower by decomposing into
10845 // a permute. That will be faster than the domain cross.
10846 if (IsBlendSupported)
10847 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
10848 Mask, DAG);
10849
10850 // We implement this with SHUFPD which is pretty lame because it will likely
10851 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
10852 // However, all the alternatives are still more cycles and newer chips don't
10853 // have this problem. It would be really nice if x86 had better shuffles here.
10854 V1 = DAG.getBitcast(MVT::v2f64, V1);
10855 V2 = DAG.getBitcast(MVT::v2f64, V2);
10856 return DAG.getBitcast(MVT::v2i64,
10857 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
10858}
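// ---- Editor's sketch (not part of the annotated file) ---------------------
// The single-input v2i64 path above rewrites the 2-element mask as a v4i32
// mask so PSHUFD can do the work: each i64 lane M becomes the i32 lane pair
// (2*M, 2*M+1), with undef (-1) clamped to 0. A restatement with a
// hypothetical name:
#include <array>
static std::array<int, 4> widenV2ToV4Mask(int M0, int M1) {
  auto Clamp = [](int M) { return M < 0 ? 0 : M; }; // undef -> element 0
  return {Clamp(M0) * 2, Clamp(M0) * 2 + 1, Clamp(M1) * 2, Clamp(M1) * 2 + 1};
}
// Example: the v2i64 swap <1, 0> becomes the v4i32 mask <2, 3, 0, 1>, i.e.
// a PSHUFD with immediate 0x4E.
// ---- end of editor's sketch ------------------------------------------------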
10859
10860/// \brief Test whether this can be lowered with a single SHUFPS instruction.
10861///
10862/// This is used to disable more specialized lowerings when the shufps lowering
10863/// will happen to be efficient.
10864static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
10865 // This routine only handles 128-bit shufps.
10866 assert(Mask.size() == 4 && "Unsupported mask size!");
10867 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
10868 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
10869 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
10870 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
10871
10872 // To lower with a single SHUFPS we need to have the low half and high half
10873 // each requiring a single input.
10874 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
10875 return false;
10876 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
10877 return false;
10878
10879 return true;
10880}
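// ---- Editor's sketch (not part of the annotated file) ---------------------
// SHUFPS builds result lanes 0-1 from its first operand and lanes 2-3 from
// its second, so a 4-element mask fits a single SHUFPS exactly when each of
// those pairs draws from one input (undefs are wildcards). An equivalent
// standalone predicate with a hypothetical name:
#include <array>
static bool fitsSingleSHUFPS(const std::array<int, 4> &Mask) {
  auto SameSide = [](int A, int B) {
    return A < 0 || B < 0 || (A < 4) == (B < 4); // -1 (undef) matches anything
  };
  return SameSide(Mask[0], Mask[1]) && SameSide(Mask[2], Mask[3]);
}
// fitsSingleSHUFPS({0, 1, 4, 6}) == true; fitsSingleSHUFPS({0, 4, 1, 5}) ==
// false -- the latter is the interleave shape that the permute-and-unpack
// lowering above targets instead.
// ---- end of editor's sketch ------------------------------------------------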
10881
10882/// \brief Lower a vector shuffle using the SHUFPS instruction.
10883///
10884/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
10885/// It makes no assumptions about whether this is the *best* lowering; it simply
10886/// uses it.
10887static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
10888 ArrayRef<int> Mask, SDValue V1,
10889 SDValue V2, SelectionDAG &DAG) {
10890 SDValue LowV = V1, HighV = V2;
10891 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
10892
10893 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10894
10895 if (NumV2Elements == 1) {
10896 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
10897
10898 // Compute the index adjacent to V2Index and in the same half by toggling
10899 // the low bit.
10900 int V2AdjIndex = V2Index ^ 1;
10901
10902 if (Mask[V2AdjIndex] < 0) {
10903 // Handles all the cases where we have a single V2 element and an undef.
10904 // This will only ever happen in the high lanes because we commute the
10905 // vector otherwise.
10906 if (V2Index < 2)
10907 std::swap(LowV, HighV);
10908 NewMask[V2Index] -= 4;
10909 } else {
10910 // Handle the case where the V2 element ends up adjacent to a V1 element.
10911 // To make this work, blend them together as the first step.
10912 int V1Index = V2AdjIndex;
10913 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
10914 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
10915 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10916
10917 // Now proceed to reconstruct the final blend as we have the necessary
10918 // high or low half formed.
10919 if (V2Index < 2) {
10920 LowV = V2;
10921 HighV = V1;
10922 } else {
10923 HighV = V2;
10924 }
10925 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
10926 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
10927 }
10928 } else if (NumV2Elements == 2) {
10929 if (Mask[0] < 4 && Mask[1] < 4) {
10930 // Handle the easy case where we have V1 in the low lanes and V2 in the
10931 // high lanes.
10932 NewMask[2] -= 4;
10933 NewMask[3] -= 4;
10934 } else if (Mask[2] < 4 && Mask[3] < 4) {
10935 // We also handle the reversed case because this utility may get called
10936 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
10937 // arrange things in the right direction.
10938 NewMask[0] -= 4;
10939 NewMask[1] -= 4;
10940 HighV = V1;
10941 LowV = V2;
10942 } else {
10943 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
10944 // trying to place elements directly, just blend them and set up the final
10945 // shuffle to place them.
10946
10947 // The first two blend mask elements are for V1, the second two are for
10948 // V2.
10949 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
10950 Mask[2] < 4 ? Mask[2] : Mask[3],
10951 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
10952 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
10953 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
10954 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10955
10956 // Now we do a normal shuffle of V1 by giving V1 as both operands to
10957 // a blend.
10958 LowV = HighV = V1;
10959 NewMask[0] = Mask[0] < 4 ? 0 : 2;
10960 NewMask[1] = Mask[0] < 4 ? 2 : 0;
10961 NewMask[2] = Mask[2] < 4 ? 1 : 3;
10962 NewMask[3] = Mask[2] < 4 ? 3 : 1;
10963 }
10964 }
10965 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
10966 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
10967}
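// ---- Editor's sketch (not part of the annotated file) ---------------------
// The getV4X86ShuffleImm8ForMask calls above produce the classic 2-bits-per-
// lane immediate shared by SHUFPS/PSHUFD/PSHUFLW/PSHUFHW: result lane i is
// described by bits [2i+1:2i]. A simplified restatement (hypothetical name;
// undef entries are just treated as 0 here, and each entry is reduced to its
// low two bits):
static unsigned makeV4ShuffleImm(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= unsigned(Mask[i] < 0 ? 0 : (Mask[i] & 3)) << (2 * i);
  return Imm;
}
// Example: a final NewMask of {2, 0, 1, 3}, one of the shapes built above,
// encodes as 0xD2.
// ---- end of editor's sketch ------------------------------------------------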
10968
10969/// \brief Lower 4-lane 32-bit floating point shuffles.
10970///
10971/// Uses instructions exclusively from the floating point unit to minimize
10972/// domain crossing penalties, as these are sufficient to implement all v4f32
10973/// shuffles.
10974static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10975 const APInt &Zeroable,
10976 SDValue V1, SDValue V2,
10977 const X86Subtarget &Subtarget,
10978 SelectionDAG &DAG) {
10979 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10980 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10981 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10982
10983 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10984
10985 if (NumV2Elements == 0) {
10986 // Check for being able to broadcast a single element.
10987 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10988 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
10989 return Broadcast;
10990
10991 // Use even/odd duplicate instructions for masks that match their pattern.
10992 if (Subtarget.hasSSE3()) {
10993 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
10994 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
10995 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
10996 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
10997 }
10998
10999 if (Subtarget.hasAVX()) {
11000 // If we have AVX, we can use VPERMILPS which will allow folding a load
11001 // into the shuffle.
11002 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
11003 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11004 }
11005
11006 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
11007 // in SSE1 because otherwise they are widened to v2f64 and never get here.
11008 if (!Subtarget.hasSSE2()) {
11009 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
11010 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
11011 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
11012 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
11013 }
11014
11015 // Otherwise, use a straight shuffle of a single input vector. We pass the
11016 // input vector to both operands to simulate this with a SHUFPS.
11017 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
11018 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11019 }
11020
11021 // There are special ways we can lower some single-element blends. However, we
11022 // have custom ways we can lower more complex single-element blends below that
11023 // we defer to if both this and BLENDPS fail to match, so restrict this to
11024 // when the V2 input is targeting element 0 of the mask -- that is the fast
11025 // case here.
11026 if (NumV2Elements == 1 && Mask[0] >= 4)
11027 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11028 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11029 return V;
11030
11031 if (Subtarget.hasSSE41()) {
11032 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
11033 Zeroable, Subtarget, DAG))
11034 return Blend;
11035
11036 // Use INSERTPS if we can complete the shuffle efficiently.
11037 if (SDValue V =
11038 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
11039 return V;
11040
11041 if (!isSingleSHUFPSMask(Mask))
11042 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
11043 DL, MVT::v4f32, V1, V2, Mask, DAG))
11044 return BlendPerm;
11045 }
11046
11047 // Use low/high mov instructions. These are only valid in SSE1 because
11048 // otherwise they are widened to v2f64 and never get here.
11049 if (!Subtarget.hasSSE2()) {
11050 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
11051 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
11052 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
11053 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
11054 }
11055
11056 // Use dedicated unpack instructions for masks that match their pattern.
11057 if (SDValue V =
11058 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
11059 return V;
11060
11061 // Otherwise fall back to a SHUFPS lowering strategy.
11062 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
11063}
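// ---- Editor's sketch (not part of the annotated file) ---------------------
// Several of the single-input v4f32 special cases above reduce to an
// undef-tolerant comparison of the incoming mask against a fixed pattern,
// e.g. {0, 0, 2, 2} for MOVSLDUP and {1, 1, 3, 3} for MOVSHDUP. A minimal
// standalone version of that comparison (hypothetical name; the real
// isShuffleEquivalent also reasons about identical operands):
#include <vector>
static bool masksMatch(const std::vector<int> &Mask,
                       const std::vector<int> &Expected) {
  if (Mask.size() != Expected.size())
    return false;
  for (unsigned i = 0, e = unsigned(Mask.size()); i != e; ++i)
    if (Mask[i] >= 0 && Mask[i] != Expected[i]) // -1 (undef) matches anything
      return false;
  return true;
}
// masksMatch({0, -1, 2, 2}, {0, 0, 2, 2}) == true, so that shuffle can be
// emitted as a single MOVSLDUP.
// ---- end of editor's sketch ------------------------------------------------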
11064
11065/// \brief Lower 4-lane i32 vector shuffles.
11066///
11067/// We try to handle these with integer-domain shuffles where we can, but for
11068/// blends we use the floating point domain blend instructions.
11069static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11070 const APInt &Zeroable,
11071 SDValue V1, SDValue V2,
11072 const X86Subtarget &Subtarget,
11073 SelectionDAG &DAG) {
11074 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11075 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11076 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11077
11078 // Whenever we can lower this as a zext, that instruction is strictly faster
11079 // than any alternative. It also allows us to fold memory operands into the
11080 // shuffle in many cases.
11081 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11082 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11083 return ZExt;
11084
11085 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11086
11087 if (NumV2Elements == 0) {
11088 // Check for being able to broadcast a single element.
11089 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11090 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
11091 return Broadcast;
11092
11093 // Straight shuffle of a single input vector. For everything from SSE2
11094 // onward this has a single fast instruction with no scary immediates.
11095 // We coerce the shuffle pattern to be compatible with UNPCK instructions
11096 // but we aren't actually going to use the UNPCK instruction because doing
11097 // so prevents folding a load into this instruction or making a copy.
11098 const int UnpackLoMask[] = {0, 0, 1, 1};
11099 const int UnpackHiMask[] = {2, 2, 3, 3};
11100 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
11101 Mask = UnpackLoMask;
11102 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
11103 Mask = UnpackHiMask;
11104
11105 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
11106 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11107 }
11108
11109 // Try to use shift instructions.
11110 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
11111 Zeroable, Subtarget, DAG))
11112 return Shift;
11113
11114 // There are special ways we can lower some single-element blends.
11115 if (NumV2Elements == 1)
11116 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11117 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11118 return V;
11119
11120 // We have different paths for blend lowering, but they all must use the
11121 // *exact* same predicate.
11122 bool IsBlendSupported = Subtarget.hasSSE41();
11123 if (IsBlendSupported)
11124 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
11125 Zeroable, Subtarget, DAG))
11126 return Blend;
11127
11128 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
11129 Zeroable, DAG))
11130 return Masked;
11131
11132 // Use dedicated unpack instructions for masks that match their pattern.
11133 if (SDValue V =
11134 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
11135 return V;
11136
11137 // Try to use byte rotation instructions.
11138 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
11139 if (Subtarget.hasSSSE3()) {
11140 if (Subtarget.hasVLX())
11141 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
11142 Mask, Subtarget, DAG))
11143 return Rotate;
11144
11145 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11146 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
11147 return Rotate;
11148 }
11149
11150 // Assume that a single SHUFPS is faster than an alternative sequence of
11151 // multiple instructions (even if the CPU has a domain penalty).
11152 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
11153 if (!isSingleSHUFPSMask(Mask)) {
11154 // If we have direct support for blends, we should lower by decomposing into
11155 // a permute. That will be faster than the domain cross.
11156 if (IsBlendSupported)
11157 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
11158 Mask, DAG);
11159
11160 // Try to lower by permuting the inputs into an unpack instruction.
11161 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11162 DL, MVT::v4i32, V1, V2, Mask, DAG))
11163 return Unpack;
11164 }
11165
11166 // We implement this with SHUFPS because it can blend from two vectors.
11167 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
11168 // up the inputs, bypassing domain shift penalties that we would incur if we
11169 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
11170 // relevant.
11171 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
11172 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
11173 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
11174 return DAG.getBitcast(MVT::v4i32, ShufPS);
11175}
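// ---- Editor's sketch (not part of the annotated file) ---------------------
// The final v4i32 fallback above expresses an integer blend through the
// floating point domain: bitcast both inputs to v4f32, emit one SHUFPS, and
// bitcast the result back. In intrinsic form the same idea looks roughly like
// this (illustrative only; the helper name and the chosen mask are made up):
#include <emmintrin.h>
static __m128i blendViaShufps(__m128i V1, __m128i V2) {
  // Take elements 1,2 of V1 and elements 0,3 of V2: shuffle mask <1, 2, 4, 7>.
  __m128 F1 = _mm_castsi128_ps(V1);
  __m128 F2 = _mm_castsi128_ps(V2);
  __m128 R = _mm_shuffle_ps(F1, F2, _MM_SHUFFLE(3, 0, 2, 1));
  return _mm_castps_si128(R);
}
// ---- end of editor's sketch ------------------------------------------------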
11176
11177/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
11178/// shuffle lowering, and the most complex part.
11179///
11180/// The lowering strategy is to try to form pairs of input lanes which are
11181/// targeted at the same half of the final vector, and then use a dword shuffle
11182/// to place them onto the right half, and finally unpack the paired lanes into
11183/// their final position.
11184///
11185/// The exact breakdown of how to form these dword pairs and align them on the
11186/// correct sides is really tricky. See the comments within the function for
11187/// more of the details.
11188///
11189/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
11190/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
11191/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
11192/// vector, form the analogous 128-bit 8-element Mask.
11193static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
11194 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
11195 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11196 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
11197 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
11198
11199 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
11200 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
11201 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
11202
11203 SmallVector<int, 4> LoInputs;
11204 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
11205 std::sort(LoInputs.begin(), LoInputs.end());
11206 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
11207 SmallVector<int, 4> HiInputs;
11208 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
11209 std::sort(HiInputs.begin(), HiInputs.end());
11210 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
11211 int NumLToL =
11212 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
11213 int NumHToL = LoInputs.size() - NumLToL;
11214 int NumLToH =
11215 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
11216 int NumHToH = HiInputs.size() - NumLToH;
11217 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
11218 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
11219 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
11220 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
11221
11222 // If we are splatting two values from one half - one to each half, then
11223 // we can shuffle that half so each is splatted to a dword, then splat those
11224 // to their respective halves.
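// For example, <1, 1, 1, 1, 2, 2, 2, 2> splats word 1 across the low half and
// word 2 across the high half: the word shuffle below (PSHUFLW here, since
// both inputs live in the low half) forms a dword of each value, and the
// PSHUFD then broadcasts those two dwords to their respective halves.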
11225 auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
11226 int DOffset) {
11227 int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
11228 int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
11229 V = DAG.getNode(ShufWOp, DL, VT, V,
11230 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11231 V = DAG.getBitcast(PSHUFDVT, V);
11232 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
11233 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11234 return DAG.getBitcast(VT, V);
11235 };
11236
11237 if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
11238 return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
11239 if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
11240 return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
11241
11242 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
11243 // such inputs we can swap two of the dwords across the half mark and end up
11244 // with <=2 inputs to each half in each half. Once there, we can fall through
11245 // to the generic code below. For example:
11246 //
11247 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11248 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
11249 //
11250 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
11251 // and an existing 2-into-2 on the other half. In this case we may have to
11252 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
11253 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
11254 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
11255 // because any other situation (including a 3-into-1 or 1-into-3 in the half
11256 // other than the one we target for fixing) will be fixed when we re-enter this
11257 // path. We will also combine any resulting sequence of PSHUFD instructions
11258 // into a single instruction. Here is an example of the tricky case:
11259 //
11260 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11261 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
11262 //
11263 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
11264 //
11265 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
11266 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
11267 //
11268 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
11269 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
11270 //
11271 // The result is fine to be handled by the generic logic.
11272 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
11273 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
11274 int AOffset, int BOffset) {
11275 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
11276 "Must call this with A having 3 or 1 inputs from the A half.");
11277 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
11278 "Must call this with B having 1 or 3 inputs from the B half.");
11279 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
11280 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
11281
11282 bool ThreeAInputs = AToAInputs.size() == 3;
11283
11284 // Compute the index of dword with only one word among the three inputs in
11285 // a half by taking the sum of the half with three inputs and subtracting
11286 // the sum of the actual three inputs. The difference is the remaining
11287 // slot.
11288 int ADWord, BDWord;
11289 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
11290 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
11291 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
11292 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
11293 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
11294 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
11295 int TripleNonInputIdx =
11296 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
11297 TripleDWord = TripleNonInputIdx / 2;
11298
11299 // We use xor with one to compute the adjacent DWord to whichever one the
11300 // OneInput is in.
11301 OneInputDWord = (OneInput / 2) ^ 1;
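// (E.g. if the three low-half inputs are words {0, 1, 3}, the sum 6 minus
// (0 + 1 + 3) leaves word 2 as the free slot, so the triple's dword is 1; and
// if OneInput is word 5, it lives in dword 2, so the adjacent dword 3 becomes
// OneInputDWord.)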
11302
11303 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
11304 // and BToA inputs. If there is also such a problem with the BToB and AToB
11305 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
11306 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
11307 // is essential that we don't *create* a 3<-1 as then we might oscillate.
11308 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
11309 // Compute how many inputs will be flipped by swapping these DWords. We
11310 // need
11311 // to balance this to ensure we don't form a 3-1 shuffle in the other
11312 // half.
11313 int NumFlippedAToBInputs =
11314 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
11315 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
11316 int NumFlippedBToBInputs =
11317 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
11318 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
11319 if ((NumFlippedAToBInputs == 1 &&
11320 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
11321 (NumFlippedBToBInputs == 1 &&
11322 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
11323 // We choose whether to fix the A half or B half based on whether that
11324 // half has zero flipped inputs. At zero, we may not be able to fix it
11325 // with that half. We also bias towards fixing the B half because that
11326 // will more commonly be the high half, and we have to bias one way.
11327 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
11328 ArrayRef<int> Inputs) {
11329 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
11330 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
11331 // Determine whether the free index is in the flipped dword or the
11332 // unflipped dword based on where the pinned index is. We use this bit
11333 // in an xor to conditionally select the adjacent dword.
11334 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
11335 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11336 if (IsFixIdxInput == IsFixFreeIdxInput)
11337 FixFreeIdx += 1;
11338 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11339 assert(IsFixIdxInput != IsFixFreeIdxInput &&
11340 "We need to be changing the number of flipped inputs!");
11341 int PSHUFHalfMask[] = {0, 1, 2, 3};
11342 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
11343 V = DAG.getNode(
11344 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
11345 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
11346 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11347
11348 for (int &M : Mask)
11349 if (M >= 0 && M == FixIdx)
11350 M = FixFreeIdx;
11351 else if (M >= 0 && M == FixFreeIdx)
11352 M = FixIdx;
11353 };
11354 if (NumFlippedBToBInputs != 0) {
11355 int BPinnedIdx =
11356 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
11357 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
11358 } else {
11359 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
11360 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
11361 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
11362 }
11363 }
11364 }
11365
11366 int PSHUFDMask[] = {0, 1, 2, 3};
11367 PSHUFDMask[ADWord] = BDWord;
11368 PSHUFDMask[BDWord] = ADWord;
11369 V = DAG.getBitcast(
11370 VT,
11371 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11372 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11373
11374 // Adjust the mask to match the new locations of A and B.
11375 for (int &M : Mask)
11376 if (M >= 0 && M/2 == ADWord)
11377 M = 2 * BDWord + M % 2;
11378 else if (M >= 0 && M/2 == BDWord)
11379 M = 2 * ADWord + M % 2;
11380
11381 // Recurse back into this routine to re-compute state now that this isn't
11382 // a 3 and 1 problem.
11383 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
11384 DAG);
11385 };
11386 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
11387 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
11388 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
11389 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
11390
11391 // At this point there are at most two inputs to the low and high halves from
11392 // each half. That means the inputs can always be grouped into dwords and
11393 // those dwords can then be moved to the correct half with a dword shuffle.
11394 // We use at most one low and one high word shuffle to collect these paired
11395 // inputs into dwords, and finally a dword shuffle to place them.
11396 int PSHUFLMask[4] = {-1, -1, -1, -1};
11397 int PSHUFHMask[4] = {-1, -1, -1, -1};
11398 int PSHUFDMask[4] = {-1, -1, -1, -1};
11399
11400 // First fix the masks for all the inputs that are staying in their
11401 // original halves. This will then dictate the targets of the cross-half
11402 // shuffles.
11403 auto fixInPlaceInputs =
11404 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
11405 MutableArrayRef<int> SourceHalfMask,
11406 MutableArrayRef<int> HalfMask, int HalfOffset) {
11407 if (InPlaceInputs.empty())
11408 return;
11409 if (InPlaceInputs.size() == 1) {
11410 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11411 InPlaceInputs[0] - HalfOffset;
11412 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
11413 return;
11414 }
11415 if (IncomingInputs.empty()) {
11416 // Just fix all of the in place inputs.
11417 for (int Input : InPlaceInputs) {
11418 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
11419 PSHUFDMask[Input / 2] = Input / 2;
11420 }
11421 return;
11422 }
11423
11424 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
11425 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11426 InPlaceInputs[0] - HalfOffset;
11427 // Put the second input next to the first so that they are packed into
11428 // a dword. We find the adjacent index by toggling the low bit.
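// (E.g. for in-place low-half inputs {0, 3}: word 3 is pulled into slot 1,
// the neighbor of slot 0, so the two inputs end up sharing dword 0.)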
11429 int AdjIndex = InPlaceInputs[0] ^ 1;
11430 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
11431 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
11432 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
11433 };
11434 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
11435 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
11436
11437 // Now gather the cross-half inputs and place them into a free dword of
11438 // their target half.
11439 // FIXME: This operation could almost certainly be simplified dramatically to
11440 // look more like the 3-1 fixing operation.
11441 auto moveInputsToRightHalf = [&PSHUFDMask](
11442 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
11443 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
11444 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
11445 int DestOffset) {
11446 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
11447 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
11448 };
11449 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
11450 int Word) {
11451 int LowWord = Word & ~1;
11452 int HighWord = Word | 1;
11453 return isWordClobbered(SourceHalfMask, LowWord) ||
11454 isWordClobbered(SourceHalfMask, HighWord);
11455 };
11456
11457 if (IncomingInputs.empty())
11458 return;
11459
11460 if (ExistingInputs.empty()) {
11461 // Map any dwords with inputs from them into the right half.
11462 for (int Input : IncomingInputs) {
11463 // If the source half mask maps over the inputs, turn those into
11464 // swaps and use the swapped lane.
11465 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
11466 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
11467 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
11468 Input - SourceOffset;
11469 // We have to swap the uses in our half mask in one sweep.
11470 for (int &M : HalfMask)
11471 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
11472 M = Input;
11473 else if (M == Input)
11474 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11475 } else {
11476 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
11477 Input - SourceOffset &&
11478 "Previous placement doesn't match!");
11479 }
11480 // Note that this correctly re-maps both when we do a swap and when
11481 // we observe the other side of the swap above. We rely on that to
11482 // avoid swapping the members of the input list directly.
11483 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11484 }
11485
11486 // Map the input's dword into the correct half.
11487 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
11488 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
11489 else
11490 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
11491 Input / 2 &&
11492 "Previous placement doesn't match!");
11493 }
11494
11495 // And just directly shift any other-half mask elements to be same-half
11496 // as we will have mirrored the dword containing the element into the
11497 // same position within that half.
11498 for (int &M : HalfMask)
11499 if (M >= SourceOffset && M < SourceOffset + 4) {
11500 M = M - SourceOffset + DestOffset;
11501 assert(M >= 0 && "This should never wrap below zero!");
11502 }
11503 return;
11504 }
11505
11506 // Ensure we have the input in a viable dword of its current half. This
11507 // is particularly tricky because the original position may be clobbered
11508 // by inputs being moved and *staying* in that half.
11509 if (IncomingInputs.size() == 1) {
11510 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11511 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
11512 SourceOffset;
11513 SourceHalfMask[InputFixed - SourceOffset] =
11514 IncomingInputs[0] - SourceOffset;
11515 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
11516 InputFixed);
11517 IncomingInputs[0] = InputFixed;
11518 }
11519 } else if (IncomingInputs.size() == 2) {
11520 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
11521 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11522 // We have two non-adjacent or clobbered inputs we need to extract from
11523 // the source half. To do this, we need to map them into some adjacent
11524 // dword slot in the source mask.
11525 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
11526 IncomingInputs[1] - SourceOffset};
11527
11528 // If there is a free slot in the source half mask adjacent to one of
11529 // the inputs, place the other input in it. We use (Index XOR 1) to
11530 // compute an adjacent index.
11531 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
11532 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
11533 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
11534 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11535 InputsFixed[1] = InputsFixed[0] ^ 1;
11536 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
11537 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
11538 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
11539 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
11540 InputsFixed[0] = InputsFixed[1] ^ 1;
11541 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
11542 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
11543 // The two inputs are in the same DWord but it is clobbered and the
11544 // adjacent DWord isn't used at all. Move both inputs to the free
11545 // slot.
11546 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
11547 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
11548 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
11549 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
11550 } else {
11551 // The only way we hit this point is if there is no clobbering
11552 // (because there are no off-half inputs to this half) and there is no
11553 // free slot adjacent to one of the inputs. In this case, we have to
11554 // swap an input with a non-input.
11555 for (int i = 0; i < 4; ++i)
11556 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
11557 "We can't handle any clobbers here!");
11558 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
11559 "Cannot have adjacent inputs here!");
11560
11561 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11562 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
11563
11564 // We also have to update the final source mask in this case because
11565 // it may need to undo the above swap.
11566 for (int &M : FinalSourceHalfMask)
11567 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
11568 M = InputsFixed[1] + SourceOffset;
11569 else if (M == InputsFixed[1] + SourceOffset)
11570 M = (InputsFixed[0] ^ 1) + SourceOffset;
11571
11572 InputsFixed[1] = InputsFixed[0] ^ 1;
11573 }
11574
11575 // Point everything at the fixed inputs.
11576 for (int &M : HalfMask)
11577 if (M == IncomingInputs[0])
11578 M = InputsFixed[0] + SourceOffset;
11579 else if (M == IncomingInputs[1])
11580 M = InputsFixed[1] + SourceOffset;
11581
11582 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
11583 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
11584 }
11585 } else {
11586 llvm_unreachable("Unhandled input size!")::llvm::llvm_unreachable_internal("Unhandled input size!", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 11586)
;
11587 }
11588
11589 // Now hoist the DWord down to the right half.
11590 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
11591 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
11592 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
11593 for (int &M : HalfMask)
11594 for (int Input : IncomingInputs)
11595 if (M == Input)
11596 M = FreeDWord * 2 + Input % 2;
11597 };
11598 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
11599 /*SourceOffset*/ 4, /*DestOffset*/ 0);
11600 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
11601 /*SourceOffset*/ 0, /*DestOffset*/ 4);
11602
11603 // Now enact all the shuffles we've computed to move the inputs into their
11604 // target half.
11605 if (!isNoopShuffleMask(PSHUFLMask))
11606 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11607 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
11608 if (!isNoopShuffleMask(PSHUFHMask))
11609 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11610 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
11611 if (!isNoopShuffleMask(PSHUFDMask))
11612 V = DAG.getBitcast(
11613 VT,
11614 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11615 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11616
11617 // At this point, each half should contain all its inputs, and we can then
11618 // just shuffle them into their final position.
11619 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
11620 "Failed to lift all the high half inputs to the low mask!");
11621 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
11622 "Failed to lift all the low half inputs to the high mask!");
11623
11624 // Do a half shuffle for the low mask.
11625 if (!isNoopShuffleMask(LoMask))
11626 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11627 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11628
11629 // Do a half shuffle with the high mask after shifting its values down.
11630 for (int &M : HiMask)
11631 if (M >= 0)
11632 M -= 4;
11633 if (!isNoopShuffleMask(HiMask))
11634 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11635 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
11636
11637 return V;
11638}
11639
11640/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
11641/// blend if only one input is used.
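/// PSHUFB picks one byte of its input for every result byte, and any control
/// byte with its high bit set (0x80) produces zero instead; so we build one
/// control vector per input, shuffle each input into its final byte positions
/// while zeroing the positions owned by the other input, and OR the two
/// results together to complete the blend.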
11642static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
11643 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11644 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
11645 bool &V2InUse) {
11646 SDValue V1Mask[16];
11647 SDValue V2Mask[16];
11648 V1InUse = false;
11649 V2InUse = false;
11650
11651 int Size = Mask.size();
11652 int Scale = 16 / Size;
11653 for (int i = 0; i < 16; ++i) {
11654 if (Mask[i / Scale] < 0) {
11655 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
11656 } else {
11657 const int ZeroMask = 0x80;
11658 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
11659 : ZeroMask;
11660 int V2Idx = Mask[i / Scale] < Size
11661 ? ZeroMask
11662 : (Mask[i / Scale] - Size) * Scale + i % Scale;
11663 if (Zeroable[i / Scale])
11664 V1Idx = V2Idx = ZeroMask;
11665 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
11666 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
11667 V1InUse |= (ZeroMask != V1Idx);
11668 V2InUse |= (ZeroMask != V2Idx);
11669 }
11670 }
11671
11672 if (V1InUse)
11673 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11674 DAG.getBitcast(MVT::v16i8, V1),
11675 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
11676 if (V2InUse)
11677 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11678 DAG.getBitcast(MVT::v16i8, V2),
11679 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
11680
11681 // If we need shuffled inputs from both, blend the two.
11682 SDValue V;
11683 if (V1InUse && V2InUse)
11684 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
11685 else
11686 V = V1InUse ? V1 : V2;
11687
11688 // Cast the result back to the correct type.
11689 return DAG.getBitcast(VT, V);
11690}
11691
11692/// \brief Generic lowering of 8-lane i16 shuffles.
11693///
11694/// This handles both single-input shuffles and combined shuffle/blends with
11695/// two inputs. The single input shuffles are immediately delegated to
11696/// a dedicated lowering routine.
11697///
11698/// The blends are lowered in one of three fundamental ways. If there are few
11699/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
11700/// of the input is significantly cheaper when lowered as an interleaving of
11701/// the two inputs, try to interleave them. Otherwise, blend the low and high
11702/// halves of the inputs separately (making them have relatively few inputs)
11703/// and then concatenate them.
11704static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11705 const APInt &Zeroable,
11706 SDValue V1, SDValue V2,
11707 const X86Subtarget &Subtarget,
11708 SelectionDAG &DAG) {
11709 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11710 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11711 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11712
11713 // Whenever we can lower this as a zext, that instruction is strictly faster
11714 // than any alternative.
11715 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11716 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11717 return ZExt;
11718
11719 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
11720
11721 if (NumV2Inputs == 0) {
11722 // Check for being able to broadcast a single element.
11723 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11724 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11725 return Broadcast;
11726
11727 // Try to use shift instructions.
11728 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
11729 Zeroable, Subtarget, DAG))
11730 return Shift;
11731
11732 // Use dedicated unpack instructions for masks that match their pattern.
11733 if (SDValue V =
11734 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11735 return V;
11736
11737 // Use dedicated pack instructions for masks that match their pattern.
11738 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
11739 DAG, Subtarget))
11740 return V;
11741
11742 // Try to use byte rotation instructions.
11743 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
11744 Mask, Subtarget, DAG))
11745 return Rotate;
11746
11747 // Make a copy of the mask so it can be modified.
11748 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
11749 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
11750 MutableMask, Subtarget,
11751 DAG);
11752 }
11753
11754 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
11755 "All single-input shuffles should be canonicalized to be V1-input "
11756 "shuffles.");
11757
11758 // Try to use shift instructions.
11759 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
11760 Zeroable, Subtarget, DAG))
11761 return Shift;
11762
11763 // See if we can use SSE4A Extraction / Insertion.
11764 if (Subtarget.hasSSE4A())
11765 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
11766 Zeroable, DAG))
11767 return V;
11768
11769 // There are special ways we can lower some single-element blends.
11770 if (NumV2Inputs == 1)
11771 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11772 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11773 return V;
11774
11775 // We have different paths for blend lowering, but they all must use the
11776 // *exact* same predicate.
11777 bool IsBlendSupported = Subtarget.hasSSE41();
11778 if (IsBlendSupported)
11779 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
11780 Zeroable, Subtarget, DAG))
11781 return Blend;
11782
11783 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
11784 Zeroable, DAG))
11785 return Masked;
11786
11787 // Use dedicated unpack instructions for masks that match their pattern.
11788 if (SDValue V =
11789 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11790 return V;
11791
11792 // Use dedicated pack instructions for masks that match their pattern.
11793 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
11794 Subtarget))
11795 return V;
11796
11797 // Try to use byte rotation instructions.
11798 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11799 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11800 return Rotate;
11801
11802 if (SDValue BitBlend =
11803 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
11804 return BitBlend;
11805
11806 // Try to lower by permuting the inputs into an unpack instruction.
11807 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
11808 V2, Mask, DAG))
11809 return Unpack;
11810
11811 // If we can't directly blend but can use PSHUFB, that will be better as it
11812 // can both shuffle and set up the inefficient blend.
11813 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
11814 bool V1InUse, V2InUse;
11815 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
11816 Zeroable, DAG, V1InUse, V2InUse);
11817 }
11818
11819 // We can always bit-blend if we have to so the fallback strategy is to
11820 // decompose into single-input permutes and blends.
11821 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
11822 Mask, DAG);
11823}
11824
11825/// \brief Check whether a compaction lowering can be done by dropping even
11826/// elements and compute how many times even elements must be dropped.
11827///
11828 /// This handles shuffles which take every (2^N)th element for N = 1, 2, or 3.
11829 /// Example shuffle masks:
11830///
11831/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11832/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11833/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11834/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11835/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11836/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
11837///
11838/// Any of these lanes can of course be undef.
11839///
11840/// This routine only supports N <= 3.
11841/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11842/// for larger N.
11843///
11844/// \returns N above, or the number of times even elements must be dropped if
11845/// there is such a number. Otherwise returns zero.
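///
/// For example, a return value of 2 means the caller can clear the dropped
/// bytes with a single AND and then apply two successive PACKUS steps to
/// compact every 4th element into place.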
11846static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11847 bool IsSingleInput) {
11848 // The modulus for the shuffle vector entries is based on whether this is
11849 // a single input or not.
11850 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11851 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11852 "We should only be called with masks with a power-of-2 size!");
11853
11854 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11855
11856 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11857 // and 2^3 simultaneously. This is because we may have ambiguity with
11858 // partially undef inputs.
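// (E.g. a mask beginning <0, undef, undef, ...> is consistent with all three
// strides until a later defined element rules some of them out.)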
11859 bool ViableForN[3] = {true, true, true};
11860
11861 for (int i = 0, e = Mask.size(); i < e; ++i) {
11862 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11863 // want.
11864 if (Mask[i] < 0)
11865 continue;
11866
11867 bool IsAnyViable = false;
11868 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11869 if (ViableForN[j]) {
11870 uint64_t N = j + 1;
11871
11872 // The shuffle mask must be equal to (i * 2^N) % M.
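// (For N = 1 with a two-input 16-element mask, M is 32, so element i must be
// 2 * i, i.e. 0, 2, 4, ..., 30, matching the examples above.)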
11873 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
11874 IsAnyViable = true;
11875 else
11876 ViableForN[j] = false;
11877 }
11878 // Early exit if we exhaust the possible powers of two.
11879 if (!IsAnyViable)
11880 break;
11881 }
11882
11883 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11884 if (ViableForN[j])
11885 return j + 1;
11886
11887 // Return 0 as there is no viable power of two.
11888 return 0;
11889}
11890
11891/// \brief Generic lowering of v16i8 shuffles.
11892///
11893/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
11894/// detect any complexity reducing interleaving. If that doesn't help, it uses
11895/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
11896/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
11897/// back together.
11898static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11899 const APInt &Zeroable,
11900 SDValue V1, SDValue V2,
11901 const X86Subtarget &Subtarget,
11902 SelectionDAG &DAG) {
11903 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11904 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11905 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11906
11907 // Try to use shift instructions.
11908 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
11909 Zeroable, Subtarget, DAG))
11910 return Shift;
11911
11912 // Try to use byte rotation instructions.
11913 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11914 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11915 return Rotate;
11916
11917 // Use dedicated pack instructions for masks that match their pattern.
11918 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
11919 Subtarget))
11920 return V;
11921
11922 // Try to use a zext lowering.
11923 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11924 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11925 return ZExt;
11926
11927 // See if we can use SSE4A Extraction / Insertion.
11928 if (Subtarget.hasSSE4A())
11929 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
11930 Zeroable, DAG))
11931 return V;
11932
11933 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
11934
11935 // For single-input shuffles, there are some nicer lowering tricks we can use.
11936 if (NumV2Elements == 0) {
11937 // Check for being able to broadcast a single element.
11938 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11939 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11940 return Broadcast;
11941
11942 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
11943 // Notably, this handles splat and partial-splat shuffles more efficiently.
11944 // However, it only makes sense if the pre-duplication shuffle simplifies
11945 // things significantly. Currently, this means we need to be able to
11946 // express the pre-duplication shuffle as an i16 shuffle.
11947 //
11948 // FIXME: We should check for other patterns which can be widened into an
11949 // i16 shuffle as well.
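// For example, a full splat such as <5, 5, 5, ..., 5> qualifies: every
// adjacent pair of mask entries matches, so after duplicating that byte into
// both halves of an i16 with an UNPCK the whole shuffle becomes a plain
// v8i16 splat.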
11950 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
11951 for (int i = 0; i < 16; i += 2)
11952 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
11953 return false;
11954
11955 return true;
11956 };
11957 auto tryToWidenViaDuplication = [&]() -> SDValue {
11958 if (!canWidenViaDuplication(Mask))
11959 return SDValue();
11960 SmallVector<int, 4> LoInputs;
11961 copy_if(Mask, std::back_inserter(LoInputs),
11962 [](int M) { return M >= 0 && M < 8; });
11963 std::sort(LoInputs.begin(), LoInputs.end());
11964 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
11965 LoInputs.end());
11966 SmallVector<int, 4> HiInputs;
11967 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
11968 std::sort(HiInputs.begin(), HiInputs.end());
11969 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
11970 HiInputs.end());
11971
11972 bool TargetLo = LoInputs.size() >= HiInputs.size();
11973 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
11974 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
11975
11976 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
11977 SmallDenseMap<int, int, 8> LaneMap;
11978 for (int I : InPlaceInputs) {
11979 PreDupI16Shuffle[I/2] = I/2;
11980 LaneMap[I] = I;
11981 }
11982 int j = TargetLo ? 0 : 4, je = j + 4;
11983 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
11984 // Check if j is already a shuffle of this input. This happens when
11985 // there are two adjacent bytes after we move the low one.
11986 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
11987 // If we haven't yet mapped the input, search for a slot into which
11988 // we can map it.
11989 while (j < je && PreDupI16Shuffle[j] >= 0)
11990 ++j;
11991
11992 if (j == je)
11993 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
11994 return SDValue();
11995
11996 // Map this input with the i16 shuffle.
11997 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
11998 }
11999
12000 // Update the lane map based on the mapping we ended up with.
12001 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
12002 }
12003 V1 = DAG.getBitcast(
12004 MVT::v16i8,
12005 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12006 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
12007
12008 // Unpack the bytes to form the i16s that will be shuffled into place.
12009 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
12010 MVT::v16i8, V1, V1);
12011
12012 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
12013 for (int i = 0; i < 16; ++i)
12014 if (Mask[i] >= 0) {
12015 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
12016 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
12017 if (PostDupI16Shuffle[i / 2] < 0)
12018 PostDupI16Shuffle[i / 2] = MappedMask;
12019 else
12020 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
12021 "Conflicting entries in the original shuffle!");
12022 }
12023 return DAG.getBitcast(
12024 MVT::v16i8,
12025 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12026 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
12027 };
12028 if (SDValue V = tryToWidenViaDuplication())
12029 return V;
12030 }
12031
12032 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
12033 Zeroable, DAG))
12034 return Masked;
12035
12036 // Use dedicated unpack instructions for masks that match their pattern.
12037 if (SDValue V =
12038 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
12039 return V;
12040
12041 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
12042 // with PSHUFB. It is important to do this before we attempt to generate any
12043 // blends but after all of the single-input lowerings. If the single input
12044 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
12045 // want to preserve that and we can DAG combine any longer sequences into
12046 // a PSHUFB in the end. But once we start blending from multiple inputs,
12047 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
12048 // and there are *very* few patterns that would actually be faster than the
12049 // PSHUFB approach because of its ability to zero lanes.
12050 //
12051 // FIXME: The only exceptions to the above are blends which are exact
12052 // interleavings with direct instructions supporting them. We currently don't
12053 // handle those well here.
12054 if (Subtarget.hasSSSE3()) {
12055 bool V1InUse = false;
12056 bool V2InUse = false;
12057
12058 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
12059 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
12060
12061 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
12062 // do so. This avoids using them to handle blends-with-zero which is
12063 // important as a single pshufb is significantly faster for that.
12064 if (V1InUse && V2InUse) {
12065 if (Subtarget.hasSSE41())
12066 if (SDValue Blend = lowerVectorShuffleAsBlend(
12067 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12068 return Blend;
12069
12070 // We can use an unpack to do the blending rather than an or in some
12071 // cases. Even though the OR may be (very slightly) more efficient, we
12072 // prefer this lowering because there are common cases where part of
12073 // the complexity of the shuffles goes away when we do the final blend as
12074 // an unpack.
12075 // FIXME: It might be worth trying to detect if the unpack-feeding
12076 // shuffles will both be pshufb, in which case we shouldn't bother with
12077 // this.
12078 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
12079 DL, MVT::v16i8, V1, V2, Mask, DAG))
12080 return Unpack;
12081 }
12082
12083 return PSHUFB;
12084 }
12085
12086 // There are special ways we can lower some single-element blends.
12087 if (NumV2Elements == 1)
12088 if (SDValue V = lowerVectorShuffleAsElementInsertion(
12089 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12090 return V;
12091
12092 if (SDValue BitBlend =
12093 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
12094 return BitBlend;
12095
12096 // Check whether a compaction lowering can be done. This handles shuffles
12097 // which take every Nth element for some even N. See the helper function for
12098 // details.
12099 //
12100 // We special case these as they can be particularly efficiently handled with
12101 // the PACKUSWB instruction on x86 and they show up in common patterns of
12102 // rearranging bytes to truncate wide elements.
12103 bool IsSingleInput = V2.isUndef();
12104 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
12105 // NumEvenDrops is the power of two stride of the elements. Another way of
12106 // thinking about it is that we need to drop the even elements this many
12107 // times to get the original input.
12108
12109 // First we need to zero all the dropped bytes.
12110 assert(NumEvenDrops <= 3 &&
12111 "No support for dropping even elements more than 3 times.");
12112 // We use the mask type to pick which bytes are preserved based on how many
12113 // elements are dropped.
12114 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
12115 SDValue ByteClearMask = DAG.getBitcast(
12116 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
12117 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
12118 if (!IsSingleInput)
12119 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
12120
12121 // Now pack things back together.
12122 V1 = DAG.getBitcast(MVT::v8i16, V1);
12123 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
12124 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
12125 for (int i = 1; i < NumEvenDrops; ++i) {
12126 Result = DAG.getBitcast(MVT::v8i16, Result);
12127 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
12128 }
12129
12130 return Result;
12131 }
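The compaction is easiest to see with concrete bytes. Below is a minimal standalone sketch of one AND + PACKUSWB step in plain scalar C++, with invented helper names and assuming x86's little-endian byte-within-word layout:

#include <array>
#include <cstdint>

static uint8_t satU8(uint16_t V) { return V > 0xFF ? 0xFF : uint8_t(V); }

// One compaction step for the byte mask <0,2,4,...,30>: clearing the odd
// bytes is the AND with a 0x00FF-per-word mask; viewed as v8i16, word i of
// each input then holds byte 2*i, and PACKUSWB packs the words back to bytes.
static std::array<uint8_t, 16> dropOddBytes(const std::array<uint8_t, 16> &A,
                                            const std::array<uint8_t, 16> &B) {
  std::array<uint16_t, 8> AWords{}, BWords{};
  for (int i = 0; i < 8; ++i) {
    AWords[i] = A[2 * i];
    BWords[i] = B[2 * i];
  }
  // Unsigned saturation never kicks in here because the high bytes are zero.
  std::array<uint8_t, 16> R{};
  for (int i = 0; i < 8; ++i) {
    R[i] = satU8(AWords[i]);     // = A[0], A[2], ..., A[14]
    R[i + 8] = satU8(BWords[i]); // = B[0], B[2], ..., B[14]
  }
  return R;
}

int main() {
  std::array<uint8_t, 16> A{}, B{};
  for (int i = 0; i < 16; ++i) {
    A[i] = uint8_t(i);
    B[i] = uint8_t(16 + i);
  }
  std::array<uint8_t, 16> R = dropOddBytes(A, B);
  return (R[1] == 2 && R[8] == 16) ? 0 : 1; // R = {0,2,...,14,16,18,...,30}
}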
12132
12133 // Handle multi-input cases by blending single-input shuffles.
12134 if (NumV2Elements > 0)
12135 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
12136 Mask, DAG);
12137
12138 // The fallback path for single-input shuffles widens this into two v8i16
12139 // vectors with unpacks, shuffles those, and then pulls them back together
12140 // with a pack.
12141 SDValue V = V1;
12142
12143 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12144 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12145 for (int i = 0; i < 16; ++i)
12146 if (Mask[i] >= 0)
12147 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
12148
12149 SDValue VLoHalf, VHiHalf;
12150 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
12151 // them out and avoid using UNPCK{L,H} to extract the elements of V as
12152 // i16s.
12153 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
12154 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
12155 // Use a mask to drop the high bytes.
12156 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
12157 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
12158 DAG.getConstant(0x00FF, DL, MVT::v8i16));
12159
12160 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
12161 VHiHalf = DAG.getUNDEF(MVT::v8i16);
12162
12163 // Squash the masks to point directly into VLoHalf.
12164 for (int &M : LoBlendMask)
12165 if (M >= 0)
12166 M /= 2;
12167 for (int &M : HiBlendMask)
12168 if (M >= 0)
12169 M /= 2;
12170 } else {
12171 // Otherwise just unpack the low half of V into VLoHalf and the high half into
12172 // VHiHalf so that we can blend them as i16s.
12173 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
12174
12175 VLoHalf = DAG.getBitcast(
12176 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
12177 VHiHalf = DAG.getBitcast(
12178 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
12179 }
12180
12181 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
12182 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
12183
12184 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
12185}
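As a worked instance of the fallback's mask bookkeeping, consider the all-even mask <0,2,4,...,14>: no odd byte is referenced, so the high bytes can be masked off and the indices squashed to address the v8i16 view directly. A standalone sketch of just that mask arithmetic (plain C++, no DAG types):

#include <array>
#include <cassert>

int main() {
  // v16i8 mask taking the even bytes of V1 into the low 8 result bytes.
  std::array<int, 16> Mask;
  Mask.fill(-1);
  for (int i = 0; i < 8; ++i)
    Mask[i] = 2 * i;

  std::array<int, 8> LoBlendMask, HiBlendMask;
  LoBlendMask.fill(-1);
  HiBlendMask.fill(-1);
  for (int i = 0; i < 16; ++i)
    if (Mask[i] >= 0)
      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

  // No odd byte is used, so squash the indices to point into the v8i16 view.
  for (int &M : LoBlendMask)
    if (M >= 0)
      M /= 2;
  assert((LoBlendMask == std::array<int, 8>{0, 1, 2, 3, 4, 5, 6, 7}));
  assert((HiBlendMask == std::array<int, 8>{-1, -1, -1, -1, -1, -1, -1, -1}));
  return 0;
}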
12186
12187/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
12188///
12189/// This routine breaks down the specific type of 128-bit shuffle and
12190/// dispatches to the lowering routines accordingly.
12191static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12192 MVT VT, SDValue V1, SDValue V2,
12193 const APInt &Zeroable,
12194 const X86Subtarget &Subtarget,
12195 SelectionDAG &DAG) {
12196 switch (VT.SimpleTy) {
12197 case MVT::v2i64:
12198 return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12199 case MVT::v2f64:
12200 return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12201 case MVT::v4i32:
12202 return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12203 case MVT::v4f32:
12204 return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12205 case MVT::v8i16:
12206 return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12207 case MVT::v16i8:
12208 return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12209
12210 default:
12211 llvm_unreachable("Unimplemented!");
12212 }
12213}
12214
12215/// \brief Generic routine to split vector shuffle into half-sized shuffles.
12216///
12217/// This routine just extracts two subvectors, shuffles them independently, and
12218/// then concatenates them back together. This should work effectively with all
12219/// AVX vector shuffle types.
12220static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12221 SDValue V2, ArrayRef<int> Mask,
12222 SelectionDAG &DAG) {
12223 assert(VT.getSizeInBits() >= 256 &&
12224 "Only for 256-bit or wider vector shuffles!");
12225 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
12226 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
12227
12228 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
12229 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
12230
12231 int NumElements = VT.getVectorNumElements();
12232 int SplitNumElements = NumElements / 2;
12233 MVT ScalarVT = VT.getVectorElementType();
12234 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
12235
12236 // Rather than splitting build-vectors, just build two narrower build
12237 // vectors. This helps shuffling with splats and zeros.
12238 auto SplitVector = [&](SDValue V) {
12239 V = peekThroughBitcasts(V);
12240
12241 MVT OrigVT = V.getSimpleValueType();
12242 int OrigNumElements = OrigVT.getVectorNumElements();
12243 int OrigSplitNumElements = OrigNumElements / 2;
12244 MVT OrigScalarVT = OrigVT.getVectorElementType();
12245 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
12246
12247 SDValue LoV, HiV;
12248
12249 auto *BV = dyn_cast<BuildVectorSDNode>(V);
12250 if (!BV) {
12251 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12252 DAG.getIntPtrConstant(0, DL));
12253 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12254 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
12255 } else {
12256
12257 SmallVector<SDValue, 16> LoOps, HiOps;
12258 for (int i = 0; i < OrigSplitNumElements; ++i) {
12259 LoOps.push_back(BV->getOperand(i));
12260 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
12261 }
12262 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
12263 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
12264 }
12265 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
12266 DAG.getBitcast(SplitVT, HiV));
12267 };
12268
12269 SDValue LoV1, HiV1, LoV2, HiV2;
12270 std::tie(LoV1, HiV1) = SplitVector(V1);
12271 std::tie(LoV2, HiV2) = SplitVector(V2);
12272
12273 // Now create two 4-way blends of these half-width vectors.
12274 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
12275 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
12276 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
12277 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
12278 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
12279 for (int i = 0; i < SplitNumElements; ++i) {
12280 int M = HalfMask[i];
12281 if (M >= NumElements) {
12282 if (M >= NumElements + SplitNumElements)
12283 UseHiV2 = true;
12284 else
12285 UseLoV2 = true;
12286 V2BlendMask[i] = M - NumElements;
12287 BlendMask[i] = SplitNumElements + i;
12288 } else if (M >= 0) {
12289 if (M >= SplitNumElements)
12290 UseHiV1 = true;
12291 else
12292 UseLoV1 = true;
12293 V1BlendMask[i] = M;
12294 BlendMask[i] = i;
12295 }
12296 }
12297
12298 // Because the lowering happens after all combining takes place, we need to
12299 // manually combine these blend masks as much as possible so that we create
12300 // a minimal number of high-level vector shuffle nodes.
12301
12302 // First try just blending the halves of V1 or V2.
12303 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
12304 return DAG.getUNDEF(SplitVT);
12305 if (!UseLoV2 && !UseHiV2)
12306 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12307 if (!UseLoV1 && !UseHiV1)
12308 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12309
12310 SDValue V1Blend, V2Blend;
12311 if (UseLoV1 && UseHiV1) {
12312 V1Blend =
12313 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12314 } else {
12315 // We only use half of V1 so map the usage down into the final blend mask.
12316 V1Blend = UseLoV1 ? LoV1 : HiV1;
12317 for (int i = 0; i < SplitNumElements; ++i)
12318 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
12319 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
12320 }
12321 if (UseLoV2 && UseHiV2) {
12322 V2Blend =
12323 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12324 } else {
12325 // We only use half of V2 so map the usage down into the final blend mask.
12326 V2Blend = UseLoV2 ? LoV2 : HiV2;
12327 for (int i = 0; i < SplitNumElements; ++i)
12328 if (BlendMask[i] >= SplitNumElements)
12329 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
12330 }
12331 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
12332 };
12333 SDValue Lo = HalfBlend(LoMask);
12334 SDValue Hi = HalfBlend(HiMask);
12335 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
12336}
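A worked instance of the HalfBlend mask arithmetic: for the v8f32 interleave mask {0,8,1,9,4,12,5,13}, each half only touches the low halves of V1 and V2, so the 4-way blend collapses to a single shuffle of LoV1/LoV2. A standalone sketch of that computation (plain C++, mask math only):

#include <array>
#include <cassert>

int main() {
  constexpr int NumElements = 8, SplitNumElements = 4;
  // Low half of the v8f32 mask {0,8,1,9,4,12,5,13}.
  std::array<int, 4> HalfMask = {0, 8, 1, 9};

  std::array<int, 4> V1BlendMask, V2BlendMask, BlendMask;
  V1BlendMask.fill(-1);
  V2BlendMask.fill(-1);
  BlendMask.fill(-1);
  for (int i = 0; i < SplitNumElements; ++i) {
    int M = HalfMask[i];
    if (M >= NumElements) {            // element of V2
      V2BlendMask[i] = M - NumElements;
      BlendMask[i] = SplitNumElements + i;
    } else if (M >= 0) {               // element of V1
      V1BlendMask[i] = M;
      BlendMask[i] = i;
    }
  }

  // Only LoV1 and LoV2 are referenced, so map the combined mask down onto
  // those two half-width operands.
  for (int i = 0; i < SplitNumElements; ++i) {
    if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
      BlendMask[i] = V1BlendMask[i];
    else if (BlendMask[i] >= SplitNumElements)
      BlendMask[i] = V2BlendMask[i] + SplitNumElements;
  }
  assert((BlendMask == std::array<int, 4>{0, 4, 1, 5}));
  return 0;
}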
12337
12338/// \brief Either split a vector in halves or decompose the shuffles and the
12339/// blend.
12340///
12341/// This is provided as a good fallback for many lowerings of non-single-input
12342/// shuffles with more than one 128-bit lane. In those cases, we want to select
12343/// between splitting the shuffle into 128-bit components and stitching those
12344/// back together vs. extracting the single-input shuffles and blending those
12345/// results.
12346static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
12347 SDValue V1, SDValue V2,
12348 ArrayRef<int> Mask,
12349 SelectionDAG &DAG) {
12350 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
12351 "shuffles as it could then recurse on itself.");
12352 int Size = Mask.size();
12353
12354 // If this can be modeled as a broadcast of two elements followed by a blend,
12355 // prefer that lowering. This is especially important because broadcasts can
12356 // often fold with memory operands.
12357 auto DoBothBroadcast = [&] {
12358 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
12359 for (int M : Mask)
12360 if (M >= Size) {
12361 if (V2BroadcastIdx < 0)
12362 V2BroadcastIdx = M - Size;
12363 else if (M - Size != V2BroadcastIdx)
12364 return false;
12365 } else if (M >= 0) {
12366 if (V1BroadcastIdx < 0)
12367 V1BroadcastIdx = M;
12368 else if (M != V1BroadcastIdx)
12369 return false;
12370 }
12371 return true;
12372 };
12373 if (DoBothBroadcast())
12374 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
12375 DAG);
12376
12377 // If the inputs all stem from a single 128-bit lane of each input, then we
12378 // split them rather than blending because the split will decompose to
12379 // unusually few instructions.
12380 int LaneCount = VT.getSizeInBits() / 128;
12381 int LaneSize = Size / LaneCount;
12382 SmallBitVector LaneInputs[2];
12383 LaneInputs[0].resize(LaneCount, false);
12384 LaneInputs[1].resize(LaneCount, false);
12385 for (int i = 0; i < Size; ++i)
12386 if (Mask[i] >= 0)
12387 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
12388 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
12389 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12390
12391 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
12392 // that the decomposed single-input shuffles don't end up here.
12393 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
12394}
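The DoBothBroadcast test above accepts a mask exactly when every V1 reference names a single element and every V2 reference names a single element. A standalone copy of that predicate over a plain vector, with a small usage check:

#include <cassert>
#include <vector>

static bool doBothBroadcast(const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  int V1Idx = -1, V2Idx = -1;
  for (int M : Mask) {
    if (M >= Size) {
      if (V2Idx < 0)
        V2Idx = M - Size;
      else if (M - Size != V2Idx)
        return false;
    } else if (M >= 0) {
      if (V1Idx < 0)
        V1Idx = M;
      else if (M != V1Idx)
        return false;
    }
  }
  return true;
}

int main() {
  // v8: every V1 slot reads element 3, every V2 slot reads element 8+5.
  assert(doBothBroadcast({3, 13, 3, 13, 3, 13, 3, 13}));
  // Element 2 of V1 breaks the single-element requirement.
  assert(!doBothBroadcast({3, 13, 2, 13, 3, 13, 3, 13}));
  return 0;
}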
12395
12396/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
12397/// a permutation and blend of those lanes.
12398///
12399/// This essentially blends the out-of-lane inputs to each lane into the lane
12400/// from a permuted copy of the vector. This lowering strategy results in four
12401/// instructions in the worst case for a single-input cross lane shuffle which
12402/// is lower than any other fully general cross-lane shuffle strategy I'm aware
12403/// of. Special cases for each particular shuffle pattern should be handled
12404/// prior to trying this lowering.
12405static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
12406 SDValue V1, SDValue V2,
12407 ArrayRef<int> Mask,
12408 SelectionDAG &DAG,
12409 const X86Subtarget &Subtarget) {
12410 // FIXME: This should probably be generalized for 512-bit vectors as well.
12411 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
12412 int Size = Mask.size();
12413 int LaneSize = Size / 2;
12414
12415 // If there are only inputs from one 128-bit lane, splitting will in fact be
12416 // less expensive. The flags track whether the given lane contains an element
12417 // that crosses to another lane.
12418 if (!Subtarget.hasAVX2()) {
12419 bool LaneCrossing[2] = {false, false};
12420 for (int i = 0; i < Size; ++i)
12421 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
12422 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
12423 if (!LaneCrossing[0] || !LaneCrossing[1])
12424 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12425 } else {
12426 bool LaneUsed[2] = {false, false};
12427 for (int i = 0; i < Size; ++i)
12428 if (Mask[i] >= 0)
12429 LaneUsed[(Mask[i] / LaneSize)] = true;
12430 if (!LaneUsed[0] || !LaneUsed[1])
12431 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12432 }
12433
12434 assert(V2.isUndef() &&
12435 "This last part of this routine only works on single input shuffles");
12436
12437 SmallVector<int, 32> FlippedBlendMask(Size);
12438 for (int i = 0; i < Size; ++i)
12439 FlippedBlendMask[i] =
12440 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
12441 ? Mask[i]
12442 : Mask[i] % LaneSize +
12443 (i / LaneSize) * LaneSize + Size);
12444
12445 // Flip the vector, and blend the results which should now be in-lane.
12446 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
12447 SDValue Flipped = DAG.getBitcast(PVT, V1);
12448 Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
12449 { 2, 3, 0, 1 });
12450 Flipped = DAG.getBitcast(VT, Flipped);
12451 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
12452}
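A worked instance of the FlippedBlendMask formula: for the single-input v4f64 mask {2,1,3,0}, the lane-crossing elements 0 and 3 are redirected to the lane-swapped copy (indices 4 and 6) while elements 1 and 2 stay in-lane. A standalone sketch (plain C++, mask math only):

#include <array>
#include <cassert>

int main() {
  constexpr int Size = 4, LaneSize = 2;
  std::array<int, 4> Mask = {2, 1, 3, 0};
  std::array<int, 4> Flipped{};
  for (int i = 0; i < Size; ++i)
    Flipped[i] = Mask[i] < 0
                     ? -1
                     : ((Mask[i] % Size) / LaneSize == i / LaneSize
                            ? Mask[i]
                            : Mask[i] % LaneSize + (i / LaneSize) * LaneSize + Size);
  // Elements 0 and 3 cross lanes, so they come from the flipped vector
  // (indices 4 and 6); elements 1 and 2 stay in-lane and come from V1.
  assert((Flipped == std::array<int, 4>{4, 1, 3, 6}));
  return 0;
}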
12453
12454/// \brief Handle lowering 2-lane 128-bit shuffles.
12455static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12456 SDValue V2, ArrayRef<int> Mask,
12457 const APInt &Zeroable,
12458 const X86Subtarget &Subtarget,
12459 SelectionDAG &DAG) {
12460 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
12461 if (Subtarget.hasAVX2() && V2.isUndef())
12462 return SDValue();
12463
12464 SmallVector<int, 4> WidenedMask;
12465 if (!canWidenShuffleElements(Mask, WidenedMask))
12466 return SDValue();
12467
12468 // TODO: If minimizing size and one of the inputs is a zero vector and the
12469 // zero vector has only one use, we could use a VPERM2X128 to save the
12470 // instruction bytes needed to explicitly generate the zero vector.
12471
12472 // Blends are faster and handle all the non-lane-crossing cases.
12473 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
12474 Zeroable, Subtarget, DAG))
12475 return Blend;
12476
12477 bool IsLowZero = (Zeroable & 0x3) == 0x3;
12478 bool IsHighZero = (Zeroable & 0xc) == 0xc;
12479
12480 // If either input operand is a zero vector, use VPERM2X128 because its mask
12481 // allows us to replace the zero input with an implicit zero.
12482 if (!IsLowZero && !IsHighZero) {
12483 // Check for patterns which can be matched with a single insert of a 128-bit
12484 // subvector.
12485 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
12486 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
12487
12488 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
12489 // this will likely become vinsertf128 which can't fold a 256-bit memop.
12490 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
12491 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
12492 VT.getVectorNumElements() / 2);
12493 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
12494 DAG.getIntPtrConstant(0, DL));
12495 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
12496 OnlyUsesV1 ? V1 : V2,
12497 DAG.getIntPtrConstant(0, DL));
12498 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
12499 }
12500 }
12501
12502 // Try to use SHUF128 if possible.
12503 if (Subtarget.hasVLX()) {
12504 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
12505 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
12506 ((WidenedMask[1] % 2) << 1);
12507 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
12508 DAG.getConstant(PermMask, DL, MVT::i8));
12509 }
12510 }
12511 }
12512
12513 // Otherwise form a 128-bit permutation. After accounting for undefs,
12514 // convert the 64-bit shuffle mask selection values into 128-bit
12515 // selection bits by dividing the indexes by 2 and shifting into positions
12516 // defined by a vperm2*128 instruction's immediate control byte.
12517
12518 // The immediate permute control byte looks like this:
12519 // [1:0] - select 128 bits from sources for low half of destination
12520 // [2] - ignore
12521 // [3] - zero low half of destination
12522 // [5:4] - select 128 bits from sources for high half of destination
12523 // [6] - ignore
12524 // [7] - zero high half of destination
12525
12526 assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?");
12527
12528 unsigned PermMask = 0;
12529 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
12530 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
12531
12532 // Check the immediate mask and replace unused sources with undef.
12533 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
12534 V1 = DAG.getUNDEF(VT);
12535 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
12536 V2 = DAG.getUNDEF(VT);
12537
12538 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
12539 DAG.getConstant(PermMask, DL, MVT::i8));
12540}
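A worked instance of the immediate construction: the v4f64 mask {2,3,4,5} widens to the 128-bit lane mask {1,2}, nothing is zeroable, and the resulting vperm2f128/vperm2i128 immediate is 0x21. A standalone sketch (plain C++):

#include <cassert>

int main() {
  // Widened 128-bit mask over the concatenation V1.lo, V1.hi, V2.lo, V2.hi.
  int WidenedMask0 = 1, WidenedMask1 = 2;
  bool IsLowZero = false, IsHighZero = false;
  unsigned PermMask = 0;
  PermMask |= IsLowZero ? 0x08 : unsigned(WidenedMask0) << 0;
  PermMask |= IsHighZero ? 0x80 : unsigned(WidenedMask1) << 4;
  assert(PermMask == 0x21); // low half <- V1.hi, high half <- V2.lo
  return 0;
}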
12541
12542/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
12543/// shuffling each lane.
12544///
12545 /// This will only succeed when fixing the 128-bit lanes results in a
12546 /// single-input non-lane-crossing shuffle with a repeating shuffle mask in
12547 /// each 128-bit lane. This handles many cases where we can quickly blend away
12548/// the lane crosses early and then use simpler shuffles within each lane.
12549///
12550/// FIXME: It might be worthwhile at some point to support this without
12551/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
12552/// in x86 only floating point has interesting non-repeating shuffles, and even
12553/// those are still *marginally* more expensive.
12554static SDValue lowerVectorShuffleByMerging128BitLanes(
12555 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12556 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12557 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
12558
12559 int Size = Mask.size();
12560 int LaneSize = 128 / VT.getScalarSizeInBits();
12561 int NumLanes = Size / LaneSize;
12562 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
12563
12564 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
12565 // check whether the in-128-bit lane shuffles share a repeating pattern.
12566 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
12567 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
12568 for (int i = 0; i < Size; ++i) {
12569 if (Mask[i] < 0)
12570 continue;
12571
12572 int j = i / LaneSize;
12573
12574 if (Lanes[j] < 0) {
12575 // First entry we've seen for this lane.
12576 Lanes[j] = Mask[i] / LaneSize;
12577 } else if (Lanes[j] != Mask[i] / LaneSize) {
12578 // This doesn't match the lane selected previously!
12579 return SDValue();
12580 }
12581
12582 // Check that within each lane we have a consistent shuffle mask.
12583 int k = i % LaneSize;
12584 if (InLaneMask[k] < 0) {
12585 InLaneMask[k] = Mask[i] % LaneSize;
12586 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
12587 // This doesn't fit a repeating in-lane mask.
12588 return SDValue();
12589 }
12590 }
12591
12592 // First shuffle the lanes into place.
12593 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
12594 VT.getSizeInBits() / 64);
12595 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
12596 for (int i = 0; i < NumLanes; ++i)
12597 if (Lanes[i] >= 0) {
12598 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
12599 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
12600 }
12601
12602 V1 = DAG.getBitcast(LaneVT, V1);
12603 V2 = DAG.getBitcast(LaneVT, V2);
12604 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
12605
12606 // Cast it back to the type we actually want.
12607 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
12608
12609 // Now do a simple shuffle that isn't lane crossing.
12610 SmallVector<int, 8> NewMask((unsigned)Size, -1);
12611 for (int i = 0; i < Size; ++i)
12612 if (Mask[i] >= 0)
12613 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
12614 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
12615 "Must not introduce lane crosses at this point!");
12616
12617 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
12618}
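A worked instance of the lane merging: for the two-input v8f32 mask {6,7,4,5,10,11,8,9}, every destination lane pulls from a single source lane and the in-lane pattern {2,3,0,1} repeats, so the shuffle decomposes into a v4x64 lane shuffle {2,3,4,5} followed by an in-lane shuffle. A standalone sketch of the mask arithmetic for this particular mask (it assumes, rather than checks, the consistency that the routine above verifies):

#include <array>
#include <cassert>

int main() {
  constexpr int Size = 8, LaneSize = 4, NumLanes = 2;
  std::array<int, 8> Mask = {6, 7, 4, 5, 10, 11, 8, 9};

  std::array<int, NumLanes> Lanes;  // source lane per destination lane
  std::array<int, LaneSize> InLane; // repeated in-lane pattern
  Lanes.fill(-1);
  InLane.fill(-1);
  for (int i = 0; i < Size; ++i) {
    Lanes[i / LaneSize] = Mask[i] / LaneSize;
    InLane[i % LaneSize] = Mask[i] % LaneSize;
  }
  assert((Lanes == std::array<int, 2>{1, 2}));        // V1.hi then V2.lo
  assert((InLane == std::array<int, 4>{2, 3, 0, 1})); // same pattern per lane

  // Final in-lane mask applied after the 64-bit lane shuffle {2,3,4,5}.
  std::array<int, 8> NewMask{};
  for (int i = 0; i < Size; ++i)
    NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
  assert((NewMask == std::array<int, 8>{2, 3, 0, 1, 6, 7, 4, 5}));
  return 0;
}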
12619
12620/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
12621/// This allows for fast cases such as subvector extraction/insertion
12622/// or shuffling smaller vector types which can lower more efficiently.
12623static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
12624 SDValue V1, SDValue V2,
12625 ArrayRef<int> Mask,
12626 const X86Subtarget &Subtarget,
12627 SelectionDAG &DAG) {
12628 assert((VT.is256BitVector() || VT.is512BitVector()) &&
12629 "Expected 256-bit or 512-bit vector");
12630
12631 unsigned NumElts = VT.getVectorNumElements();
12632 unsigned HalfNumElts = NumElts / 2;
12633 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
12634
12635 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
12636 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
12637 if (!UndefLower && !UndefUpper)
12638 return SDValue();
12639
12640 // Upper half is undef and lower half is whole upper subvector.
12641 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
12642 if (UndefUpper &&
12643 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
12644 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12645 DAG.getIntPtrConstant(HalfNumElts, DL));
12646 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12647 DAG.getIntPtrConstant(0, DL));
12648 }
12649
12650 // Lower half is undef and upper half is whole lower subvector.
12651 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
12652 if (UndefLower &&
12653 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
12654 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12655 DAG.getIntPtrConstant(0, DL));
12656 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12657 DAG.getIntPtrConstant(HalfNumElts, DL));
12658 }
12659
12660 // If the shuffle only uses two of the four halves of the input operands,
12661 // then extract them and perform the 'half' shuffle at half width.
12662 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
12663 int HalfIdx1 = -1, HalfIdx2 = -1;
12664 SmallVector<int, 8> HalfMask(HalfNumElts);
12665 unsigned Offset = UndefLower ? HalfNumElts : 0;
12666 for (unsigned i = 0; i != HalfNumElts; ++i) {
12667 int M = Mask[i + Offset];
12668 if (M < 0) {
12669 HalfMask[i] = M;
12670 continue;
12671 }
12672
12673 // Determine which of the 4 half vectors this element is from.
12674 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
12675 int HalfIdx = M / HalfNumElts;
12676
12677 // Determine the element index into its half vector source.
12678 int HalfElt = M % HalfNumElts;
12679
12680 // We can shuffle with up to 2 half vectors, set the new 'half'
12681 // shuffle mask accordingly.
12682 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
12683 HalfMask[i] = HalfElt;
12684 HalfIdx1 = HalfIdx;
12685 continue;
12686 }
12687 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
12688 HalfMask[i] = HalfElt + HalfNumElts;
12689 HalfIdx2 = HalfIdx;
12690 continue;
12691 }
12692
12693 // Too many half vectors referenced.
12694 return SDValue();
12695 }
12696 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
12697
12698 // Only shuffle the halves of the inputs when useful.
12699 int NumLowerHalves =
12700 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
12701 int NumUpperHalves =
12702 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
12703
12704 // uuuuXXXX - don't extract uppers just to insert again.
12705 if (UndefLower && NumUpperHalves != 0)
12706 return SDValue();
12707
12708 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
12709 if (UndefUpper && NumUpperHalves == 2)
12710 return SDValue();
12711
12712 // AVX2 - XXXXuuuu - always extract lowers.
12713 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
12714 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
12715 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12716 return SDValue();
12717 // AVX2 supports variable 32-bit element cross-lane shuffles.
12718 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
12719 // XXXXuuuu - don't extract lowers and uppers.
12720 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
12721 return SDValue();
12722 }
12723 }
12724
12725 // AVX512 - XXXXuuuu - always extract lowers.
12726 if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
12727 return SDValue();
12728
12729 auto GetHalfVector = [&](int HalfIdx) {
12730 if (HalfIdx < 0)
12731 return DAG.getUNDEF(HalfVT);
12732 SDValue V = (HalfIdx < 2 ? V1 : V2);
12733 HalfIdx = (HalfIdx % 2) * HalfNumElts;
12734 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
12735 DAG.getIntPtrConstant(HalfIdx, DL));
12736 };
12737
12738 SDValue Half1 = GetHalfVector(HalfIdx1);
12739 SDValue Half2 = GetHalfVector(HalfIdx2);
12740 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
12741 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
12742 DAG.getIntPtrConstant(Offset, DL));
12743}
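A worked instance of the half-tracking loop: for the v8f32 mask <0,9,3,10,u,u,u,u>, only the lower halves of V1 and V2 are referenced (HalfIdx1 = 0, HalfIdx2 = 2), so the shuffle can be done at v4f32 width with mask {0,5,3,6} and then reinserted. A standalone sketch for this particular mask:

#include <array>
#include <cassert>

int main() {
  constexpr int HalfNumElts = 4;
  std::array<int, 4> MaskLow = {0, 9, 3, 10}; // defined half of <0,9,3,10,u,u,u,u>

  int HalfIdx1 = -1, HalfIdx2 = -1;
  std::array<int, 4> HalfMask{};
  for (int i = 0; i < HalfNumElts; ++i) {
    int M = MaskLow[i];
    int HalfIdx = M / HalfNumElts; // 0 = V1.lo, 1 = V1.hi, 2 = V2.lo, 3 = V2.hi
    int HalfElt = M % HalfNumElts;
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
    } else {
      assert(HalfIdx2 < 0 || HalfIdx2 == HalfIdx); // at most two halves allowed
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
    }
  }
  assert(HalfIdx1 == 0 && HalfIdx2 == 2); // V1.lo and V2.lo only
  assert((HalfMask == std::array<int, 4>{0, 5, 3, 6}));
  return 0;
}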
12744
12745/// \brief Test whether the specified input (0 or 1) is in-place blended by the
12746/// given mask.
12747///
12748 /// This returns true if the elements from a particular input are already in the
12749 /// slots required by the given mask and require no permutation.
12750static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12751 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12752 int Size = Mask.size();
12753 for (int i = 0; i < Size; ++i)
12754 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12755 return false;
12756
12757 return true;
12758}
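A small usage sketch of the in-place test, as a standalone copy of the predicate over a plain vector: for {0,1,7,6} on a 4-element type, input 0 is already in place but input 1 is not, because element 7 would have to move from slot 3 to slot 2.

#include <cassert>
#include <vector>

static bool isInputInPlace(int Input, const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;
  return true;
}

int main() {
  std::vector<int> Mask = {0, 1, 7, 6};
  assert(isInputInPlace(0, Mask));  // V1 elements 0 and 1 stay put
  assert(!isInputInPlace(1, Mask)); // V2 element 7 must move from slot 3 to 2
  return 0;
}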
12759
12760/// Handle case where shuffle sources are coming from the same 128-bit lane and
12761/// every lane can be represented as the same repeating mask - allowing us to
12762/// shuffle the sources with the repeating shuffle and then permute the result
12763/// to the destination lanes.
12764static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
12765 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12766 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12767 int NumElts = VT.getVectorNumElements();
12768 int NumLanes = VT.getSizeInBits() / 128;
12769 int NumLaneElts = NumElts / NumLanes;
12770
12771 // On AVX2 we may be able to just shuffle the lowest elements and then
12772 // broadcast the result.
12773 if (Subtarget.hasAVX2()) {
12774 for (unsigned BroadcastSize : {16, 32, 64}) {
12775 if (BroadcastSize <= VT.getScalarSizeInBits())
12776 continue;
12777 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
12778
12779 // Attempt to match a repeating pattern every NumBroadcastElts,
12780 // accounting for UNDEFs but only referencing the lowest 128-bit
12781 // lane of the inputs.
12782 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
12783 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12784 for (int j = 0; j != NumBroadcastElts; ++j) {
12785 int M = Mask[i + j];
12786 if (M < 0)
12787 continue;
12788 int &R = RepeatMask[j];
12789 if (0 != ((M % NumElts) / NumLaneElts))
12790 return false;
12791 if (0 <= R && R != M)
12792 return false;
12793 R = M;
12794 }
12795 return true;
12796 };
12797
12798 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
12799 if (!FindRepeatingBroadcastMask(RepeatMask))
12800 continue;
12801
12802 // Shuffle the (lowest) repeated elements in place for broadcast.
12803 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
12804
12805 // Shuffle the actual broadcast.
12806 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
12807 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12808 for (int j = 0; j != NumBroadcastElts; ++j)
12809 BroadcastMask[i + j] = j;
12810 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
12811 BroadcastMask);
12812 }
12813 }
12814
12815 // Bail if the shuffle mask doesn't cross 128-bit lanes.
12816 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
12817 return SDValue();
12818
12819 // Bail if we already have a repeated lane shuffle mask.
12820 SmallVector<int, 8> RepeatedShuffleMask;
12821 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
12822 return SDValue();
12823
12824 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
12825 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
12826 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
12827 int NumSubLanes = NumLanes * SubLaneScale;
12828 int NumSubLaneElts = NumLaneElts / SubLaneScale;
12829
12830 // Check that all the sources are coming from the same lane and see if we can
12831 // form a repeating shuffle mask (local to each sub-lane). At the same time,
12832 // determine the source sub-lane for each destination sub-lane.
12833 int TopSrcSubLane = -1;
12834 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
12835 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
12836 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
12837 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
12838
12839 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
12840 // Extract the sub-lane mask, check that it all comes from the same lane
12841 // and normalize the mask entries to come from the first lane.
12842 int SrcLane = -1;
12843 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
12844 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12845 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
12846 if (M < 0)
12847 continue;
12848 int Lane = (M % NumElts) / NumLaneElts;
12849 if ((0 <= SrcLane) && (SrcLane != Lane))
12850 return SDValue();
12851 SrcLane = Lane;
12852 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
12853 SubLaneMask[Elt] = LocalM;
12854 }
12855
12856 // Whole sub-lane is UNDEF.
12857 if (SrcLane < 0)
12858 continue;
12859
12860 // Attempt to match against the candidate repeated sub-lane masks.
12861 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
12862 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
12863 for (int i = 0; i != NumSubLaneElts; ++i) {
12864 if (M1[i] < 0 || M2[i] < 0)
12865 continue;
12866 if (M1[i] != M2[i])
12867 return false;
12868 }
12869 return true;
12870 };
12871
12872 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
12873 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
12874 continue;
12875
12876 // Merge the sub-lane mask into the matching repeated sub-lane mask.
12877 for (int i = 0; i != NumSubLaneElts; ++i) {
12878 int M = SubLaneMask[i];
12879 if (M < 0)
12880 continue;
12881 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
12882 "Unexpected mask element");
12883 RepeatedSubLaneMask[i] = M;
12884 }
12885
12886 // Track the top most source sub-lane - by setting the remaining to UNDEF
12887 // we can greatly simplify shuffle matching.
12888 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
12889 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
12890 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
12891 break;
12892 }
12893
12894 // Bail if we failed to find a matching repeated sub-lane mask.
12895 if (Dst2SrcSubLanes[DstSubLane] < 0)
12896 return SDValue();
12897 }
12898 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
12899 "Unexpected source lane");
12900
12901 // Create a repeating shuffle mask for the entire vector.
12902 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
12903 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
12904 int Lane = SubLane / SubLaneScale;
12905 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
12906 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12907 int M = RepeatedSubLaneMask[Elt];
12908 if (M < 0)
12909 continue;
12910 int Idx = (SubLane * NumSubLaneElts) + Elt;
12911 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
12912 }
12913 }
12914 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
12915
12916 // Shuffle each source sub-lane to its destination.
12917 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
12918 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
12919 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
12920 if (SrcSubLane < 0)
12921 continue;
12922 for (int j = 0; j != NumSubLaneElts; ++j)
12923 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
12924 }
12925
12926 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
12927 SubLaneMask);
12928}
12929
12930static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
12931 unsigned &ShuffleImm,
12932 ArrayRef<int> Mask) {
12933 int NumElts = VT.getVectorNumElements();
12934 assert(VT.getScalarSizeInBits() == 64 &&
12935 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
12936 "Unexpected data type for VSHUFPD");
12937
12938 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
12939 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
12940 ShuffleImm = 0;
12941 bool ShufpdMask = true;
12942 bool CommutableMask = true;
12943 for (int i = 0; i < NumElts; ++i) {
12944 if (Mask[i] == SM_SentinelUndef)
12945 continue;
12946 if (Mask[i] < 0)
12947 return false;
12948 int Val = (i & 6) + NumElts * (i & 1);
12949 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
12950 if (Mask[i] < Val || Mask[i] > Val + 1)
12951 ShufpdMask = false;
12952 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
12953 CommutableMask = false;
12954 ShuffleImm |= (Mask[i] % 2) << i;
12955 }
12956
12957 if (ShufpdMask)
12958 return true;
12959 if (CommutableMask) {
12960 std::swap(V1, V2);
12961 return true;
12962 }
12963
12964 return false;
12965}
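A worked instance of the immediate construction: the v4f64 mask {1,5,2,7} alternates V1/V2 within each 128-bit lane, stays within the per-slot ranges the loop checks, and yields the SHUFPD immediate 0xB. A standalone sketch (plain C++):

#include <array>
#include <cassert>

int main() {
  constexpr int NumElts = 4;
  std::array<int, 4> Mask = {1, 5, 2, 7};
  unsigned ShuffleImm = 0;
  for (int i = 0; i < NumElts; ++i) {
    int Val = (i & 6) + NumElts * (i & 1); // expected base index for slot i
    assert(Mask[i] >= Val && Mask[i] <= Val + 1);
    ShuffleImm |= unsigned(Mask[i] % 2) << i;
  }
  assert(ShuffleImm == 0xB); // 0b1011: odd, odd, even, odd element per slot
  return 0;
}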
12966
12967static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
12968 ArrayRef<int> Mask, SDValue V1,
12969 SDValue V2, SelectionDAG &DAG) {
12970 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
12971 "Unexpected data type for VSHUFPD");
12972
12973 unsigned Immediate = 0;
12974 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
12975 return SDValue();
12976
12977 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12978 DAG.getConstant(Immediate, DL, MVT::i8));
12979}
12980
12981static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12982 ArrayRef<int> Mask, SDValue V1,
12983 SDValue V2, SelectionDAG &DAG) {
12984 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12985 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12986
12987 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12988 if (V2.isUndef())
12989 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12990
12991 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12992}
12993
12994/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
12995///
12996/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
12997/// isn't available.
12998static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12999 const APInt &Zeroable,
13000 SDValue V1, SDValue V2,
13001 const X86Subtarget &Subtarget,
13002 SelectionDAG &DAG) {
13003 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13004 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13005 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13006
13007 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
13008 Zeroable, Subtarget, DAG))
13009 return V;
13010
13011 if (V2.isUndef()) {
13012 // Check for being able to broadcast a single element.
13013 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
13014 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13015 return Broadcast;
13016
13017 // Use low duplicate instructions for masks that match their pattern.
13018 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
13019 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
13020
13021 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
13022 // Non-half-crossing single input shuffles can be lowered with an
13023 // interleaved permutation.
13024 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13025 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
13026 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
13027 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13028 }
13029
13030 // With AVX2 we have direct support for this permutation.
13031 if (Subtarget.hasAVX2())
13032 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
13033 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13034
13035 // Try to create an in-lane repeating shuffle mask and then shuffle the
13036 // results into the target lanes.
13037 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13038 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13039 return V;
13040
13041 // Otherwise, fall back.
13042 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
13043 DAG, Subtarget);
13044 }
13045
13046 // Use dedicated unpack instructions for masks that match their pattern.
13047 if (SDValue V =
13048 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
13049 return V;
13050
13051 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
13052 Zeroable, Subtarget, DAG))
13053 return Blend;
13054
13055 // Check if the blend happens to exactly fit that of SHUFPD.
13056 if (SDValue Op =
13057 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
13058 return Op;
13059
13060 // Try to create an in-lane repeating shuffle mask and then shuffle the
13061 // results into the target lanes.
13062 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13063 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13064 return V;
13065
13066 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13067 // shuffle. However, if we have AVX2 and either input is already in place,
13068 // we will be able to shuffle the other input even across lanes in a single
13069 // instruction, so skip this pattern.
13070 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
13071 isShuffleMaskInputInPlace(1, Mask))))
13072 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13073 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13074 return Result;
13075 // If we have VLX support, we can use VEXPAND.
13076 if (Subtarget.hasVLX())
13077 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
13078 V1, V2, DAG, Subtarget))
13079 return V;
13080
13081 // If we have AVX2 then we always want to lower with a blend because at v4
13082 // width we can fully permute the elements.
13083 if (Subtarget.hasAVX2())
13084 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
13085 Mask, DAG);
13086
13087 // Otherwise fall back on generic lowering.
13088 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
13089}
13090
13091/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
13092///
13093/// This routine is only called when we have AVX2 and thus a reasonable
13094 /// instruction set for v4i64 shuffling.
13095static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13096 const APInt &Zeroable,
13097 SDValue V1, SDValue V2,
13098 const X86Subtarget &Subtarget,
13099 SelectionDAG &DAG) {
13100 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13101 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13102 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13103 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
13104
13105 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
13106 Zeroable, Subtarget, DAG))
13107 return V;
13108
13109 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
13110 Zeroable, Subtarget, DAG))
13111 return Blend;
13112
13113 // Check for being able to broadcast a single element.
13114 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
13115 Mask, Subtarget, DAG))
13116 return Broadcast;
13117
13118 if (V2.isUndef()) {
13119 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13120 // can use lower latency instructions that will operate on both lanes.
13121 SmallVector<int, 2> RepeatedMask;
13122 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
13123 SmallVector<int, 4> PSHUFDMask;
13124 scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
13125 return DAG.getBitcast(
13126 MVT::v4i64,
13127 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
13128 DAG.getBitcast(MVT::v8i32, V1),
13129 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13130 }
13131
13132 // AVX2 provides a direct instruction for permuting a single input across
13133 // lanes.
13134 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
13135 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13136 }
13137
13138 // Try to use shift instructions.
13139 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
13140 Zeroable, Subtarget, DAG))
13141 return Shift;
13142
13143 // If we have VLX support, we can use VALIGN or VEXPAND.
13144 if (Subtarget.hasVLX()) {
13145 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
13146 Mask, Subtarget, DAG))
13147 return Rotate;
13148
13149 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
13150 V1, V2, DAG, Subtarget))
13151 return V;
13152 }
13153
13154 // Try to use PALIGNR.
13155 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
13156 Mask, Subtarget, DAG))
13157 return Rotate;
13158
13159 // Use dedicated unpack instructions for masks that match their pattern.
13160 if (SDValue V =
13161 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
13162 return V;
13163
13164 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13165 // shuffle. However, if we have AVX2 and either input is already in place,
13166 // we will be able to shuffle the other input even across lanes in a single
13167 // instruction, so skip this pattern.
13168 if (!isShuffleMaskInputInPlace(0, Mask) &&
13169 !isShuffleMaskInputInPlace(1, Mask))
13170 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13171 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
13172 return Result;
13173
13174 // Otherwise fall back on generic blend lowering.
13175 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
13176 Mask, DAG);
13177}
13178
13179/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
13180///
13181/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
13182/// isn't available.
13183static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13184 const APInt &Zeroable,
13185 SDValue V1, SDValue V2,
13186 const X86Subtarget &Subtarget,
13187 SelectionDAG &DAG) {
13188 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
13189 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
13190 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13191
13192 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
13193 Zeroable, Subtarget, DAG))
13194 return Blend;
13195
13196 // Check for being able to broadcast a single element.
13197 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
13198 Mask, Subtarget, DAG))
13199 return Broadcast;
13200
13201 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13202 // options to efficiently lower the shuffle.
13203 SmallVector<int, 4> RepeatedMask;
13204 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
13205    assert(RepeatedMask.size() == 4 &&
13206           "Repeated masks must be half the mask width!");
13207
13208 // Use even/odd duplicate instructions for masks that match their pattern.
13209 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13210 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
13211 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13212 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
13213
13214 if (V2.isUndef())
13215 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
13216 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13217
13218 // Use dedicated unpack instructions for masks that match their pattern.
13219 if (SDValue V =
13220 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
13221 return V;
13222
13223 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
13224 // have already handled any direct blends.
13225 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
13226 }
13227
13228 // Try to create an in-lane repeating shuffle mask and then shuffle the
13229 // the results into the target lanes.
13230 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13231 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
13232 return V;
13233
13234  // If we have a single-input shuffle with different shuffle patterns in the
13235  // two 128-bit lanes, use a variable mask with VPERMILPS.
13236 if (V2.isUndef()) {
13237 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13238 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
13239 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
13240
13241 if (Subtarget.hasAVX2())
13242 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
13243
13244 // Otherwise, fall back.
13245 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
13246 DAG, Subtarget);
13247 }
13248
13249 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13250 // shuffle.
13251 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13252 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
13253 return Result;
13254 // If we have VLX support, we can use VEXPAND.
13255 if (Subtarget.hasVLX())
13256 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
13257 V1, V2, DAG, Subtarget))
13258 return V;
13259
13260  // For non-AVX512, if the mask matches an in-lane 16-bit unpack pattern then
13261  // try to split, since after the split we get more efficient code using
13262  // vpunpcklwd and vpunpckhwd instructions than with vblend.
13263 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
13264 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
13265 Mask, DAG))
13266 return V;
13267
13268 // If we have AVX2 then we always want to lower with a blend because at v8 we
13269 // can fully permute the elements.
13270 if (Subtarget.hasAVX2())
13271 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
13272 Mask, DAG);
13273
13274 // Otherwise fall back on generic lowering.
13275 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
13276}
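
Many of these routines first test whether the mask repeats per 128-bit lane. As a rough standalone model of that predicate (a single-input simplification of is128BitLaneRepeatedShuffleMask, which in LLVM also handles two-input masks), consider:

  #include <vector>

  // Returns true if every 128-bit lane applies the same lane-local pattern and
  // fills Repeated with that pattern. -1 entries are undef and match anything.
  // Single-source model only: all mask values must stay inside their own lane.
  static bool isLaneRepeated(int EltsPerLane, const std::vector<int> &Mask,
                             std::vector<int> &Repeated) {
    Repeated.assign(EltsPerLane, -1);
    for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue;
      int Local = M - (i / EltsPerLane) * EltsPerLane;
      if (Local < 0 || Local >= EltsPerLane)
        return false;                        // the element crosses a lane
      int &R = Repeated[i % EltsPerLane];
      if (R < 0)
        R = Local;                           // first lane to define this slot
      else if (R != Local)
        return false;                        // lanes disagree on this slot
    }
    return true;
  }

  // For v8f32, isLaneRepeated(4, {0, 0, 2, 2, 4, 4, 6, 6}, R) succeeds with
  // R == {0, 0, 2, 2}, the MOVSLDUP pattern matched above.
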
13277
13278/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
13279///
13280/// This routine is only called when we have AVX2 and thus a reasonable
13281/// instruction set for v8i32 shuffling.
13282static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13283 const APInt &Zeroable,
13284 SDValue V1, SDValue V2,
13285 const X86Subtarget &Subtarget,
13286 SelectionDAG &DAG) {
13287  assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
13288  assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
13289  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13290  assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
13291
13292 // Whenever we can lower this as a zext, that instruction is strictly faster
13293 // than any alternative. It also allows us to fold memory operands into the
13294 // shuffle in many cases.
13295 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13296 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13297 return ZExt;
13298
13299  // For non-AVX512, if the mask matches an in-lane 16-bit unpack pattern then
13300  // try to split, since after the split we get more efficient code than vblend
13301  // by using vpunpcklwd and vpunpckhwd instructions.
13302 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
13303 !Subtarget.hasAVX512())
13304 if (SDValue V =
13305 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
13306 return V;
13307
13308 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
13309 Zeroable, Subtarget, DAG))
13310 return Blend;
13311
13312 // Check for being able to broadcast a single element.
13313 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
13314 Mask, Subtarget, DAG))
13315 return Broadcast;
13316
13317 // If the shuffle mask is repeated in each 128-bit lane we can use more
13318 // efficient instructions that mirror the shuffles across the two 128-bit
13319 // lanes.
13320 SmallVector<int, 4> RepeatedMask;
13321 bool Is128BitLaneRepeatedShuffle =
13322 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
13323 if (Is128BitLaneRepeatedShuffle) {
13324    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13325 if (V2.isUndef())
13326 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
13327 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13328
13329 // Use dedicated unpack instructions for masks that match their pattern.
13330 if (SDValue V =
13331 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
13332 return V;
13333 }
13334
13335 // Try to use shift instructions.
13336 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
13337 Zeroable, Subtarget, DAG))
13338 return Shift;
13339
13340 // If we have VLX support, we can use VALIGN or EXPAND.
13341 if (Subtarget.hasVLX()) {
13342 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
13343 Mask, Subtarget, DAG))
13344 return Rotate;
13345
13346 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
13347 V1, V2, DAG, Subtarget))
13348 return V;
13349 }
13350
13351 // Try to use byte rotation instructions.
13352 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13353 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13354 return Rotate;
13355
13356 // Try to create an in-lane repeating shuffle mask and then shuffle the
13357 // results into the target lanes.
13358 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13359 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13360 return V;
13361
13362 // If the shuffle patterns aren't repeated but it is a single input, directly
13363 // generate a cross-lane VPERMD instruction.
13364 if (V2.isUndef()) {
13365 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13366 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
13367 }
13368
13369 // Assume that a single SHUFPS is faster than an alternative sequence of
13370 // multiple instructions (even if the CPU has a domain penalty).
13371 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13372 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13373 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
13374 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
13375 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
13376 CastV1, CastV2, DAG);
13377 return DAG.getBitcast(MVT::v8i32, ShufPS);
13378 }
13379
13380 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13381 // shuffle.
13382 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13383 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13384 return Result;
13385
13386 // Otherwise fall back on generic blend lowering.
13387 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
13388 Mask, DAG);
13389}
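
The block above bitcasts v8i32 operands to v8f32 so that a single SHUFPS can implement a lane-repeated two-input mask despite the integer/FP domain crossing. For reference, a scalar model of SHUFPS on one 128-bit lane (standard instruction semantics, not LLVM code) is:

  #include <array>
  #include <cstdint>

  // SHUFPS dst, a, b, imm8: the two low result elements come from the first
  // source and the two high result elements from the second source, selected
  // by two immediate bits each.
  static std::array<float, 4> shufps(const std::array<float, 4> &A,
                                     const std::array<float, 4> &B,
                                     uint8_t Imm) {
    return {A[Imm & 3], A[(Imm >> 2) & 3],
            B[(Imm >> 4) & 3], B[(Imm >> 6) & 3]};
  }

  // Example: shufps(A, B, 0x44) produces {A[0], A[1], B[0], B[1]}, the same
  // result as MOVLHPS on that lane.
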
13390
13391/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
13392///
13393/// This routine is only called when we have AVX2 and thus a reasonable
13394/// instruction set for v16i16 shuffling.
13395static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13396 const APInt &Zeroable,
13397 SDValue V1, SDValue V2,
13398 const X86Subtarget &Subtarget,
13399 SelectionDAG &DAG) {
13400  assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13401  assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13402  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13403  assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
13404
13405 // Whenever we can lower this as a zext, that instruction is strictly faster
13406 // than any alternative. It also allows us to fold memory operands into the
13407 // shuffle in many cases.
13408 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13409 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13410 return ZExt;
13411
13412 // Check for being able to broadcast a single element.
13413 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
13414 Mask, Subtarget, DAG))
13415 return Broadcast;
13416
13417 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
13418 Zeroable, Subtarget, DAG))
13419 return Blend;
13420
13421 // Use dedicated unpack instructions for masks that match their pattern.
13422 if (SDValue V =
13423 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
13424 return V;
13425
13426 // Use dedicated pack instructions for masks that match their pattern.
13427 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
13428 Subtarget))
13429 return V;
13430
13431 // Try to use shift instructions.
13432 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
13433 Zeroable, Subtarget, DAG))
13434 return Shift;
13435
13436 // Try to use byte rotation instructions.
13437 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13438 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13439 return Rotate;
13440
13441 // Try to create an in-lane repeating shuffle mask and then shuffle the
13442  // results into the target lanes.
13443 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13444 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13445 return V;
13446
13447 if (V2.isUndef()) {
13448 // There are no generalized cross-lane shuffle operations available on i16
13449 // element types.
13450 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
13451 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
13452 Mask, DAG, Subtarget);
13453
13454 SmallVector<int, 8> RepeatedMask;
13455 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13456 // As this is a single-input shuffle, the repeated mask should be
13457 // a strictly valid v8i16 mask that we can pass through to the v8i16
13458 // lowering to handle even the v16 case.
13459 return lowerV8I16GeneralSingleInputVectorShuffle(
13460 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
13461 }
13462 }
13463
13464 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13465 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
13466 return PSHUFB;
13467
13468 // AVX512BWVL can lower to VPERMW.
13469 if (Subtarget.hasBWI() && Subtarget.hasVLX())
13470 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
13471
13472 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13473 // shuffle.
13474 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13475 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13476 return Result;
13477
13478 // Otherwise fall back on generic lowering.
13479 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
13480}
13481
13482/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
13483///
13484/// This routine is only called when we have AVX2 and thus a reasonable
13485/// instruction set for v32i8 shuffling.
13486static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13487 const APInt &Zeroable,
13488 SDValue V1, SDValue V2,
13489 const X86Subtarget &Subtarget,
13490 SelectionDAG &DAG) {
13491  assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13492  assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13493  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13494  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
13495
13496 // Whenever we can lower this as a zext, that instruction is strictly faster
13497 // than any alternative. It also allows us to fold memory operands into the
13498 // shuffle in many cases.
13499 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13500 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13501 return ZExt;
13502
13503 // Check for being able to broadcast a single element.
13504 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
13505 Mask, Subtarget, DAG))
13506 return Broadcast;
13507
13508 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
13509 Zeroable, Subtarget, DAG))
13510 return Blend;
13511
13512 // Use dedicated unpack instructions for masks that match their pattern.
13513 if (SDValue V =
13514 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
13515 return V;
13516
13517 // Use dedicated pack instructions for masks that match their pattern.
13518 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
13519 Subtarget))
13520 return V;
13521
13522 // Try to use shift instructions.
13523 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
13524 Zeroable, Subtarget, DAG))
13525 return Shift;
13526
13527 // Try to use byte rotation instructions.
13528 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13529 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13530 return Rotate;
13531
13532 // Try to create an in-lane repeating shuffle mask and then shuffle the
13533  // results into the target lanes.
13534 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13535 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13536 return V;
13537
13538 // There are no generalized cross-lane shuffle operations available on i8
13539 // element types.
13540 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
13541 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
13542 DAG, Subtarget);
13543
13544 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13545 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13546 return PSHUFB;
13547
13548 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13549 // shuffle.
13550 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13551 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13552 return Result;
13553
13554 // Otherwise fall back on generic lowering.
13555 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
13556}
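
The PSHUFB path used above relies on the 256-bit VPSHUFB behaving as two independent 128-bit byte shuffles. A scalar model of that instruction (standard architectural semantics; the LLVM helper additionally derives the control bytes from the shuffle mask and the Zeroable bits) is:

  #include <array>
  #include <cstdint>

  // Each control byte selects a byte from the same 128-bit lane of the source,
  // or produces zero when its top bit is set.
  static std::array<uint8_t, 32> vpshufb256(const std::array<uint8_t, 32> &Src,
                                            const std::array<uint8_t, 32> &Ctl) {
    std::array<uint8_t, 32> Out{};
    for (int i = 0; i < 32; ++i) {
      int LaneBase = (i / 16) * 16;
      Out[i] = (Ctl[i] & 0x80) ? 0 : Src[LaneBase + (Ctl[i] & 0x0F)];
    }
    return Out;
  }
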
13557
13558/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
13559///
13560/// This routine either breaks down the specific type of a 256-bit x86 vector
13561/// shuffle or splits it into two 128-bit shuffles and fuses the results back
13562/// together based on the available instructions.
13563static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13564 MVT VT, SDValue V1, SDValue V2,
13565 const APInt &Zeroable,
13566 const X86Subtarget &Subtarget,
13567 SelectionDAG &DAG) {
13568 // If we have a single input to the zero element, insert that into V1 if we
13569 // can do so cheaply.
13570 int NumElts = VT.getVectorNumElements();
13571 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13572
13573 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13574 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13575 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13576 return Insertion;
13577
13578 // Handle special cases where the lower or upper half is UNDEF.
13579 if (SDValue V =
13580 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
13581 return V;
13582
13583 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
13584 // can check for those subtargets here and avoid much of the subtarget
13585 // querying in the per-vector-type lowering routines. With AVX1 we have
13586 // essentially *zero* ability to manipulate a 256-bit vector with integer
13587 // types. Since we'll use floating point types there eventually, just
13588 // immediately cast everything to a float and operate entirely in that domain.
13589 if (VT.isInteger() && !Subtarget.hasAVX2()) {
13590 int ElementBits = VT.getScalarSizeInBits();
13591 if (ElementBits < 32) {
13592      // No floating point type is available; if we can't use the bit operations
13593      // for masking/blending, then decompose into 128-bit vectors.
13594 if (SDValue V =
13595 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
13596 return V;
13597 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13598 return V;
13599 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13600 }
13601
13602 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
13603 VT.getVectorNumElements());
13604 V1 = DAG.getBitcast(FpVT, V1);
13605 V2 = DAG.getBitcast(FpVT, V2);
13606 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
13607 }
13608
13609 switch (VT.SimpleTy) {
13610 case MVT::v4f64:
13611 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13612 case MVT::v4i64:
13613 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13614 case MVT::v8f32:
13615 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13616 case MVT::v8i32:
13617 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13618 case MVT::v16i16:
13619 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13620 case MVT::v32i8:
13621 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13622
13623 default:
13624    llvm_unreachable("Not a valid 256-bit x86 vector type!");
13625 }
13626}
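
The AVX1 integer fall-back above amounts to a small decision: without AVX2, sub-32-bit integer element types are split (after trying the bit-mask and bit-blend lowerings), while 32/64-bit integer types are bitcast into the floating-point domain. A compressed sketch of that decision (illustrative only, not the LLVM code path) is:

  enum class Avx1Strategy {
    PerTypeLowering,        // AVX2 available or already a FP type
    SplitInto128,           // i8/i16 elements: bit ops first, else split
    BitcastToFloatDomain    // i32 -> f32, i64 -> f64, shuffle there
  };

  // Mirrors the gate at the top of lower256BitVectorShuffle, simplified: the
  // real code attempts lowerVectorShuffleAsBitMask/AsBitBlend before splitting.
  static Avx1Strategy pick256BitStrategy(bool IsInteger, bool HasAVX2,
                                         unsigned ElementBits) {
    if (!IsInteger || HasAVX2)
      return Avx1Strategy::PerTypeLowering;
    if (ElementBits < 32)
      return Avx1Strategy::SplitInto128;
    return Avx1Strategy::BitcastToFloatDomain;
  }
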
13627
13628/// \brief Try to lower a vector shuffle as a shuffle of 128-bit blocks.
13629static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
13630 ArrayRef<int> Mask, SDValue V1,
13631 SDValue V2, SelectionDAG &DAG) {
13632  assert(VT.getScalarSizeInBits() == 64 &&
13633         "Unexpected element type size for 128bit shuffle.");
13634
13635  // Handling a 256-bit vector would require VLX, and the function
13636  // lowerV2X128VectorShuffle() is most probably the better solution there.
13637  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
13638
13639 SmallVector<int, 4> WidenedMask;
13640 if (!canWidenShuffleElements(Mask, WidenedMask))
13641 return SDValue();
13642
13643 // Check for patterns which can be matched with a single insert of a 256-bit
13644 // subvector.
13645 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
13646 {0, 1, 2, 3, 0, 1, 2, 3});
13647 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
13648 {0, 1, 2, 3, 8, 9, 10, 11})) {
13649 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
13650 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13651 DAG.getIntPtrConstant(0, DL));
13652 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13653 OnlyUsesV1 ? V1 : V2,
13654 DAG.getIntPtrConstant(0, DL));
13655 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
13656 }
13657
13658  assert(WidenedMask.size() == 4);
13659
13660 // See if this is an insertion of the lower 128-bits of V2 into V1.
13661 bool IsInsert = true;
13662 int V2Index = -1;
13663 for (int i = 0; i < 4; ++i) {
13664    assert(WidenedMask[i] >= -1);
13665 if (WidenedMask[i] < 0)
13666 continue;
13667
13668 // Make sure all V1 subvectors are in place.
13669 if (WidenedMask[i] < 4) {
13670 if (WidenedMask[i] != i) {
13671 IsInsert = false;
13672 break;
13673 }
13674 } else {
13675      // Make sure we only have a single V2 index and it's the lowest 128 bits.
13676 if (V2Index >= 0 || WidenedMask[i] != 4) {
13677 IsInsert = false;
13678 break;
13679 }
13680 V2Index = i;
13681 }
13682 }
13683 if (IsInsert && V2Index >= 0) {
13684 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13685 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
13686 DAG.getIntPtrConstant(0, DL));
13687 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
13688 }
13689
13690  // Try to lower to vshuf64x2/vshuf32x4.
13691 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13692 unsigned PermMask = 0;
13693  // Ensure elements came from the same Op.
13694 for (int i = 0; i < 4; ++i) {
13695    assert(WidenedMask[i] >= -1);
13696 if (WidenedMask[i] < 0)
13697 continue;
13698
13699 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
13700 unsigned OpIndex = i / 2;
13701 if (Ops[OpIndex].isUndef())
13702 Ops[OpIndex] = Op;
13703 else if (Ops[OpIndex] != Op)
13704 return SDValue();
13705
13706 // Convert the 128-bit shuffle mask selection values into 128-bit selection
13707 // bits defined by a vshuf64x2 instruction's immediate control byte.
13708 PermMask |= (WidenedMask[i] % 4) << (i * 2);
13709 }
13710
13711 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
13712 DAG.getConstant(PermMask, DL, MVT::i8));
13713}
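
The SHUF128 path above first widens the element mask into four 128-bit lane selectors and then folds them into a single immediate, two bits per result lane. A standalone sketch of that immediate construction (hypothetical helper, assuming the mask has already been widened, with -1 meaning undef) is:

  #include <array>
  #include <cstdint>
  #include <optional>

  // Each widened entry contributes two immediate bits (its value modulo 4);
  // whether it is >= 4 decides which source operand feeds that half of the
  // result. Mixed sources within one half cannot be expressed, so give up.
  static std::optional<uint8_t>
  buildShuf128Imm(const std::array<int, 4> &WidenedMask) {
    uint8_t PermMask = 0;
    int OpForHalf[2] = {-1, -1};   // source feeding lanes 0-1 and lanes 2-3
    for (int i = 0; i < 4; ++i) {
      if (WidenedMask[i] < 0)
        continue;                  // undef lane, no constraint
      int Src = WidenedMask[i] >= 4 ? 1 : 0;
      int &Half = OpForHalf[i / 2];
      if (Half < 0)
        Half = Src;
      else if (Half != Src)
        return std::nullopt;
      PermMask |= (WidenedMask[i] % 4) << (i * 2);
    }
    return PermMask;
  }

  // Example: the widened mask {0, 1, 4, 5} yields 0x44, with V1 feeding the
  // low half of the result and V2 the high half.
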
13714
13715/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
13716static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13717 const APInt &Zeroable,
13718 SDValue V1, SDValue V2,
13719 const X86Subtarget &Subtarget,
13720 SelectionDAG &DAG) {
13721  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13722  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13723  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13724
13725 if (V2.isUndef()) {
13726 // Use low duplicate instructions for masks that match their pattern.
13727 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
13728 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
13729
13730 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
13731 // Non-half-crossing single input shuffles can be lowered with an
13732 // interleaved permutation.
13733 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13734 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
13735 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
13736 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
13737 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
13738 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13739 }
13740
13741 SmallVector<int, 4> RepeatedMask;
13742 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
13743 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
13744 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13745 }
13746
13747 if (SDValue Shuf128 =
13748 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
13749 return Shuf128;
13750
13751 if (SDValue Unpck =
13752 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
13753 return Unpck;
13754
13755 // Check if the blend happens to exactly fit that of SHUFPD.
13756 if (SDValue Op =
13757 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
13758 return Op;
13759
13760 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
13761 V2, DAG, Subtarget))
13762 return V;
13763
13764 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
13765 Zeroable, Subtarget, DAG))
13766 return Blend;
13767
13768 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
13769}
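
In the V2.isUndef() path above, the VPERMILPMask byte is built by testing, for each of the eight elements, whether it selects the high element of its 128-bit pair. A worked model of that expression (illustrative, assuming the mask is non-lane-crossing as already checked) is:

  #include <array>
  #include <cstdint>

  // Bit i of the immediate is set exactly when element i selects the odd
  // (high) element of its 128-bit pair; (i | 1) is that odd index.
  static uint8_t buildVPermilPdImm(const std::array<int, 8> &Mask) {
    uint8_t Imm = 0;
    for (int i = 0; i < 8; ++i)
      if (Mask[i] == (i | 1))
        Imm |= uint8_t(1u << i);
    return Imm;
  }

  // Example: the pairwise swap {1, 0, 3, 2, 5, 4, 7, 6} produces 0x55.
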
13770
13771/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
13772static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13773 const APInt &Zeroable,
13774 SDValue V1, SDValue V2,
13775 const X86Subtarget &Subtarget,
13776 SelectionDAG &DAG) {
13777  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13778  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13779  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13780
13781 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13782 // options to efficiently lower the shuffle.
13783 SmallVector<int, 4> RepeatedMask;
13784 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
13785    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13786
13787 // Use even/odd duplicate instructions for masks that match their pattern.
13788 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13789 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
13790 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13791 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
13792
13793 if (V2.isUndef())
13794 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
13795 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13796
13797 // Use dedicated unpack instructions for masks that match their pattern.
13798 if (SDValue Unpck =
13799 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
13800 return Unpck;
13801
13802 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
13803 Zeroable, Subtarget, DAG))
13804 return Blend;
13805
13806 // Otherwise, fall back to a SHUFPS sequence.
13807 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
13808 }
13809
13810  // If we have a single-input shuffle with different shuffle patterns in the
13811  // 128-bit lanes that doesn't cross lanes, use a variable-mask VPERMILPS.
13812 if (V2.isUndef() &&
13813 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
13814 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
13815 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
13816 }
13817
13818 // If we have AVX512F support, we can use VEXPAND.
13819 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
13820 V1, V2, DAG, Subtarget))
13821 return V;
13822
13823 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
13824}
13825
13826/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
13827static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13828 const APInt &Zeroable,
13829 SDValue V1, SDValue V2,
13830 const X86Subtarget &Subtarget,
13831 SelectionDAG &DAG) {
13832  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13833  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13834  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13835
13836 if (V2.isUndef()) {
13837    // When the shuffle is mirrored between the 128-bit lanes of the vector,
13838    // we can use lower-latency instructions that will operate on all four
13839    // 128-bit lanes.
13840 SmallVector<int, 2> Repeated128Mask;
13841 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
13842 SmallVector<int, 4> PSHUFDMask;
13843 scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
13844 return DAG.getBitcast(
13845 MVT::v8i64,
13846 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
13847 DAG.getBitcast(MVT::v16i32, V1),
13848 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13849 }
13850
13851 SmallVector<int, 4> Repeated256Mask;
13852 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
13853 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
13854 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
13855 }
13856
13857 if (SDValue Shuf128 =
13858 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
13859 return Shuf128;
13860
13861 // Try to use shift instructions.
13862 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
13863 Zeroable, Subtarget, DAG))
13864 return Shift;
13865
13866 // Try to use VALIGN.
13867 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
13868 Mask, Subtarget, DAG))
13869 return Rotate;
13870
13871 // Try to use PALIGNR.
13872 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
13873 Mask, Subtarget, DAG))
13874 return Rotate;
13875
13876 if (SDValue Unpck =
13877 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
13878 return Unpck;
13879 // If we have AVX512F support, we can use VEXPAND.
13880 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
13881 V2, DAG, Subtarget))
13882 return V;
13883
13884 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
13885 Zeroable, Subtarget, DAG))
13886 return Blend;
13887
13888 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
13889}
13890
13891/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
13892static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13893 const APInt &Zeroable,
13894 SDValue V1, SDValue V2,
13895 const X86Subtarget &Subtarget,
13896 SelectionDAG &DAG) {
13897  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13898  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13899  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13900
13901 // Whenever we can lower this as a zext, that instruction is strictly faster
13902 // than any alternative. It also allows us to fold memory operands into the
13903 // shuffle in many cases.
13904 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13905 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13906 return ZExt;
13907
13908 // If the shuffle mask is repeated in each 128-bit lane we can use more
13909 // efficient instructions that mirror the shuffles across the four 128-bit
13910 // lanes.
13911 SmallVector<int, 4> RepeatedMask;
13912 bool Is128BitLaneRepeatedShuffle =
13913 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
13914 if (Is128BitLaneRepeatedShuffle) {
13915    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13916 if (V2.isUndef())
13917 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
13918 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13919
13920 // Use dedicated unpack instructions for masks that match their pattern.
13921 if (SDValue V =
13922 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
13923 return V;
13924 }
13925
13926 // Try to use shift instructions.
13927 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
13928 Zeroable, Subtarget, DAG))
13929 return Shift;
13930
13931 // Try to use VALIGN.
13932 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
13933 Mask, Subtarget, DAG))
13934 return Rotate;
13935
13936 // Try to use byte rotation instructions.
13937 if (Subtarget.hasBWI())
13938 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13939 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
13940 return Rotate;
13941
13942 // Assume that a single SHUFPS is faster than using a permv shuffle.
13943 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13944 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13945 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
13946 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
13947 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
13948 CastV1, CastV2, DAG);
13949 return DAG.getBitcast(MVT::v16i32, ShufPS);
13950 }
13951 // If we have AVX512F support, we can use VEXPAND.
13952 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
13953 V1, V2, DAG, Subtarget))
13954 return V;
13955
13956 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
13957 Zeroable, Subtarget, DAG))
13958 return Blend;
13959 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
13960}
13961
13962/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
13963static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13964 const APInt &Zeroable,
13965 SDValue V1, SDValue V2,
13966 const X86Subtarget &Subtarget,
13967 SelectionDAG &DAG) {
13968  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13969  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13970  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13971  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
13972
13973 // Whenever we can lower this as a zext, that instruction is strictly faster
13974 // than any alternative. It also allows us to fold memory operands into the
13975 // shuffle in many cases.
13976 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13977 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13978 return ZExt;
13979
13980 // Use dedicated unpack instructions for masks that match their pattern.
13981 if (SDValue V =
13982 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
13983 return V;
13984
13985 // Try to use shift instructions.
13986 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
13987 Zeroable, Subtarget, DAG))
13988 return Shift;
13989
13990 // Try to use byte rotation instructions.
13991 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13992 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
13993 return Rotate;
13994
13995 if (V2.isUndef()) {
13996 SmallVector<int, 8> RepeatedMask;
13997 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
13998 // As this is a single-input shuffle, the repeated mask should be
13999 // a strictly valid v8i16 mask that we can pass through to the v8i16
14000 // lowering to handle even the v32 case.
14001 return lowerV8I16GeneralSingleInputVectorShuffle(
14002 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
14003 }
14004 }
14005
14006 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
14007 Zeroable, Subtarget, DAG))
14008 return Blend;
14009
14010 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
14011}
14012
14013/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
14014static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14015 const APInt &Zeroable,
14016 SDValue V1, SDValue V2,
14017 const X86Subtarget &Subtarget,
14018 SelectionDAG &DAG) {
14019  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14020  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14021  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
14022  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
14023
14024 // Whenever we can lower this as a zext, that instruction is strictly faster
14025 // than any alternative. It also allows us to fold memory operands into the
14026 // shuffle in many cases.
14027 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14028 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14029 return ZExt;
14030
14031 // Use dedicated unpack instructions for masks that match their pattern.
14032 if (SDValue V =
14033 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
14034 return V;
14035
14036 // Try to use shift instructions.
14037 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
14038 Zeroable, Subtarget, DAG))
14039 return Shift;
14040
14041 // Try to use byte rotation instructions.
14042 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14043 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
14044 return Rotate;
14045
14046 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14047 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
14048 return PSHUFB;
14049
14050 // VBMI can use VPERMV/VPERMV3 byte shuffles.
14051 if (Subtarget.hasVBMI())
14052 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
14053
14054 // Try to create an in-lane repeating shuffle mask and then shuffle the
14055 // the results into the target lanes.
14056 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14057 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
14058 return V;
14059
14060 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
14061 Zeroable, Subtarget, DAG))
14062 return Blend;
14063
14064 // FIXME: Implement direct support for this type!
14065 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
14066}
14067
14068/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
14069///
14070/// This routine either breaks down the specific type of a 512-bit x86 vector
14071/// shuffle or splits it into two 256-bit shuffles and fuses the results back
14072/// together based on the available instructions.
14073static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14074 MVT VT, SDValue V1, SDValue V2,
14075 const APInt &Zeroable,
14076 const X86Subtarget &Subtarget,
14077 SelectionDAG &DAG) {
14078  assert(Subtarget.hasAVX512() &&
14079         "Cannot lower 512-bit vectors w/ basic ISA!");
14080
14081 // If we have a single input to the zero element, insert that into V1 if we
14082 // can do so cheaply.
14083 int NumElts = Mask.size();
14084 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
14085
14086 if (NumV2Elements == 1 && Mask[0] >= NumElts)
14087 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
14088 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
14089 return Insertion;
14090
14091 // Handle special cases where the lower or upper half is UNDEF.
14092 if (SDValue V =
14093 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
14094 return V;
14095
14096 // Check for being able to broadcast a single element.
14097 if (SDValue Broadcast =
14098 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
14099 return Broadcast;
14100
14101 // Dispatch to each element type for lowering. If we don't have support for
14102 // specific element type shuffles at 512 bits, immediately split them and
14103 // lower them. Each lowering routine of a given type is allowed to assume that
14104 // the requisite ISA extensions for that element type are available.
14105 switch (VT.SimpleTy) {
14106 case MVT::v8f64:
14107 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14108 case MVT::v16f32:
14109 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14110 case MVT::v8i64:
14111 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14112 case MVT::v16i32:
14113 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14114 case MVT::v32i16:
14115 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14116 case MVT::v64i8:
14117 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14118
14119 default:
14120 llvm_unreachable("Not a valid 512-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 512-bit x86 vector type!"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 14120)
;
14121 }
14122}
14123
14124// Lower vXi1 vector shuffles.
14125// There is no dedicated instruction on AVX-512 that shuffles the masks.
14126// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
14127// vector, shuffle it, and then truncate it back.
14128static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14129 MVT VT, SDValue V1, SDValue V2,
14130 const X86Subtarget &Subtarget,
14131 SelectionDAG &DAG) {
14132  assert(Subtarget.hasAVX512() &&
14133         "Cannot lower 512-bit vectors w/o basic ISA!");
14134 MVT ExtVT;
14135 switch (VT.SimpleTy) {
14136 default:
14137    llvm_unreachable("Expected a vector of i1 elements");
14138 case MVT::v2i1:
14139 ExtVT = MVT::v2i64;
14140 break;
14141 case MVT::v4i1:
14142 ExtVT = MVT::v4i32;
14143 break;
14144 case MVT::v8i1:
14145 ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
14146 break;
14147 case MVT::v16i1:
14148 ExtVT = MVT::v16i32;
14149 break;
14150 case MVT::v32i1:
14151 ExtVT = MVT::v32i16;
14152 break;
14153 case MVT::v64i1:
14154 ExtVT = MVT::v64i8;
14155 break;
14156 }
14157
14158 if (ISD::isBuildVectorAllZeros(V1.getNode()))
14159 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
14160 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
14161 V1 = getOnesVector(ExtVT, DAG, DL);
14162 else
14163 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
14164
14165 if (V2.isUndef())
14166 V2 = DAG.getUNDEF(ExtVT);
14167 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
14168 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
14169 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
14170 V2 = getOnesVector(ExtVT, DAG, DL);
14171 else
14172 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
14173
14174 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
14175 // Since the i1 elements were sign-extended, we can use X86ISD::CVT2MASK.
14176 int NumElems = VT.getVectorNumElements();
14177 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
14178 (Subtarget.hasDQI() && (NumElems < 32)))
14179 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
14180
14181 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
14182}
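// Illustrative sketch (added commentary, not part of the original source): for
// a v8i1 shuffle the routine above sign-extends to v8i64, shuffles there, and
// converts back to a mask. In LLVM IR terms the transformation is roughly:
//   %e1 = sext <8 x i1> %v1 to <8 x i64>
//   %e2 = sext <8 x i1> %v2 to <8 x i64>
//   %s  = shufflevector <8 x i64> %e1, <8 x i64> %e2, <8 x i32> <mask>
//   %m  = trunc <8 x i64> %s to <8 x i1>  ; or X86ISD::CVT2MASK when DQI/BWI allow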
14183
14184/// Helper function that returns true if the shuffle mask should be
14185/// commuted to improve canonicalization.
14186static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
14187 int NumElements = Mask.size();
14188
14189 int NumV1Elements = 0, NumV2Elements = 0;
14190 for (int M : Mask)
14191 if (M < 0)
14192 continue;
14193 else if (M < NumElements)
14194 ++NumV1Elements;
14195 else
14196 ++NumV2Elements;
14197
14198 // Commute the shuffle as needed such that more elements come from V1 than
14199 // V2. This allows us to match the shuffle pattern strictly on how many
14200 // elements come from V1 without handling the symmetric cases.
14201 if (NumV2Elements > NumV1Elements)
14202 return true;
14203
14204 assert(NumV1Elements > 0 && "No V1 indices");
14205
14206 if (NumV2Elements == 0)
14207 return false;
14208
14209 // When the numbers of V1 and V2 elements are the same, try to minimize the
14210 // number of uses of V2 in the low half of the vector. When that is tied,
14211 // ensure that the sum of indices for V1 is equal to or lower than the sum of
14212 // indices for V2. When those are equal, try to ensure that the number of odd
14213 // indices for V1 is lower than the number of odd indices for V2.
14214 if (NumV1Elements == NumV2Elements) {
14215 int LowV1Elements = 0, LowV2Elements = 0;
14216 for (int M : Mask.slice(0, NumElements / 2))
14217 if (M >= NumElements)
14218 ++LowV2Elements;
14219 else if (M >= 0)
14220 ++LowV1Elements;
14221 if (LowV2Elements > LowV1Elements)
14222 return true;
14223 if (LowV2Elements == LowV1Elements) {
14224 int SumV1Indices = 0, SumV2Indices = 0;
14225 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14226 if (Mask[i] >= NumElements)
14227 SumV2Indices += i;
14228 else if (Mask[i] >= 0)
14229 SumV1Indices += i;
14230 if (SumV2Indices < SumV1Indices)
14231 return true;
14232 if (SumV2Indices == SumV1Indices) {
14233 int NumV1OddIndices = 0, NumV2OddIndices = 0;
14234 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14235 if (Mask[i] >= NumElements)
14236 NumV2OddIndices += i % 2;
14237 else if (Mask[i] >= 0)
14238 NumV1OddIndices += i % 2;
14239 if (NumV2OddIndices < NumV1OddIndices)
14240 return true;
14241 }
14242 }
14243 }
14244
14245 return false;
14246}
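// Worked example (added commentary, not part of the original source): for a
// 4-element shuffle with Mask = <4, 5, 0, 1>, both inputs contribute two
// elements, but the low half <4, 5> uses only V2 (LowV2Elements = 2 vs.
// LowV1Elements = 0), so the helper returns true and the caller commutes the
// operands.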
14247
14248/// \brief Top-level lowering for x86 vector shuffles.
14249///
14250/// This handles decomposition, canonicalization, and lowering of all x86
14251/// vector shuffles. Most of the specific lowering strategies are encapsulated
14252/// above in helper routines. The canonicalization attempts to widen shuffles
14253/// to involve fewer lanes of wider elements, consolidate symmetric patterns
14254/// s.t. only one of the two inputs needs to be tested, etc.
14255static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
14256 SelectionDAG &DAG) {
14257 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
14258 ArrayRef<int> Mask = SVOp->getMask();
14259 SDValue V1 = Op.getOperand(0);
14260 SDValue V2 = Op.getOperand(1);
14261 MVT VT = Op.getSimpleValueType();
14262 int NumElements = VT.getVectorNumElements();
14263 SDLoc DL(Op);
14264 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
14265
14266 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
14267        "Can't lower MMX shuffles");
14268
14269 bool V1IsUndef = V1.isUndef();
14270 bool V2IsUndef = V2.isUndef();
14271 if (V1IsUndef && V2IsUndef)
14272 return DAG.getUNDEF(VT);
14273
14274 // When we create a shuffle node we put the UNDEF node in the second operand,
14275 // but in some cases the first operand may be transformed to UNDEF.
14276 // In this case we should just commute the node.
14277 if (V1IsUndef)
14278 return DAG.getCommutedVectorShuffle(*SVOp);
14279
14280 // Check for non-undef masks pointing at an undef vector and make the masks
14281 // undef as well. This makes it easier to match the shuffle based solely on
14282 // the mask.
14283 if (V2IsUndef)
14284 for (int M : Mask)
14285 if (M >= NumElements) {
14286 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
14287 for (int &M : NewMask)
14288 if (M >= NumElements)
14289 M = -1;
14290 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
14291 }
14292
14293 // Check for illegal shuffle mask element index values.
14294 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
14295 assert(llvm::all_of(Mask,
14296                     [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
14297        "Out of bounds shuffle index");
14298
14299 // We actually see shuffles that are entirely re-arrangements of a set of
14300 // zero inputs. This mostly happens while decomposing complex shuffles into
14301 // simple ones. Directly lower these as a buildvector of zeros.
14302 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
14303 if (Zeroable.isAllOnesValue())
14304 return getZeroVector(VT, Subtarget, DAG, DL);
14305
14306 // Try to collapse shuffles into using a vector type with fewer elements but
14307 // wider element types. We cap this to not form integers or floating point
14308 // elements wider than 64 bits, but it might be interesting to form i128
14309 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
14310 SmallVector<int, 16> WidenedMask;
14311 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
14312 canWidenShuffleElements(Mask, WidenedMask)) {
14313 MVT NewEltVT = VT.isFloatingPoint()
14314 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
14315 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
14316 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14317 // Make sure that the new vector type is legal. For example, v2f64 isn't
14318 // legal on SSE1.
14319 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14320 V1 = DAG.getBitcast(NewVT, V1);
14321 V2 = DAG.getBitcast(NewVT, V2);
14322 return DAG.getBitcast(
14323 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
14324 }
14325 }
14326
14327 // Commute the shuffle if it will improve canonicalization.
14328 if (canonicalizeShuffleMaskWithCommute(Mask))
14329 return DAG.getCommutedVectorShuffle(*SVOp);
14330
14331 // For each vector width, delegate to a specialized lowering routine.
14332 if (VT.is128BitVector())
14333 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14334 DAG);
14335
14336 if (VT.is256BitVector())
14337 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14338 DAG);
14339
14340 if (VT.is512BitVector())
14341 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14342 DAG);
14343
14344 if (Is1BitVector)
14345 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
14346
14347 llvm_unreachable("Unimplemented!");
14348}
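// Worked example (added commentary, not part of the original source): the
// widening step above turns a v4i32 shuffle with mask <0, 1, 4, 5> into a
// v2i64 shuffle with mask <0, 2> on the bitcast operands, since each adjacent
// pair of narrow elements moves together; the result is bitcast back to v4i32.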
14349
14350/// \brief Try to lower a VSELECT instruction to a vector shuffle.
14351static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
14352 const X86Subtarget &Subtarget,
14353 SelectionDAG &DAG) {
14354 SDValue Cond = Op.getOperand(0);
14355 SDValue LHS = Op.getOperand(1);
14356 SDValue RHS = Op.getOperand(2);
14357 SDLoc dl(Op);
14358 MVT VT = Op.getSimpleValueType();
14359
14360 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
14361 return SDValue();
14362 auto *CondBV = cast<BuildVectorSDNode>(Cond);
14363
14364 // Only non-legal VSELECTs reach this lowering, convert those into generic
14365 // shuffles and re-use the shuffle lowering path for blends.
14366 SmallVector<int, 32> Mask;
14367 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
14368 SDValue CondElt = CondBV->getOperand(i);
14369 Mask.push_back(
14370 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
14371 : -1);
14372 }
14373 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
14374}
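// Worked example (added commentary, not part of the original source): a v4i32
// VSELECT with the constant condition <-1, 0, -1, 0> selects LHS elements 0
// and 2 and RHS elements 1 and 3, so the helper above emits the shuffle mask
// <0, 5, 2, 7> (RHS lanes are offset by the element count).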
14375
14376SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
14377 // A vselect where all conditions and data are constants can be optimized into
14378 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
14379 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
14380 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
14381 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
14382 return SDValue();
14383
14384 // Try to lower this to a blend-style vector shuffle. This can handle all
14385 // constant condition cases.
14386 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
14387 return BlendOp;
14388
14389 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
14390 // with patterns on the mask registers on AVX-512.
14391 if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
14392 return Op;
14393
14394 // Variable blends are only legal from SSE4.1 onward.
14395 if (!Subtarget.hasSSE41())
14396 return SDValue();
14397
14398 SDLoc dl(Op);
14399 MVT VT = Op.getSimpleValueType();
14400
14401 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
14402 // into an i1 condition so that we can use the mask-based 512-bit blend
14403 // instructions.
14404 if (VT.getSizeInBits() == 512) {
14405 SDValue Cond = Op.getOperand(0);
14406 // The vNi1 condition case should be handled above as it can be trivially
14407 // lowered.
14408 assert(Cond.getValueType().getScalarSizeInBits() ==
14409            VT.getScalarSizeInBits() &&
14410        "Should have a size-matched integer condition!");
14411 // Build a mask by testing the condition against itself (tests for zero).
14412 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
14413 SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
14414 // Now return a new VSELECT using the mask.
14415 return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
14416 }
14417
14418 // Only some types will be legal on some subtargets. If we can emit a legal
14419 // VSELECT-matching blend, return Op; but if we need to expand, return
14420 // a null value.
14421 switch (VT.SimpleTy) {
14422 default:
14423 // Most of the vector types have blends past SSE4.1.
14424 return Op;
14425
14426 case MVT::v32i8:
14427 // The byte blends for AVX vectors were introduced only in AVX2.
14428 if (Subtarget.hasAVX2())
14429 return Op;
14430
14431 return SDValue();
14432
14433 case MVT::v8i16:
14434 case MVT::v16i16:
14435 // FIXME: We should custom lower this by fixing the condition and using i8
14436 // blends.
14437 return SDValue();
14438 }
14439}
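// Illustrative sketch (added commentary, not part of the original source): for
// a v16i32 VSELECT whose condition is also v16i32, the 512-bit path above
// rewrites it roughly as
//   %k = X86ISD::TESTM %cond, %cond   ; v16i1, lane i set iff %cond[i] != 0
//   %r = vselect %k, %lhs, %rhs       ; now matches the AVX-512 masked blend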
14440
14441static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
14442 MVT VT = Op.getSimpleValueType();
14443 SDLoc dl(Op);
14444
14445 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
14446 return SDValue();
14447
14448 if (VT.getSizeInBits() == 8) {
14449 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
14450 Op.getOperand(0), Op.getOperand(1));
14451 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
14452 }
14453
14454 if (VT == MVT::f32) {
14455 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
14456 // the result back to FR32 register. It's only worth matching if the
14457 // result has a single use which is a store or a bitcast to i32. And in
14458 // the case of a store, it's not worth it if the index is a constant 0,
14459 // because a MOVSSmr can be used instead, which is smaller and faster.
14460 if (!Op.hasOneUse())
14461 return SDValue();
14462 SDNode *User = *Op.getNode()->use_begin();
14463 if ((User->getOpcode() != ISD::STORE ||
14464 isNullConstant(Op.getOperand(1))) &&
14465 (User->getOpcode() != ISD::BITCAST ||
14466 User->getValueType(0) != MVT::i32))
14467 return SDValue();
14468 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14469 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
14470 Op.getOperand(1));
14471 return DAG.getBitcast(MVT::f32, Extract);
14472 }
14473
14474 if (VT == MVT::i32 || VT == MVT::i64) {
14475 // ExtractPS/pextrq works with constant index.
14476 if (isa<ConstantSDNode>(Op.getOperand(1)))
14477 return Op;
14478 }
14479
14480 return SDValue();
14481}
14482
14483/// Extract one bit from mask vector, like v16i1 or v8i1.
14484/// AVX-512 feature.
14485static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
14486 const X86Subtarget &Subtarget) {
14487 SDValue Vec = Op.getOperand(0);
14488 SDLoc dl(Vec);
14489 MVT VecVT = Vec.getSimpleValueType();
14490 SDValue Idx = Op.getOperand(1);
14491 MVT EltVT = Op.getSimpleValueType();
14492
14493 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
14494        "Unexpected vector type in ExtractBitFromMaskVector");
14495
14496 // A variable index can't be handled in mask registers;
14497 // extend the vector to VR512/VR128.
14498 if (!isa<ConstantSDNode>(Idx)) {
14499 unsigned NumElts = VecVT.getVectorNumElements();
14500 // Extending v8i1/v16i1 to 512 bits gets better performance on KNL
14501 // than extending to 128/256 bits.
14502 unsigned VecSize = (NumElts <= 4 ? 128 : 512);
14503 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
14504 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
14505 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
14506 ExtVT.getVectorElementType(), Ext, Idx);
14507 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
14508 }
14509
14510 // If the kshift instructions of the correct width aren't natively supported
14511 // then we need to promote the vector to the native size to get the correct
14512 // zeroing behavior.
14513 if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
14514 (VecVT.getVectorNumElements() < 8)) {
14515 VecVT = MVT::v16i1;
14516 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
14517 DAG.getUNDEF(VecVT),
14518 Vec,
14519 DAG.getIntPtrConstant(0, dl));
14520 }
14521
14522 // Use kshiftlw/rw instruction.
14523 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14524 unsigned MaxShift = VecVT.getVectorNumElements() - 1;
14525 if (MaxShift - IdxVal)
14526 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14527 DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
14528 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14529 DAG.getConstant(MaxShift, dl, MVT::i8));
14530 return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
14531 DAG.getIntPtrConstant(0, dl));
14532}
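// Worked example (added commentary, not part of the original source):
// extracting bit 3 from a v16i1 mask k uses MaxShift = 15, so the sequence is
//   kshiftlw k, 12   ; bit 3 moves to bit 15 (the MSB), higher bits discarded
//   kshiftrw k, 15   ; bit 15 moves to bit 0, all upper bits become zero
// leaving the requested bit zero-extended in lane 0 for the final VEXTRACT.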
14533
14534SDValue
14535X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
14536 SelectionDAG &DAG) const {
14537 SDLoc dl(Op);
14538 SDValue Vec = Op.getOperand(0);
14539 MVT VecVT = Vec.getSimpleValueType();
14540 SDValue Idx = Op.getOperand(1);
14541
14542 if (VecVT.getVectorElementType() == MVT::i1)
14543 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
14544
14545 if (!isa<ConstantSDNode>(Idx)) {
14546 // It's more profitable to go through memory (1 cycle throughput)
14547 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
14548 // The IACA tool was used to get the performance estimates
14549 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
14550 //
14551 // example : extractelement <16 x i8> %a, i32 %i
14552 //
14553 // Block Throughput: 3.00 Cycles
14554 // Throughput Bottleneck: Port5
14555 //
14556 // | Num Of | Ports pressure in cycles | |
14557 // | Uops | 0 - DV | 5 | 6 | 7 | |
14558 // ---------------------------------------------
14559 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
14560 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
14561 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
14562 // Total Num Of Uops: 4
14563 //
14564 //
14565 // Block Throughput: 1.00 Cycles
14566 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
14567 //
14568 // | | Ports pressure in cycles | |
14569 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
14570 // ---------------------------------------------------------
14571 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
14572 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
14573 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
14574 // Total Num Of Uops: 4
14575
14576 return SDValue();
14577 }
14578
14579 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14580
14581 // If this is a 256-bit vector result, first extract the 128-bit vector and
14582 // then extract the element from the 128-bit vector.
14583 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
14584 // Get the 128-bit vector.
14585 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
14586 MVT EltVT = VecVT.getVectorElementType();
14587
14588 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
14589 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
14590
14591 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
14592 // this can be done with a mask.
14593 IdxVal &= ElemsPerChunk - 1;
14594 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
14595 DAG.getConstant(IdxVal, dl, MVT::i32));
14596 }
14597
14598 assert(VecVT.is128BitVector() && "Unexpected vector length");
14599
14600 MVT VT = Op.getSimpleValueType();
14601
14602 if (VT.getSizeInBits() == 16) {
14603 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
14604 // we're going to zero extend the register or fold the store (SSE41 only).
14605 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
14606 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
14607 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
14608 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14609 DAG.getBitcast(MVT::v4i32, Vec), Idx));
14610
14611 // Transform it so it matches pextrw, which produces a 32-bit result.
14612 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
14613 Op.getOperand(0), Op.getOperand(1));
14614 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
14615 }
14616
14617 if (Subtarget.hasSSE41())
14618 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
14619 return Res;
14620
14621 // TODO: We only extract a single element from v16i8, we can probably afford
14622 // to be more aggressive here before using the default approach of spilling to
14623 // stack.
14624 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
14625 // Extract either the lowest i32 or any i16, and extract the sub-byte.
14626 int DWordIdx = IdxVal / 4;
14627 if (DWordIdx == 0) {
14628 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14629 DAG.getBitcast(MVT::v4i32, Vec),
14630 DAG.getIntPtrConstant(DWordIdx, dl));
14631 int ShiftVal = (IdxVal % 4) * 8;
14632 if (ShiftVal != 0)
14633 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
14634 DAG.getConstant(ShiftVal, dl, MVT::i32));
14635 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14636 }
14637
14638 int WordIdx = IdxVal / 2;
14639 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
14640 DAG.getBitcast(MVT::v8i16, Vec),
14641 DAG.getIntPtrConstant(WordIdx, dl));
14642 int ShiftVal = (IdxVal % 2) * 8;
14643 if (ShiftVal != 0)
14644 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
14645 DAG.getConstant(ShiftVal, dl, MVT::i16));
14646 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14647 }
14648
14649 if (VT.getSizeInBits() == 32) {
14650 if (IdxVal == 0)
14651 return Op;
14652
14653 // SHUFPS the element to the lowest double word, then movss.
14654 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
14655 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14656 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14657 DAG.getIntPtrConstant(0, dl));
14658 }
14659
14660 if (VT.getSizeInBits() == 64) {
14661 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
14662 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
14663 // to match extract_elt for f64.
14664 if (IdxVal == 0)
14665 return Op;
14666
14667 // UNPCKHPD the element to the lowest double word, then movsd.
14668 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
14669 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
14670 int Mask[2] = { 1, -1 };
14671 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14672 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14673 DAG.getIntPtrConstant(0, dl));
14674 }
14675
14676 return SDValue();
14677}
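// Worked example (added commentary, not part of the original source): for the
// v16i8 sub-byte path above, extracting byte 2 bitcasts to v4i32, extracts
// dword 0, shifts it right by (2 % 4) * 8 = 16 bits, and truncates to i8;
// extracting byte 5 instead bitcasts to v8i16, extracts word 2, shifts right
// by 8, and truncates.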
14678
14679/// Insert one bit to mask vector, like v16i1 or v8i1.
14680/// AVX-512 feature.
14681static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
14682 const X86Subtarget &Subtarget) {
14683 SDLoc dl(Op);
14684 SDValue Vec = Op.getOperand(0);
14685 SDValue Elt = Op.getOperand(1);
14686 SDValue Idx = Op.getOperand(2);
14687 MVT VecVT = Vec.getSimpleValueType();
14688
14689 if (!isa<ConstantSDNode>(Idx)) {
14690 // Non-constant index. Extend the source and destination,
14691 // insert the element, and then truncate the result.
14692 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
14693 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
14694 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
14695 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
14696 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
14697 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
14698 }
14699
14700 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14701 unsigned NumElems = VecVT.getVectorNumElements();
14702
14703 // If the kshift instructions of the correct width aren't natively supported
14704 // then we need to promote the vector to the native size to get the correct
14705 // zeroing behavior.
14706 if ((!Subtarget.hasDQI() && NumElems == 8) || (NumElems < 8)) {
14707 // Need to promote to v16i1, do the insert, then extract back.
14708 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
14709 DAG.getUNDEF(MVT::v16i1), Vec,
14710 DAG.getIntPtrConstant(0, dl));
14711 Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i1, Vec, Elt, Idx);
14712 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Op,
14713 DAG.getIntPtrConstant(0, dl));
14714 }
14715
14716 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
14717
14718 if (Vec.isUndef()) {
14719 if (IdxVal)
14720 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14721 DAG.getConstant(IdxVal, dl, MVT::i8));
14722 return EltInVec;
14723 }
14724
14725 // Insertion of one bit into first position
14726 if (IdxVal == 0 ) {
14727 // Clean top bits of vector.
14728 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14729 DAG.getConstant(NumElems - 1, dl, MVT::i8));
14730 EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
14731 DAG.getConstant(NumElems - 1, dl, MVT::i8));
14732 // Clean the first bit in source vector.
14733 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14734 DAG.getConstant(1 , dl, MVT::i8));
14735 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14736 DAG.getConstant(1, dl, MVT::i8));
14737
14738 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
14739 }
14740 // Insertion of one bit into last position
14741 if (IdxVal == NumElems - 1) {
14742 // Move the bit to the last position inside the vector.
14743 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14744 DAG.getConstant(IdxVal, dl, MVT::i8));
14745 // Clean the last bit in the source vector.
14746 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14747 DAG.getConstant(1, dl, MVT::i8));
14748 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14749 DAG.getConstant(1 , dl, MVT::i8));
14750
14751 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
14752 }
14753
14754 // Move the current value of the bit to be replaced to bit 0.
14755 SDValue Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14756 DAG.getConstant(IdxVal, dl, MVT::i8));
14757 // Xor with the new bit.
14758 Merged = DAG.getNode(ISD::XOR, dl, VecVT, Merged, EltInVec);
14759 // Shift to MSB, filling bottom bits with 0.
14760 Merged = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Merged,
14761 DAG.getConstant(NumElems - 1, dl, MVT::i8));
14762 // Shift to the final position, filling upper bits with 0.
14763 Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Merged,
14764 DAG.getConstant(NumElems - 1 - IdxVal, dl, MVT::i8));
14765 // Xor with original vector to cancel out the original bit value that's still
14766 // present.
14767 return DAG.getNode(ISD::XOR, dl, VecVT, Merged, Vec);
14768}
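// Worked bit-trace (added commentary, not part of the original source) of the
// general case above, inserting bit b at index i into an N-bit mask V:
//   t = V >> i            ; old bit now at position 0
//   t = t ^ b             ; position 0 holds old ^ new
//   t = t << (N - 1)      ; keep only that bit, now at the MSB
//   t = t >> (N - 1 - i)  ; move it to position i, zeros everywhere else
//   result = t ^ V        ; flips bit i exactly when old != new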
14769
14770SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
14771 SelectionDAG &DAG) const {
14772 MVT VT = Op.getSimpleValueType();
14773 MVT EltVT = VT.getVectorElementType();
14774 unsigned NumElts = VT.getVectorNumElements();
14775
14776 if (EltVT == MVT::i1)
14777 return InsertBitToMaskVector(Op, DAG, Subtarget);
14778
14779 SDLoc dl(Op);
14780 SDValue N0 = Op.getOperand(0);
14781 SDValue N1 = Op.getOperand(1);
14782 SDValue N2 = Op.getOperand(2);
14783 if (!isa<ConstantSDNode>(N2))
14784 return SDValue();
14785 auto *N2C = cast<ConstantSDNode>(N2);
14786 unsigned IdxVal = N2C->getZExtValue();
14787
14788 bool IsZeroElt = X86::isZeroNode(N1);
14789 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
14790
14791 // If we are inserting an element, see if we can do this more efficiently with
14792 // a blend shuffle against a rematerializable vector than with a costly integer
14793 // insertion.
14794 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
14795 16 <= EltVT.getSizeInBits()) {
14796 SmallVector<int, 8> BlendMask;
14797 for (unsigned i = 0; i != NumElts; ++i)
14798 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
14799 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
14800 : getOnesVector(VT, DAG, dl);
14801 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
14802 }
14803
14804 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
14805 // into that, and then insert the subvector back into the result.
14806 if (VT.is256BitVector() || VT.is512BitVector()) {
14807 // With a 256-bit vector, we can insert into the zero element efficiently
14808 // using a blend if we have AVX or AVX2 and the right data type.
14809 if (VT.is256BitVector() && IdxVal == 0) {
14810 // TODO: It is worthwhile to cast integer to floating point and back
14811 // and incur a domain crossing penalty if that's what we'll end up
14812 // doing anyway after extracting to a 128-bit vector.
14813 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
14814 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
14815 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
14816 N2 = DAG.getIntPtrConstant(1, dl);
14817 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
14818 }
14819 }
14820
14821 // Get the desired 128-bit vector chunk.
14822 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
14823
14824 // Insert the element into the desired chunk.
14825 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
14826 assert(isPowerOf2_32(NumEltsIn128));
14827 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
14828 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
14829
14830 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
14831 DAG.getConstant(IdxIn128, dl, MVT::i32));
14832
14833 // Insert the changed part back into the bigger vector
14834 return insert128BitVector(N0, V, IdxVal, DAG, dl);
14835 }
14836 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
14837
14838 // Transform it so it match pinsr{b,w} which expects a GR32 as its second
14839 // argument. SSE41 required for pinsrb.
14840 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
14841 unsigned Opc;
14842 if (VT == MVT::v8i16) {
14843 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
14844 Opc = X86ISD::PINSRW;
14845 } else {
14846 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
14847 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
14848 Opc = X86ISD::PINSRB;
14849 }
14850
14851 if (N1.getValueType() != MVT::i32)
14852 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
14853 if (N2.getValueType() != MVT::i32)
14854 N2 = DAG.getIntPtrConstant(IdxVal, dl);
14855 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
14856 }
14857
14858 if (Subtarget.hasSSE41()) {
14859 if (EltVT == MVT::f32) {
14860 // Bits [7:6] of the constant are the source select. This will always be
14861 // zero here. The DAG Combiner may combine an extract_elt index into
14862 // these bits. For example (insert (extract, 3), 2) could be matched by
14863 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
14864 // Bits [5:4] of the constant are the destination select. This is the
14865 // value of the incoming immediate.
14866 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
14867 // combine either bitwise AND or insert of float 0.0 to set these bits.
14868
14869 bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
14870 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
14871 // If this is an insertion of 32-bits into the low 32-bits of
14872 // a vector, we prefer to generate a blend with immediate rather
14873 // than an insertps. Blends are simpler operations in hardware and so
14874 // will always have equal or better performance than insertps.
14875 // But if optimizing for size and there's a load folding opportunity,
14876 // generate insertps because blendps does not have a 32-bit memory
14877 // operand form.
14878 N2 = DAG.getIntPtrConstant(1, dl);
14879 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
14880 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
14881 }
14882 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
14883 // Create this as a scalar-to-vector.
14884 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
14885 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
14886 }
14887
14888 // PINSR* works with constant index.
14889 if (EltVT == MVT::i32 || EltVT == MVT::i64)
14890 return Op;
14891 }
14892
14893 return SDValue();
14894}
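// Worked example (added commentary, not part of the original source): the
// rematerializable-vector path above lowers an insertion of 0 into element 2
// of a v8i16 as a blend with the zero vector using BlendMask
// <0, 1, 10, 3, 4, 5, 6, 7>, where index 10 selects element 2 of the zero
// operand.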
14895
14896static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
14897 SelectionDAG &DAG) {
14898 SDLoc dl(Op);
14899 MVT OpVT = Op.getSimpleValueType();
14900
14901 // It's always cheaper to replace a xor+movd with xorps, and doing so
14902 // simplifies further combines.
14903 if (X86::isZeroNode(Op.getOperand(0)))
14904 return getZeroVector(OpVT, Subtarget, DAG, dl);
14905
14906 // If this is a 256-bit vector result, first insert into a 128-bit
14907 // vector and then insert into the 256-bit vector.
14908 if (!OpVT.is128BitVector()) {
14909 // Insert into a 128-bit vector.
14910 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
14911 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
14912 OpVT.getVectorNumElements() / SizeFactor);
14913
14914 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
14915
14916 // Insert the 128-bit vector.
14917 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
14918 }
14919 assert(OpVT.is128BitVector() && "Expected an SSE type!");
14920
14921 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
14922 if (OpVT == MVT::v4i32)
14923 return Op;
14924
14925 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
14926 return DAG.getBitcast(
14927 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
14928}
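// Illustrative sketch (added commentary, not part of the original source): a
// SCALAR_TO_VECTOR producing v8i32 is handled above by first building a v4i32
// SCALAR_TO_VECTOR of the scalar and then inserting that 128-bit value into
// the low half of an undef 256-bit vector.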
14929
14930// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
14931// simple superregister reference or explicit instructions to insert
14932// the upper bits of a vector.
14933static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
14934 SelectionDAG &DAG) {
14935 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
14936
14937 return insert1BitVector(Op, DAG, Subtarget);
14938}
14939
14940// Returns the appropriate wrapper opcode for a global reference.
14941unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
14942 // References to absolute symbols are never PC-relative.
14943 if (GV && GV->isAbsoluteSymbolRef())
14944 return X86ISD::Wrapper;
14945
14946 CodeModel::Model M = getTargetMachine().getCodeModel();
14947 if (Subtarget.isPICStyleRIPRel() &&
14948 (M == CodeModel::Small || M == CodeModel::Kernel))
14949 return X86ISD::WrapperRIP;
14950
14951 return X86ISD::Wrapper;
14952}
14953
14954 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
14955 // their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
14956 // one of the above-mentioned nodes. It has to be wrapped because otherwise
14957 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
14958 // be used to form an addressing mode. These wrapped nodes will be selected
14959 // into MOV32ri.
14960SDValue
14961X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
14962 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
14963
14964 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14965 // global base reg.
14966 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14967
14968 auto PtrVT = getPointerTy(DAG.getDataLayout());
14969 SDValue Result = DAG.getTargetConstantPool(
14970 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
14971 SDLoc DL(CP);
14972 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14973 // With PIC, the address is actually $g + Offset.
14974 if (OpFlag) {
14975 Result =
14976 DAG.getNode(ISD::ADD, DL, PtrVT,
14977 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14978 }
14979
14980 return Result;
14981}
14982
14983SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
14984 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
14985
14986 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14987 // global base reg.
14988 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14989
14990 auto PtrVT = getPointerTy(DAG.getDataLayout());
14991 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
14992 SDLoc DL(JT);
14993 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14994
14995 // With PIC, the address is actually $g + Offset.
14996 if (OpFlag)
14997 Result =
14998 DAG.getNode(ISD::ADD, DL, PtrVT,
14999 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15000
15001 return Result;
15002}
15003
15004SDValue
15005X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
15006 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
15007
15008 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15009 // global base reg.
15010 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
15011 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
15012
15013 auto PtrVT = getPointerTy(DAG.getDataLayout());
15014 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
15015
15016 SDLoc DL(Op);
15017 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15018
15019 // With PIC, the address is actually $g + Offset.
15020 if (isPositionIndependent() && !Subtarget.is64Bit()) {
15021 Result =
15022 DAG.getNode(ISD::ADD, DL, PtrVT,
15023 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15024 }
15025
15026 // For symbols that require a load from a stub to get the address, emit the
15027 // load.
15028 if (isGlobalStubReference(OpFlag))
15029 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
15030 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15031
15032 return Result;
15033}
15034
15035SDValue
15036X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
15037 // Create the TargetBlockAddressAddress node.
15038 unsigned char OpFlags =
15039 Subtarget.classifyBlockAddressReference();
15040 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
15041 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
15042 SDLoc dl(Op);
15043 auto PtrVT = getPointerTy(DAG.getDataLayout());
15044 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
15045 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
15046
15047 // With PIC, the address is actually $g + Offset.
15048 if (isGlobalRelativeToPICBase(OpFlags)) {
15049 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15050 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15051 }
15052
15053 return Result;
15054}
15055
15056SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
15057 const SDLoc &dl, int64_t Offset,
15058 SelectionDAG &DAG) const {
15059 // Create the TargetGlobalAddress node, folding in the constant
15060 // offset if it is legal.
15061 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
15062 CodeModel::Model M = DAG.getTarget().getCodeModel();
15063 auto PtrVT = getPointerTy(DAG.getDataLayout());
15064 SDValue Result;
15065 if (OpFlags == X86II::MO_NO_FLAG &&
15066 X86::isOffsetSuitableForCodeModel(Offset, M)) {
15067 // A direct static reference to a global.
15068 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
15069 Offset = 0;
15070 } else {
15071 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
15072 }
15073
15074 Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
15075
15076 // With PIC, the address is actually $g + Offset.
15077 if (isGlobalRelativeToPICBase(OpFlags)) {
15078 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15079 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15080 }
15081
15082 // For globals that require a load from a stub to get the address, emit the
15083 // load.
15084 if (isGlobalStubReference(OpFlags))
15085 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
15086 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15087
15088 // If there was a non-zero offset that we didn't fold, create an explicit
15089 // addition for it.
15090 if (Offset != 0)
15091 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
15092 DAG.getConstant(Offset, dl, PtrVT));
15093
15094 return Result;
15095}
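// Illustrative sketch (added commentary, not part of the original source): in
// 32-bit PIC mode a GOT-indirect global g produced by the routine above is
// materialized roughly as
//   %addr = add GlobalBaseReg, Wrapper(g@GOT)   ; address of the GOT slot
//   %g    = load %addr                          ; the real address of g
// with any unfolded constant offset added afterwards.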
15096
15097SDValue
15098X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
15099 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
15100 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
15101 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
15102}
15103
15104static SDValue
15105GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
15106 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
15107 unsigned char OperandFlags, bool LocalDynamic = false) {
15108 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15109 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
15110 SDLoc dl(GA);
15111 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15112 GA->getValueType(0),
15113 GA->getOffset(),
15114 OperandFlags);
15115
15116 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
15117 : X86ISD::TLSADDR;
15118
15119 if (InFlag) {
15120 SDValue Ops[] = { Chain, TGA, *InFlag };
15121 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15122 } else {
15123 SDValue Ops[] = { Chain, TGA };
15124 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15125 }
15126
15127 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
15128 MFI.setAdjustsStack(true);
15129 MFI.setHasCalls(true);
15130
15131 SDValue Flag = Chain.getValue(1);
15132 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
15133}
15134
15135// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
15136static SDValue
15137LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15138 const EVT PtrVT) {
15139 SDValue InFlag;
15140 SDLoc dl(GA); // ? function entry point might be better
15141 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15142 DAG.getNode(X86ISD::GlobalBaseReg,
15143 SDLoc(), PtrVT), InFlag);
15144 InFlag = Chain.getValue(1);
15145
15146 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
15147}
15148
15149// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
15150static SDValue
15151LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15152 const EVT PtrVT) {
15153 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
15154 X86::RAX, X86II::MO_TLSGD);
15155}
15156
15157static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
15158 SelectionDAG &DAG,
15159 const EVT PtrVT,
15160 bool is64Bit) {
15161 SDLoc dl(GA);
15162
15163 // Get the start address of the TLS block for this module.
15164 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
15165 .getInfo<X86MachineFunctionInfo>();
15166 MFI->incNumLocalDynamicTLSAccesses();
15167
15168 SDValue Base;
15169 if (is64Bit) {
15170 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
15171 X86II::MO_TLSLD, /*LocalDynamic=*/true);
15172 } else {
15173 SDValue InFlag;
15174 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15175 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
15176 InFlag = Chain.getValue(1);
15177 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
15178 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
15179 }
15180
15181 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
15182 // of Base.
15183
15184 // Build x@dtpoff.
15185 unsigned char OperandFlags = X86II::MO_DTPOFF;
15186 unsigned WrapperKind = X86ISD::Wrapper;
15187 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15188 GA->getValueType(0),
15189 GA->getOffset(), OperandFlags);
15190 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15191
15192 // Add x@dtpoff with the base.
15193 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
15194}
15195
15196// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
15197static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15198 const EVT PtrVT, TLSModel::Model model,
15199 bool is64Bit, bool isPIC) {
15200 SDLoc dl(GA);
15201
15202 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
15203 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
15204 is64Bit ? 257 : 256));
15205
15206 SDValue ThreadPointer =
15207 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
15208 MachinePointerInfo(Ptr));
15209
15210 unsigned char OperandFlags = 0;
15211 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
15212 // initialexec.
15213 unsigned WrapperKind = X86ISD::Wrapper;
15214 if (model == TLSModel::LocalExec) {
15215 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
15216 } else if (model == TLSModel::InitialExec) {
15217 if (is64Bit) {
15218 OperandFlags = X86II::MO_GOTTPOFF;
15219 WrapperKind = X86ISD::WrapperRIP;
15220 } else {
15221 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
15222 }
15223 } else {
15224 llvm_unreachable("Unexpected model");
15225 }
15226
15227 // emit "addl x@ntpoff,%eax" (local exec)
15228 // or "addl x@indntpoff,%eax" (initial exec)
15229 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
15230 SDValue TGA =
15231 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
15232 GA->getOffset(), OperandFlags);
15233 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15234
15235 if (model == TLSModel::InitialExec) {
15236 if (isPIC && !is64Bit) {
15237 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
15238 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
15239 Offset);
15240 }
15241
15242 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
15243 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15244 }
15245
15246 // The address of the thread local variable is the add of the thread
15247 // pointer with the offset of the variable.
15248 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
15249}
15250
15251SDValue
15252X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
15253
15254 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
15255
15256 if (DAG.getTarget().Options.EmulatedTLS)
15257 return LowerToTLSEmulatedModel(GA, DAG);
15258
15259 const GlobalValue *GV = GA->getGlobal();
15260 auto PtrVT = getPointerTy(DAG.getDataLayout());
15261 bool PositionIndependent = isPositionIndependent();
15262
15263 if (Subtarget.isTargetELF()) {
15264 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
15265 switch (model) {
15266 case TLSModel::GeneralDynamic:
15267 if (Subtarget.is64Bit())
15268 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
15269 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
15270 case TLSModel::LocalDynamic:
15271 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
15272 Subtarget.is64Bit());
15273 case TLSModel::InitialExec:
15274 case TLSModel::LocalExec:
15275 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
15276 PositionIndependent);
15277 }
15278 llvm_unreachable("Unknown TLS model.");
15279 }
15280
15281 if (Subtarget.isTargetDarwin()) {
15282 // Darwin only has one model of TLS. Lower to that.
15283 unsigned char OpFlag = 0;
15284 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
15285 X86ISD::WrapperRIP : X86ISD::Wrapper;
15286
15287 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15288 // global base reg.
15289 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
15290 if (PIC32)
15291 OpFlag = X86II::MO_TLVP_PIC_BASE;
15292 else
15293 OpFlag = X86II::MO_TLVP;
15294 SDLoc DL(Op);
15295 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
15296 GA->getValueType(0),
15297 GA->getOffset(), OpFlag);
15298 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
15299
15300 // With PIC32, the address is actually $g + Offset.
15301 if (PIC32)
15302 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
15303 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
15304 Offset);
15305
15306 // Lowering the machine isd will make sure everything is in the right
15307 // location.
15308 SDValue Chain = DAG.getEntryNode();
15309 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
15310 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
15311 SDValue Args[] = { Chain, Offset };
15312 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
15313 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
15314 DAG.getIntPtrConstant(0, DL, true),
15315 Chain.getValue(1), DL);
15316
15317 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
15318 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15319 MFI.setAdjustsStack(true);
15320
15321 // And our return value (tls address) is in the standard call return value
15322 // location.
15323 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
15324 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
15325 }
15326
15327 if (Subtarget.isTargetKnownWindowsMSVC() ||
15328 Subtarget.isTargetWindowsItanium() ||
15329 Subtarget.isTargetWindowsGNU()) {
15330 // Just use the implicit TLS architecture.
15331 // Need to generate something similar to:
15332 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
15333 // ; from TEB
15334 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
15335 // mov rcx, qword [rdx+rcx*8]
15336 // mov eax, .tls$:tlsvar
15337 // [rax+rcx] contains the address
15338 // Windows 64bit: gs:0x58
15339 // Windows 32bit: fs:__tls_array
15340
15341 SDLoc dl(GA);
15342 SDValue Chain = DAG.getEntryNode();
15343
15344 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
15345 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
15346 // use its literal value of 0x2C.
15347 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
15348 ? Type::getInt8PtrTy(*DAG.getContext(),
15349 256)
15350 : Type::getInt32PtrTy(*DAG.getContext(),
15351 257));
15352
15353 SDValue TlsArray = Subtarget.is64Bit()
15354 ? DAG.getIntPtrConstant(0x58, dl)
15355 : (Subtarget.isTargetWindowsGNU()
15356 ? DAG.getIntPtrConstant(0x2C, dl)
15357 : DAG.getExternalSymbol("_tls_array", PtrVT));
15358
15359 SDValue ThreadPointer =
15360 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
15361
15362 SDValue res;
15363 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
15364 res = ThreadPointer;
15365 } else {
15366 // Load the _tls_index variable
15367 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
15368 if (Subtarget.is64Bit())
15369 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
15370 MachinePointerInfo(), MVT::i32);
15371 else
15372 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
15373
15374 auto &DL = DAG.getDataLayout();
15375 SDValue Scale =
15376 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
15377 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
15378
15379 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
15380 }
15381
15382 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
15383
15384 // Get the offset of start of .tls section
15385 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15386 GA->getValueType(0),
15387 GA->getOffset(), X86II::MO_SECREL);
15388 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
15389
15390 // The address of the thread local variable is the add of the thread
15391 // pointer with the offset of the variable.
15392 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
15393 }
15394
15395 llvm_unreachable("TLS not implemented for this target.")::llvm::llvm_unreachable_internal("TLS not implemented for this target."
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 15395)
;
15396}
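
To make the Windows implicit-TLS sequence lowered above concrete, here is a small self-contained sketch (in ordinary C++) of the address computation it produces. The names tls_block, storage, teb_slot, tls_index and secrel are illustrative stand-ins for the TEB slot at gs:0x58 / fs:__tls_array, the per-module TLS block array, _tls_index and the MO_SECREL offset; they are not the real Windows structures.

  #include <cassert>
  #include <cstdint>

  int main() {
    uint8_t  tls_block[64] = {};              // stand-in for this module's .tls data
    uint8_t *storage[4]    = {nullptr, tls_block, nullptr, nullptr};
    uint8_t **teb_slot     = storage;         // what [gs:0x58] / fs:__tls_array points at
    unsigned tls_index     = 1;               // stand-in for _tls_index from the CRT
    unsigned secrel        = 16;              // stand-in for the variable's .tls offset

    // mov rdx,[gs:58h]; mov ecx,[_tls_index]; mov rcx,[rdx+rcx*8]; lea rax,[rcx+secrel]
    uint8_t *addr = teb_slot[tls_index] + secrel;
    assert(addr == tls_block + 16);
    return 0;
  }
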
15397
15398/// Lower SRA_PARTS and friends, which return two i32 values
15399/// and take a 2 x i32 value to shift plus a shift amount.
15400static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
15401   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
15402 MVT VT = Op.getSimpleValueType();
15403 unsigned VTBits = VT.getSizeInBits();
15404 SDLoc dl(Op);
15405 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
15406 SDValue ShOpLo = Op.getOperand(0);
15407 SDValue ShOpHi = Op.getOperand(1);
15408 SDValue ShAmt = Op.getOperand(2);
15409 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
15410 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
15411 // during isel.
15412 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15413 DAG.getConstant(VTBits - 1, dl, MVT::i8));
15414 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
15415 DAG.getConstant(VTBits - 1, dl, MVT::i8))
15416 : DAG.getConstant(0, dl, VT);
15417
15418 SDValue Tmp2, Tmp3;
15419 if (Op.getOpcode() == ISD::SHL_PARTS) {
15420 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
15421 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
15422 } else {
15423 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
15424 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
15425 }
15426
15427 // If the shift amount is larger or equal than the width of a part we can't
15428 // rely on the results of shld/shrd. Insert a test and select the appropriate
15429 // values for large shift amounts.
15430 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15431 DAG.getConstant(VTBits, dl, MVT::i8));
15432 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
15433 AndNode, DAG.getConstant(0, dl, MVT::i8));
15434
15435 SDValue Hi, Lo;
15436 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
15437 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
15438 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
15439
15440 if (Op.getOpcode() == ISD::SHL_PARTS) {
15441 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15442 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
15443 } else {
15444 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15445 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
15446 }
15447
15448 SDValue Ops[2] = { Lo, Hi };
15449 return DAG.getMergeValues(Ops, dl);
15450}
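
A standalone sketch of the double-shift idea LowerShiftParts implements (SRL flavour): combine the two halves with an SHRD-style expression for amounts below the part width, and select swapped/zeroed halves when bit 5 of the amount is set. This models the selection logic only, not the exact X86ISD node operand order.

  #include <cassert>
  #include <cstdint>

  static void srl_parts(uint32_t lo, uint32_t hi, unsigned amt,
                        uint32_t &out_lo, uint32_t &out_hi) {
    unsigned safe = amt & 31;                                    // the SafeShAmt AND
    uint32_t tmp2 = safe ? ((lo >> safe) | (hi << (32 - safe)))  // SHRD(lo, hi, amt)
                         : lo;
    uint32_t tmp3 = hi >> safe;                                  // SRL(hi, safe)
    if (amt & 32) { out_lo = tmp3; out_hi = 0; }                 // large-amount CMOV arm
    else          { out_lo = tmp2; out_hi = tmp3; }
  }

  int main() {
    const uint64_t v = 0x123456789abcdef0ULL;
    for (unsigned amt = 0; amt < 64; ++amt) {
      uint32_t lo, hi;
      srl_parts(static_cast<uint32_t>(v), static_cast<uint32_t>(v >> 32), amt, lo, hi);
      assert(((static_cast<uint64_t>(hi) << 32) | lo) == (v >> amt));
    }
    return 0;
  }
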
15451
15452SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
15453 SelectionDAG &DAG) const {
15454 SDValue Src = Op.getOperand(0);
15455 MVT SrcVT = Src.getSimpleValueType();
15456 MVT VT = Op.getSimpleValueType();
15457 SDLoc dl(Op);
15458
15459 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15460 if (SrcVT.isVector()) {
15461 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
15462 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
15463 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
15464 DAG.getUNDEF(SrcVT)));
15465 }
15466 if (SrcVT.getVectorElementType() == MVT::i1) {
15467 if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
15468 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15469 DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
15470 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15471 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15472 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
15473 }
15474 return SDValue();
15475 }
15476
15477   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
15478          "Unknown SINT_TO_FP to lower!");
15479
15480 // These are really Legal; return the operand so the caller accepts it as
15481 // Legal.
15482 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
15483 return Op;
15484 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15485 Subtarget.is64Bit()) {
15486 return Op;
15487 }
15488
15489 SDValue ValueToStore = Op.getOperand(0);
15490 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15491 !Subtarget.is64Bit())
15492 // Bitcasting to f64 here allows us to do a single 64-bit store from
15493 // an SSE register, avoiding the store forwarding penalty that would come
15494 // with two 32-bit stores.
15495 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15496
15497 unsigned Size = SrcVT.getSizeInBits()/8;
15498 MachineFunction &MF = DAG.getMachineFunction();
15499 auto PtrVT = getPointerTy(MF.getDataLayout());
15500 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
15501 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15502 SDValue Chain = DAG.getStore(
15503 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15504 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15505 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
15506}
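
A minimal model of the store-then-FILD path taken for the remaining scalar cases: the integer is spilled to a stack slot and the x87 unit converts it. In the real lowering the FILD result is an 80-bit value, so every i64 is representable exactly; the sketch below only exercises values that also survive a 64-bit long double, so it runs correctly on any host.

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  int main() {
    int64_t values[] = {0, -1, 123456789, INT64_MIN};
    for (int64_t v : values) {
      int64_t stack_slot;                          // stand-in for the CreateStackObject slot
      std::memcpy(&stack_slot, &v, sizeof v);      // the DAG.getStore of ValueToStore
      long double fild = static_cast<long double>(stack_slot);  // the FILD
      assert(static_cast<int64_t>(fild) == v);     // exact round trip for these values
    }
    return 0;
  }
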
15507
15508SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
15509 SDValue StackSlot,
15510 SelectionDAG &DAG) const {
15511 // Build the FILD
15512 SDLoc DL(Op);
15513 SDVTList Tys;
15514 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
15515 if (useSSE)
15516 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
15517 else
15518 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
15519
15520 unsigned ByteSize = SrcVT.getSizeInBits()/8;
15521
15522 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
15523 MachineMemOperand *MMO;
15524 if (FI) {
15525 int SSFI = FI->getIndex();
15526 MMO = DAG.getMachineFunction().getMachineMemOperand(
15527 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15528 MachineMemOperand::MOLoad, ByteSize, ByteSize);
15529 } else {
15530 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
15531 StackSlot = StackSlot.getOperand(1);
15532 }
15533 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
15534 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
15535 X86ISD::FILD, DL,
15536 Tys, Ops, SrcVT, MMO);
15537
15538 if (useSSE) {
15539 Chain = Result.getValue(1);
15540 SDValue InFlag = Result.getValue(2);
15541
15542 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
15543 // shouldn't be necessary except that RFP cannot be live across
15544 // multiple blocks. When stackifier is fixed, they can be uncoupled.
15545 MachineFunction &MF = DAG.getMachineFunction();
15546 unsigned SSFISize = Op.getValueSizeInBits()/8;
15547 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
15548 auto PtrVT = getPointerTy(MF.getDataLayout());
15549 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15550 Tys = DAG.getVTList(MVT::Other);
15551 SDValue Ops[] = {
15552 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
15553 };
15554 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15555 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15556 MachineMemOperand::MOStore, SSFISize, SSFISize);
15557
15558 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
15559 Ops, Op.getValueType(), MMO);
15560 Result = DAG.getLoad(
15561 Op.getValueType(), DL, Chain, StackSlot,
15562 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15563 }
15564
15565 return Result;
15566}
15567
15568/// 64-bit unsigned integer to double expansion.
15569SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
15570 SelectionDAG &DAG) const {
15571   // This algorithm is not obvious. Here is what we're trying to output:
15572 /*
15573 movq %rax, %xmm0
15574 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
15575 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
15576 #ifdef __SSE3__
15577 haddpd %xmm0, %xmm0
15578 #else
15579 pshufd $0x4e, %xmm0, %xmm1
15580 addpd %xmm1, %xmm0
15581 #endif
15582 */
15583
15584 SDLoc dl(Op);
15585 LLVMContext *Context = DAG.getContext();
15586
15587 // Build some magic constants.
15588 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
15589 Constant *C0 = ConstantDataVector::get(*Context, CV0);
15590 auto PtrVT = getPointerTy(DAG.getDataLayout());
15591 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
15592
15593 SmallVector<Constant*,2> CV1;
15594 CV1.push_back(
15595 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15596 APInt(64, 0x4330000000000000ULL))));
15597 CV1.push_back(
15598 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15599 APInt(64, 0x4530000000000000ULL))));
15600 Constant *C1 = ConstantVector::get(CV1);
15601 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
15602
15603 // Load the 64-bit value into an XMM register.
15604 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
15605 Op.getOperand(0));
15606 SDValue CLod0 =
15607 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
15608 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15609 /* Alignment = */ 16);
15610 SDValue Unpck1 =
15611 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
15612
15613 SDValue CLod1 =
15614 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
15615 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15616 /* Alignment = */ 16);
15617 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
15618 // TODO: Are there any fast-math-flags to propagate here?
15619 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
15620 SDValue Result;
15621
15622 if (Subtarget.hasSSE3()) {
15623 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
15624 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
15625 } else {
15626 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
15627 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
15628 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
15629 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
15630 }
15631
15632 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
15633 DAG.getIntPtrConstant(0, dl));
15634}
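
A scalar, runnable model of the magic-constant expansion above, assuming IEEE-754 doubles: the low and high 32-bit halves are glued into the mantissas of 2^52 and 2^84 (the 0x43300000 and 0x45300000 words), the same constants are subtracted to recover each half exactly, and the two partial results are added once (the haddpd/addpd step).

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  static double u64_to_double(uint64_t x) {
    auto bits_to_double = [](uint64_t b) { double d; std::memcpy(&d, &b, sizeof d); return d; };
    double lo = bits_to_double(0x4330000000000000ULL | (x & 0xffffffffULL)) -
                bits_to_double(0x4330000000000000ULL);            // exact: low32
    double hi = bits_to_double(0x4530000000000000ULL | (x >> 32)) -
                bits_to_double(0x4530000000000000ULL);            // exact: high32 * 2^32
    return hi + lo;                                               // single rounding
  }

  int main() {
    for (uint64_t v : {0ULL, 1ULL, 0x80000000ULL, 0xffffffffffffffffULL})
      assert(u64_to_double(v) == static_cast<double>(v));
    return 0;
  }
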
15635
15636/// 32-bit unsigned integer to float expansion.
15637SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
15638 SelectionDAG &DAG) const {
15639 SDLoc dl(Op);
15640 // FP constant to bias correct the final result.
15641 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
15642 MVT::f64);
15643
15644 // Load the 32-bit value into an XMM register.
15645 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
15646 Op.getOperand(0));
15647
15648 // Zero out the upper parts of the register.
15649 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
15650
15651 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15652 DAG.getBitcast(MVT::v2f64, Load),
15653 DAG.getIntPtrConstant(0, dl));
15654
15655 // Or the load with the bias.
15656 SDValue Or = DAG.getNode(
15657 ISD::OR, dl, MVT::v2i64,
15658 DAG.getBitcast(MVT::v2i64,
15659 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
15660 DAG.getBitcast(MVT::v2i64,
15661 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
15662 Or =
15663 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15664 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
15665
15666 // Subtract the bias.
15667 // TODO: Are there any fast-math-flags to propagate here?
15668 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
15669
15670 // Handle final rounding.
15671 MVT DestVT = Op.getSimpleValueType();
15672
15673 if (DestVT.bitsLT(MVT::f64))
15674 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
15675 DAG.getIntPtrConstant(0, dl));
15676 if (DestVT.bitsGT(MVT::f64))
15677 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
15678
15679 // Handle final rounding.
15680 return Sub;
15681}
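
A scalar model of the OR-with-bias trick used by LowerUINT_TO_FP_i32, assuming IEEE-754 doubles: the 32-bit value is inserted into the mantissa of 2^52 (the 0x43300000 bias) and the bias is subtracted, which recovers the value exactly.

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  static double u32_to_double(uint32_t x) {
    uint64_t bits = 0x4330000000000000ULL | x;   // 2^52 with x in the low mantissa bits
    double d;
    std::memcpy(&d, &bits, sizeof d);
    return d - 4503599627370496.0;               // subtract 2^52; result is exact
  }

  int main() {
    for (uint32_t v : {0u, 1u, 0x80000000u, 0xffffffffu})
      assert(u32_to_double(v) == static_cast<double>(v));
    return 0;
  }
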
15682
15683static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
15684 const X86Subtarget &Subtarget, SDLoc &DL) {
15685 if (Op.getSimpleValueType() != MVT::v2f64)
15686 return SDValue();
15687
15688 SDValue N0 = Op.getOperand(0);
15689   assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
15690
15691 // Legalize to v4i32 type.
15692 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
15693 DAG.getUNDEF(MVT::v2i32));
15694
15695 if (Subtarget.hasAVX512())
15696 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
15697
15698 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
15699 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
15700 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
15701 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
15702
15703 // Two to the power of half-word-size.
15704 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
15705
15706 // Clear upper part of LO, lower HI.
15707 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
15708 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
15709
15710 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
15711 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
15712 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
15713
15714 // Add the two halves.
15715 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
15716}
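
The half-word split in lowerUINT_TO_FP_v2i32 works because both 16-bit halves fit comfortably in a signed conversion, so CVTSI2P never sees a negative value. A scalar sketch of the same arithmetic:

  #include <cassert>
  #include <cstdint>

  static double u32_via_halves(uint32_t v) {
    double hi = static_cast<double>(static_cast<int32_t>(v >> 16));     // CVTSI2P on HI
    double lo = static_cast<double>(static_cast<int32_t>(v & 0xffffu)); // CVTSI2P on LO
    return hi * 65536.0 + lo;                                           // FMUL by 2^16, FADD
  }

  int main() {
    for (uint32_t v : {0u, 65535u, 65536u, 0xdeadbeefu, 0xffffffffu})
      assert(u32_via_halves(v) == static_cast<double>(v));
    return 0;
  }
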
15717
15718static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
15719 const X86Subtarget &Subtarget) {
15720 // The algorithm is the following:
15721 // #ifdef __SSE4_1__
15722 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15723 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15724 // (uint4) 0x53000000, 0xaa);
15725 // #else
15726 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15727 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
15728 // #endif
15729 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15730 // return (float4) lo + fhi;
15731
15732 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
15733 // reassociate the two FADDs, and if we do that, the algorithm fails
15734 // spectacularly (PR24512).
15735 // FIXME: If we ever have some kind of Machine FMF, this should be marked
15736 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
15737 // there's also the MachineCombiner reassociations happening on Machine IR.
15738 if (DAG.getTarget().Options.UnsafeFPMath)
15739 return SDValue();
15740
15741 SDLoc DL(Op);
15742 SDValue V = Op->getOperand(0);
15743 MVT VecIntVT = V.getSimpleValueType();
15744 bool Is128 = VecIntVT == MVT::v4i32;
15745 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
15746 // If we convert to something else than the supported type, e.g., to v4f64,
15747 // abort early.
15748 if (VecFloatVT != Op->getSimpleValueType(0))
15749 return SDValue();
15750
15751   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
15752          "Unsupported custom type");
15753
15754   // In the #ifdef/#else code, we have in common:
15755 // - The vector of constants:
15756 // -- 0x4b000000
15757 // -- 0x53000000
15758 // - A shift:
15759 // -- v >> 16
15760
15761 // Create the splat vector for 0x4b000000.
15762 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
15763 // Create the splat vector for 0x53000000.
15764 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
15765
15766 // Create the right shift.
15767 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
15768 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
15769
15770 SDValue Low, High;
15771 if (Subtarget.hasSSE41()) {
15772 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
15773 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15774 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
15775 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
15776 // Low will be bitcasted right away, so do not bother bitcasting back to its
15777 // original type.
15778 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
15779 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
15780 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15781 // (uint4) 0x53000000, 0xaa);
15782 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
15783 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
15784 // High will be bitcasted right away, so do not bother bitcasting back to
15785 // its original type.
15786 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
15787 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
15788 } else {
15789 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
15790 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15791 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
15792 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
15793
15794 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
15795 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
15796 }
15797
15798 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
15799 SDValue VecCstFAdd = DAG.getConstantFP(
15800 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
15801
15802 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15803 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
15804 // TODO: Are there any fast-math-flags to propagate here?
15805 SDValue FHigh =
15806 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
15807 // return (float4) lo + fhi;
15808 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
15809 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
15810}
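
A scalar model of the vector algorithm above, assuming IEEE-754 floats: the low/high 16-bit halves are packed into the mantissas of 2^23 and 2^39 (the 0x4b000000/0x53000000 words), both biases are folded into a single FADD with the 0xD3000080 constant, and the final addition performs the only rounding. This is also why the two FADDs must not be reassociated, as the comment on unsafe-fp-math notes.

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  static float u32_to_float(uint32_t v) {
    auto bits_to_float = [](uint32_t b) { float f; std::memcpy(&f, &b, sizeof f); return f; };
    float lo  = bits_to_float(0x4b000000u | (v & 0xffffu));  // 2^23 + low16, exact
    float hi  = bits_to_float(0x53000000u | (v >> 16));      // 2^39 + high16 * 2^16, exact
    float fhi = hi + bits_to_float(0xD3000080u);             // subtract (2^39 + 2^23), exact
    return lo + fhi;                                         // single rounding here
  }

  int main() {
    for (uint32_t v : {0u, 1u, 0x7fffffffu, 0x80000000u, 0xffffffffu})
      assert(u32_to_float(v) == static_cast<float>(v));
    return 0;
  }
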
15811
15812SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
15813 SelectionDAG &DAG) const {
15814 SDValue N0 = Op.getOperand(0);
15815 MVT SrcVT = N0.getSimpleValueType();
15816 SDLoc dl(Op);
15817
15818 if (SrcVT.getVectorElementType() == MVT::i1) {
15819 if (SrcVT == MVT::v2i1)
15820 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15821 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
15822 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15823 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15824 DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
15825 }
15826
15827 switch (SrcVT.SimpleTy) {
15828 default:
15829 llvm_unreachable("Custom UINT_TO_FP is not supported!")::llvm::llvm_unreachable_internal("Custom UINT_TO_FP is not supported!"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 15829)
;
15830 case MVT::v2i32:
15831 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
15832 case MVT::v4i32:
15833 case MVT::v8i32:
15834     assert(!Subtarget.hasAVX512());
15835 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
15836 }
15837}
15838
15839SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
15840 SelectionDAG &DAG) const {
15841 SDValue N0 = Op.getOperand(0);
15842 SDLoc dl(Op);
15843 auto PtrVT = getPointerTy(DAG.getDataLayout());
15844
15845 if (Op.getSimpleValueType().isVector())
15846 return lowerUINT_TO_FP_vec(Op, DAG);
15847
15848 MVT SrcVT = N0.getSimpleValueType();
15849 MVT DstVT = Op.getSimpleValueType();
15850
15851 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
15852 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
15853 // Conversions from unsigned i32 to f32/f64 are legal,
15854 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
15855 return Op;
15856 }
15857
15858 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
15859 return LowerUINT_TO_FP_i64(Op, DAG);
15860 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
15861 return LowerUINT_TO_FP_i32(Op, DAG);
15862 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
15863 return SDValue();
15864
15865 // Make a 64-bit buffer, and use it to build an FILD.
15866 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
15867 if (SrcVT == MVT::i32) {
15868 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
15869 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
15870 StackSlot, MachinePointerInfo());
15871 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
15872 OffsetSlot, MachinePointerInfo());
15873 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
15874 return Fild;
15875 }
15876
15877   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
15878 SDValue ValueToStore = Op.getOperand(0);
15879 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
15880 // Bitcasting to f64 here allows us to do a single 64-bit store from
15881 // an SSE register, avoiding the store forwarding penalty that would come
15882 // with two 32-bit stores.
15883 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15884 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15885 MachinePointerInfo());
15886 // For i64 source, we need to add the appropriate power of 2 if the input
15887 // was negative. This is the same as the optimization in
15888   // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
15889 // we must be careful to do the computation in x87 extended precision, not
15890 // in SSE. (The generic code can't know it's OK to do this, or how to.)
15891 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
15892 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15893 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15894 MachineMemOperand::MOLoad, 8, 8);
15895
15896 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
15897 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
15898 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
15899 MVT::i64, MMO);
15900
15901 APInt FF(32, 0x5F800000ULL);
15902
15903 // Check whether the sign bit is set.
15904 SDValue SignSet = DAG.getSetCC(
15905 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
15906 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
15907
15908 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
15909 SDValue FudgePtr = DAG.getConstantPool(
15910 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
15911
15912 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
15913 SDValue Zero = DAG.getIntPtrConstant(0, dl);
15914 SDValue Four = DAG.getIntPtrConstant(4, dl);
15915 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
15916 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
15917
15918 // Load the value out, extending it from f32 to f80.
15919 // FIXME: Avoid the extend by constructing the right constant pool?
15920 SDValue Fudge = DAG.getExtLoad(
15921 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
15922 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
15923 /* Alignment = */ 4);
15924 // Extend everything to 80 bits to force it to be done on x87.
15925 // TODO: Are there any fast-math-flags to propagate here?
15926 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
15927 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
15928 DAG.getIntPtrConstant(0, dl));
15929}
15930
15931// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
15932// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
15933// just return an <SDValue(), SDValue()> pair.
15934// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
15935// to i16, i32 or i64, and we lower it to a legal sequence.
15936// If lowered to the final integer result we return a <result, SDValue()> pair.
15937// Otherwise we lower it to a sequence ending with a FIST, return a
15938// <FIST, StackSlot> pair, and the caller is responsible for loading
15939// the final integer result from StackSlot.
15940std::pair<SDValue,SDValue>
15941X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
15942 bool IsSigned, bool IsReplace) const {
15943 SDLoc DL(Op);
15944
15945 EVT DstTy = Op.getValueType();
15946 EVT TheVT = Op.getOperand(0).getValueType();
15947 auto PtrVT = getPointerTy(DAG.getDataLayout());
15948
15949 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
15950 // f16 must be promoted before using the lowering in this routine.
15951 // fp128 does not use this lowering.
15952 return std::make_pair(SDValue(), SDValue());
15953 }
15954
15955 // If using FIST to compute an unsigned i64, we'll need some fixup
15956 // to handle values above the maximum signed i64. A FIST is always
15957 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
15958 bool UnsignedFixup = !IsSigned &&
15959 DstTy == MVT::i64 &&
15960 (!Subtarget.is64Bit() ||
15961 !isScalarFPTypeInSSEReg(TheVT));
15962
15963 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
15964 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
15965 // The low 32 bits of the fist result will have the correct uint32 result.
15966     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
15967 DstTy = MVT::i64;
15968 }
15969
15970   assert(DstTy.getSimpleVT() <= MVT::i64 &&
15971          DstTy.getSimpleVT() >= MVT::i16 &&
15972          "Unknown FP_TO_INT to lower!");
15973
15974 // These are really Legal.
15975 if (DstTy == MVT::i32 &&
15976 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15977 return std::make_pair(SDValue(), SDValue());
15978 if (Subtarget.is64Bit() &&
15979 DstTy == MVT::i64 &&
15980 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15981 return std::make_pair(SDValue(), SDValue());
15982
15983 // We lower FP->int64 into FISTP64 followed by a load from a temporary
15984 // stack slot.
15985 MachineFunction &MF = DAG.getMachineFunction();
15986 unsigned MemSize = DstTy.getSizeInBits()/8;
15987 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15988 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15989
15990 unsigned Opc;
15991 switch (DstTy.getSimpleVT().SimpleTy) {
15992 default: llvm_unreachable("Invalid FP_TO_SINT to lower!")::llvm::llvm_unreachable_internal("Invalid FP_TO_SINT to lower!"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 15992)
;
15993 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
15994 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
15995 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
15996 }
15997
15998 SDValue Chain = DAG.getEntryNode();
15999 SDValue Value = Op.getOperand(0);
16000 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
16001
16002 if (UnsignedFixup) {
16003 //
16004 // Conversion to unsigned i64 is implemented with a select,
16005 // depending on whether the source value fits in the range
16006 // of a signed i64. Let Thresh be the FP equivalent of
16007 // 0x8000000000000000ULL.
16008 //
16009 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
16010 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
16011 // Fist-to-mem64 FistSrc
16012 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
16013 // to XOR'ing the high 32 bits with Adjust.
16014 //
16015 // Being a power of 2, Thresh is exactly representable in all FP formats.
16016 // For X87 we'd like to use the smallest FP type for this constant, but
16017 // for DAG type consistency we have to match the FP operand type.
16018
16019 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
16020     LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
16021 bool LosesInfo = false;
16022 if (TheVT == MVT::f64)
16023 // The rounding mode is irrelevant as the conversion should be exact.
16024 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
16025 &LosesInfo);
16026 else if (TheVT == MVT::f80)
16027 Status = Thresh.convert(APFloat::x87DoubleExtended(),
16028 APFloat::rmNearestTiesToEven, &LosesInfo);
16029
16030     assert(Status == APFloat::opOK && !LosesInfo &&
16031            "FP conversion should have been exact");
16032
16033 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
16034
16035 SDValue Cmp = DAG.getSetCC(DL,
16036 getSetCCResultType(DAG.getDataLayout(),
16037 *DAG.getContext(), TheVT),
16038 Value, ThreshVal, ISD::SETLT);
16039 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
16040 DAG.getConstant(0, DL, MVT::i32),
16041 DAG.getConstant(0x80000000, DL, MVT::i32));
16042 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
16043 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
16044 *DAG.getContext(), TheVT),
16045 Value, ThreshVal, ISD::SETLT);
16046 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
16047 }
16048
16049 // FIXME This causes a redundant load/store if the SSE-class value is already
16050 // in memory, such as if it is on the callstack.
16051 if (isScalarFPTypeInSSEReg(TheVT)) {
16052     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
16053 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
16054 MachinePointerInfo::getFixedStack(MF, SSFI));
16055 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
16056 SDValue Ops[] = {
16057 Chain, StackSlot, DAG.getValueType(TheVT)
16058 };
16059
16060 MachineMemOperand *MMO =
16061 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16062 MachineMemOperand::MOLoad, MemSize, MemSize);
16063 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
16064 Chain = Value.getValue(1);
16065 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16066 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16067 }
16068
16069 MachineMemOperand *MMO =
16070 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16071 MachineMemOperand::MOStore, MemSize, MemSize);
16072
16073 if (UnsignedFixup) {
16074
16075 // Insert the FIST, load its result as two i32's,
16076 // and XOR the high i32 with Adjust.
16077
16078 SDValue FistOps[] = { Chain, Value, StackSlot };
16079 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16080 FistOps, DstTy, MMO);
16081
16082 SDValue Low32 =
16083 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
16084 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
16085
16086 SDValue High32 =
16087 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
16088 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
16089
16090 if (Subtarget.is64Bit()) {
16091 // Join High32 and Low32 into a 64-bit result.
16092 // (High32 << 32) | Low32
16093 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
16094 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
16095 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
16096 DAG.getConstant(32, DL, MVT::i8));
16097 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
16098 return std::make_pair(Result, SDValue());
16099 }
16100
16101 SDValue ResultOps[] = { Low32, High32 };
16102
16103 SDValue pair = IsReplace
16104 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
16105 : DAG.getMergeValues(ResultOps, DL);
16106 return std::make_pair(pair, SDValue());
16107 } else {
16108 // Build the FP_TO_INT*_IN_MEM
16109 SDValue Ops[] = { Chain, Value, StackSlot };
16110 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16111 Ops, DstTy, MMO);
16112 return std::make_pair(FIST, StackSlot);
16113 }
16114}
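
A scalar model of the UnsignedFixup path in FP_TO_INTHelper: with only a signed conversion available (modelled here by a plain int64_t cast, which truncates where FIST rounds), values at or above 2^63 are shifted down by the threshold first and the sign bit is XOR'ed back into the result afterwards.

  #include <cassert>
  #include <cstdint>

  static uint64_t f64_to_u64(double value) {
    const double thresh = 9223372036854775808.0;          // 2^63, exact in a double
    uint64_t adjust   = value < thresh ? 0 : 0x8000000000000000ULL;
    double   fist_src = value < thresh ? value : value - thresh;
    return static_cast<uint64_t>(static_cast<int64_t>(fist_src)) ^ adjust;
  }

  int main() {
    assert(f64_to_u64(0.0) == 0);
    assert(f64_to_u64(123.5) == 123);                     // the cast truncates
    assert(f64_to_u64(9223372036854775808.0) == 0x8000000000000000ULL);
    assert(f64_to_u64(18446744073709549568.0) == 18446744073709549568ULL);
    return 0;
  }
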
16115
16116static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
16117 const X86Subtarget &Subtarget) {
16118 MVT VT = Op->getSimpleValueType(0);
16119 SDValue In = Op->getOperand(0);
16120 MVT InVT = In.getSimpleValueType();
16121 SDLoc dl(Op);
16122
16123 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16124 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16125 (VT != MVT::v16i16 || InVT != MVT::v16i8) &&
16126 (VT != MVT::v8i64 || InVT != MVT::v8i32) &&
16127 (VT != MVT::v8i64 || InVT != MVT::v8i16) &&
16128 (VT != MVT::v16i32 || InVT != MVT::v16i16) &&
16129 (VT != MVT::v16i32 || InVT != MVT::v16i8) &&
16130 (VT != MVT::v32i16 || InVT != MVT::v32i8))
16131 return SDValue();
16132
16133 if (Subtarget.hasInt256())
16134 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
16135
16136 // Optimize vectors in AVX mode:
16137 //
16138 // v8i16 -> v8i32
16139 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
16140 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
16141 // Concat upper and lower parts.
16142 //
16143 // v4i32 -> v4i64
16144 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
16145 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
16146 // Concat upper and lower parts.
16147 //
16148
16149 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
16150 SDValue Undef = DAG.getUNDEF(InVT);
16151 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
16152 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16153 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16154
16155 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
16156 VT.getVectorNumElements()/2);
16157
16158 OpLo = DAG.getBitcast(HVT, OpLo);
16159 OpHi = DAG.getBitcast(HVT, OpHi);
16160
16161 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16162}
16163
16164static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
16165 const X86Subtarget &Subtarget,
16166 SelectionDAG &DAG) {
16167 MVT VT = Op->getSimpleValueType(0);
16168 SDValue In = Op->getOperand(0);
16169 MVT InVT = In.getSimpleValueType();
16170   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
16171 SDLoc DL(Op);
16172 unsigned NumElts = VT.getVectorNumElements();
16173
16174 // Extend VT if the scalar type is v8/v16 and BWI is not supported.
16175 MVT ExtVT = VT;
16176 if (!Subtarget.hasBWI() &&
16177 (VT.getVectorElementType().getSizeInBits() <= 16))
16178 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
16179
16180 // Widen to 512-bits if VLX is not supported.
16181 MVT WideVT = ExtVT;
16182 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
16183 NumElts *= 512 / ExtVT.getSizeInBits();
16184 InVT = MVT::getVectorVT(MVT::i1, NumElts);
16185 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
16186 In, DAG.getIntPtrConstant(0, DL));
16187 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
16188 NumElts);
16189 }
16190
16191 SDValue One = DAG.getConstant(1, DL, WideVT);
16192 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL);
16193
16194 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
16195
16196 // Truncate if we had to extend i16/i8 above.
16197 if (VT != ExtVT) {
16198 WideVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16199 SelectedVal = DAG.getNode(X86ISD::VTRUNC, DL, WideVT, SelectedVal);
16200 }
16201
16202 // Extract back to 128/256-bit if we widened.
16203 if (WideVT != VT)
16204 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
16205 DAG.getIntPtrConstant(0, DL));
16206
16207 return SelectedVal;
16208}
16209
16210static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
16211 SelectionDAG &DAG) {
16212 SDValue In = Op->getOperand(0);
16213 MVT InVT = In.getSimpleValueType();
16214
16215 if (InVT.getVectorElementType() == MVT::i1)
16216 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
16217
16218 if (Subtarget.hasFp256())
16219 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
16220 return Res;
16221
16222 return SDValue();
16223}
16224
16225static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
16226 SelectionDAG &DAG) {
16227 SDValue In = Op.getOperand(0);
16228 MVT SVT = In.getSimpleValueType();
16229
16230 if (SVT.getVectorElementType() == MVT::i1)
16231 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
16232
16233 if (Subtarget.hasFp256())
16234 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
16235 return Res;
16236
16237   assert(!Op.getSimpleValueType().is256BitVector() || !SVT.is128BitVector() ||
16238          Op.getSimpleValueType().getVectorNumElements() !=
16239          SVT.getVectorNumElements());
16240 return SDValue();
16241}
16242
16243/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
16244/// It makes use of the fact that vectors with enough leading sign/zero bits
16245/// prevent the PACKSS/PACKUS from saturating the results.
16246/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
16247/// within each 128-bit lane.
16248static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
16249 const SDLoc &DL, SelectionDAG &DAG,
16250 const X86Subtarget &Subtarget) {
16251   assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
16252          "Unexpected PACK opcode");
16253
16254 // Requires SSE2 but AVX512 has fast truncate.
16255 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
16256 return SDValue();
16257
16258 EVT SrcVT = In.getValueType();
16259
16260 // No truncation required, we might get here due to recursive calls.
16261 if (SrcVT == DstVT)
16262 return In;
16263
16264 // We only support vector truncation to 128bits or greater from a
16265 // 256bits or greater source.
16266 unsigned DstSizeInBits = DstVT.getSizeInBits();
16267 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
16268 if ((DstSizeInBits % 128) != 0 || (SrcSizeInBits % 256) != 0)
16269 return SDValue();
16270
16271 LLVMContext &Ctx = *DAG.getContext();
16272 unsigned NumElems = SrcVT.getVectorNumElements();
16273   assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
16274   assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
16275
16276 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
16277
16278 // Extract lower/upper subvectors.
16279 unsigned NumSubElts = NumElems / 2;
16280 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
16281 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
16282
16283 // Pack to the largest type possible:
16284 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
16285 EVT InVT = MVT::i16, OutVT = MVT::i8;
16286 if (DstVT.getScalarSizeInBits() > 8 &&
16287 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
16288 InVT = MVT::i32;
16289 OutVT = MVT::i16;
16290 }
16291
16292 unsigned SubSizeInBits = SrcSizeInBits / 2;
16293 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
16294 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
16295
16296 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
16297 if (SrcVT.is256BitVector()) {
16298 Lo = DAG.getBitcast(InVT, Lo);
16299 Hi = DAG.getBitcast(InVT, Hi);
16300 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
16301 return DAG.getBitcast(DstVT, Res);
16302 }
16303
16304 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
16305 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
16306 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
16307 Lo = DAG.getBitcast(InVT, Lo);
16308 Hi = DAG.getBitcast(InVT, Hi);
16309 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
16310
16311 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
16312 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
16313 Res = DAG.getBitcast(MVT::v4i64, Res);
16314 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
16315
16316 if (DstVT.is256BitVector())
16317 return DAG.getBitcast(DstVT, Res);
16318
16319 // If 512bit -> 128bit truncate another stage.
16320 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
16321 Res = DAG.getBitcast(PackedVT, Res);
16322 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
16323 }
16324
16325 // Recursively pack lower/upper subvectors, concat result and pack again.
16326   assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater");
16327 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
16328 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
16329 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
16330
16331 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
16332 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
16333 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
16334}
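
A scalar illustration of why truncateVectorWithPACK demands enough leading sign (or zero) bits: PACKSS/PACKUS saturate rather than truncate, so they only reproduce a plain truncation when the wider lanes already sign/zero-extend from the packed width.

  #include <cassert>
  #include <cstdint>

  static int16_t packss_lane(int32_t v) {        // per-lane PACKSSDW behaviour
    if (v > INT16_MAX) return INT16_MAX;
    if (v < INT16_MIN) return INT16_MIN;
    return static_cast<int16_t>(v);
  }

  int main() {
    assert(packss_lane(-5) == static_cast<int16_t>(-5));   // sign-extends from 16 bits: OK
    assert(packss_lane(70000) == INT16_MAX);                // saturates...
    assert(static_cast<int16_t>(70000 & 0xffff) == 4464);   // ...instead of truncating
    return 0;
  }
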
16335
16336static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
16337 const X86Subtarget &Subtarget) {
16338
16339 SDLoc DL(Op);
16340 MVT VT = Op.getSimpleValueType();
16341 SDValue In = Op.getOperand(0);
16342 MVT InVT = In.getSimpleValueType();
16343
16344   assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
16345
16346 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
16347 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
16348 if (InVT.getScalarSizeInBits() <= 16) {
16349 if (Subtarget.hasBWI()) {
16350 // legal, will go to VPMOVB2M, VPMOVW2M
16351 // Shift packed bytes not supported natively, bitcast to word
16352 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
16353 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
16354 DAG.getBitcast(ExtVT, In),
16355 DAG.getConstant(ShiftInx, DL, ExtVT));
16356 ShiftNode = DAG.getBitcast(InVT, ShiftNode);
16357 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
16358 }
16359 // Use TESTD/Q, extended vector to packed dword/qword.
16360     assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
16361            "Unexpected vector type.");
16362 unsigned NumElts = InVT.getVectorNumElements();
16363 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
16364 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
16365 InVT = ExtVT;
16366 ShiftInx = InVT.getScalarSizeInBits() - 1;
16367 }
16368
16369 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
16370 DAG.getConstant(ShiftInx, DL, InVT));
16371 return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
16372}
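
A scalar model of the LSB-to-MSB trick in LowerTruncateVecI1: truncation to i1 keeps only bit 0 of each lane, which the lowering exposes by shifting that bit into the sign position so VPMOVW2M/TESTM can read the MSB.

  #include <cassert>
  #include <cstdint>

  static bool trunc_to_i1(uint16_t lane) {
    uint16_t shifted = static_cast<uint16_t>(lane << 15);  // move bit 0 into the sign bit
    return (shifted & 0x8000u) != 0;                       // MSB test (the TESTM/MOV2M step)
  }

  int main() {
    assert(trunc_to_i1(0x0001) == true);
    assert(trunc_to_i1(0xfffe) == false);                  // all higher bits are ignored
    assert(trunc_to_i1(0x0003) == true);
    return 0;
  }
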
16373
16374SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
16375 SDLoc DL(Op);
16376 MVT VT = Op.getSimpleValueType();
16377 SDValue In = Op.getOperand(0);
16378 MVT InVT = In.getSimpleValueType();
16379 unsigned InNumEltBits = InVT.getScalarSizeInBits();
16380
16381   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
16382          "Invalid TRUNCATE operation");
16383
16384 if (VT.getVectorElementType() == MVT::i1)
16385 return LowerTruncateVecI1(Op, DAG, Subtarget);
16386
16387 // vpmovqb/w/d, vpmovdb/w, vpmovwb
16388 if (Subtarget.hasAVX512()) {
16389 // word to byte only under BWI
16390 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
16391 return DAG.getNode(X86ISD::VTRUNC, DL, VT,
16392 getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
16393 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
16394 }
16395
16396 // Truncate with PACKSS if we are truncating a vector with sign-bits that
16397 // extend all the way to the packed/truncated value.
16398 unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
16399 if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In))
16400 if (SDValue V =
16401 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
16402 return V;
16403
16404 // Truncate with PACKUS if we are truncating a vector with leading zero bits
16405 // that extend all the way to the packed/truncated value.
16406 // Pre-SSE41 we can only use PACKUSWB.
16407 KnownBits Known;
16408 DAG.computeKnownBits(In, Known);
16409 NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
16410 if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros())
16411 if (SDValue V =
16412 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
16413 return V;
16414
16415 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
16416 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
16417 if (Subtarget.hasInt256()) {
16418 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
16419 In = DAG.getBitcast(MVT::v8i32, In);
16420 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
16421 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
16422 DAG.getIntPtrConstant(0, DL));
16423 }
16424
16425 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16426 DAG.getIntPtrConstant(0, DL));
16427 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16428 DAG.getIntPtrConstant(2, DL));
16429 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
16430 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
16431 static const int ShufMask[] = {0, 2, 4, 6};
16432 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
16433 }
16434
16435 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
16436 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
16437 if (Subtarget.hasInt256()) {
16438 In = DAG.getBitcast(MVT::v32i8, In);
16439
16440 // The PSHUFB mask:
16441 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
16442 -1, -1, -1, -1, -1, -1, -1, -1,
16443 16, 17, 20, 21, 24, 25, 28, 29,
16444 -1, -1, -1, -1, -1, -1, -1, -1 };
16445 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
16446 In = DAG.getBitcast(MVT::v4i64, In);
16447
16448 static const int ShufMask2[] = {0, 2, -1, -1};
16449 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
16450 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16451 DAG.getIntPtrConstant(0, DL));
16452 return DAG.getBitcast(VT, In);
16453 }
16454
16455 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
16456 DAG.getIntPtrConstant(0, DL));
16457
16458 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
16459 DAG.getIntPtrConstant(4, DL));
16460
16461 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
16462 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
16463
16464 // The PSHUFB mask:
16465 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
16466 -1, -1, -1, -1, -1, -1, -1, -1};
16467
16468 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
16469 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
16470
16471 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
16472 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
16473
16474 // The MOVLHPS Mask:
16475 static const int ShufMask2[] = {0, 1, 4, 5};
16476 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
16477 return DAG.getBitcast(MVT::v8i16, res);
16478 }
16479
16480 // Handle truncation of V256 to V128 using shuffles.
16481 if (!VT.is128BitVector() || !InVT.is256BitVector())
16482 return SDValue();
16483
16484 assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
16485
16486 unsigned NumElems = VT.getVectorNumElements();
16487 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
16488
16489 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
16490 // Prepare truncation shuffle mask
16491 for (unsigned i = 0; i != NumElems; ++i)
16492 MaskVec[i] = i * 2;
16493 In = DAG.getBitcast(NVT, In);
16494 SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
16495 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
16496 DAG.getIntPtrConstant(0, DL));
16497}
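
The two PACKSS/PACKUS guards above reduce to a simple scalar property: a saturating pack preserves the value only if every discarded high bit is a copy of the sign bit (PACKSS) or zero (PACKUS). A minimal standalone sketch of that property for a 32-bit to 16-bit truncation; fitsPackSS/fitsPackUS are hypothetical helpers for illustration, not LLVM APIs.

#include <cassert>
#include <cstdint>

// Hypothetical scalar model of the guards above for a 32 -> 16 bit truncation.
static bool fitsPackSS(int32_t V) {
  // All discarded high bits are copies of the sign bit.
  return V >= INT16_MIN && V <= INT16_MAX;
}
static bool fitsPackUS(int32_t V) {
  // All discarded high bits are zero.
  return static_cast<uint32_t>(V) <= UINT16_MAX;
}

int main() {
  assert(fitsPackSS(-5) && !fitsPackUS(-5));       // sign-extended: PACKSS only
  assert(fitsPackUS(60000) && !fitsPackSS(60000)); // zero-extended: PACKUS only
  assert(fitsPackSS(1000) && fitsPackUS(1000));    // small positive: either works
  return 0;
}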
16498
16499SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
16500 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
16501 MVT VT = Op.getSimpleValueType();
16502
16503 if (VT.isVector()) {
16504 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
16505 SDValue Src = Op.getOperand(0);
16506 SDLoc dl(Op);
16507 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
16508 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
16509 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
16510 DAG.getUNDEF(MVT::v2f32)));
16511 }
16512
16513 return SDValue();
16514 }
16515
16516 assert(!VT.isVector());
16517
16518 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
16519 IsSigned, /*IsReplace=*/ false);
16520 SDValue FIST = Vals.first, StackSlot = Vals.second;
16521 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
16522 if (!FIST.getNode())
16523 return Op;
16524
16525 if (StackSlot.getNode())
16526 // Load the result.
16527 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
16528
16529 // The node is the result.
16530 return FIST;
16531}
16532
16533static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
16534 SDLoc DL(Op);
16535 MVT VT = Op.getSimpleValueType();
16536 SDValue In = Op.getOperand(0);
16537 MVT SVT = In.getSimpleValueType();
16538
16539 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
16540
16541 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
16542 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
16543 In, DAG.getUNDEF(SVT)));
16544}
16545
16546/// The only differences between FABS and FNEG are the mask and the logic op.
16547/// FNEG also has a folding opportunity for FNEG(FABS(x)).
16548static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
16549 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
16550 "Wrong opcode for lowering FABS or FNEG.");
16551
16552 bool IsFABS = (Op.getOpcode() == ISD::FABS);
16553
16554 // If this is a FABS and it has an FNEG user, bail out to fold the combination
16555 // into an FNABS. We'll lower the FABS after that if it is still in use.
16556 if (IsFABS)
16557 for (SDNode *User : Op->uses())
16558 if (User->getOpcode() == ISD::FNEG)
16559 return Op;
16560
16561 SDLoc dl(Op);
16562 MVT VT = Op.getSimpleValueType();
16563
16564 bool IsF128 = (VT == MVT::f128);
16565
16566 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
16567 // decide if we should generate a 16-byte constant mask when we only need 4 or
16568 // 8 bytes for the scalar case.
16569
16570 MVT LogicVT;
16571 MVT EltVT;
16572
16573 if (VT.isVector()) {
16574 LogicVT = VT;
16575 EltVT = VT.getVectorElementType();
16576 } else if (IsF128) {
16577 // SSE instructions are used for optimized f128 logical operations.
16578 LogicVT = MVT::f128;
16579 EltVT = VT;
16580 } else {
16581 // There are no scalar bitwise logical SSE/AVX instructions, so we
16582 // generate a 16-byte vector constant and logic op even for the scalar case.
16583 // Using a 16-byte mask allows folding the load of the mask with
16584 // the logic op, so it can save (~4 bytes) on code size.
16585 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16586 EltVT = VT;
16587 }
16588
16589 unsigned EltBits = EltVT.getSizeInBits();
16590 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
16591 APInt MaskElt =
16592 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
16593 const fltSemantics &Sem =
16594 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
16595 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16596 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
16597
16598 SDValue Op0 = Op.getOperand(0);
16599 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
16600 unsigned LogicOp =
16601 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
16602 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
16603
16604 if (VT.isVector() || IsF128)
16605 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16606
16607 // For the scalar case extend to a 128-bit vector, perform the logic op,
16608 // and extract the scalar result back out.
16609 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
16610 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16611 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
16612 DAG.getIntPtrConstant(0, dl));
16613}
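
The FABS/FNEG masks built above (all bits except the sign for FABS, only the sign bit for FNEG) can be checked with plain scalar bit manipulation. This is a sketch of the mask constants only, assuming the IEEE-754 binary32 layout; it models the masks, not the DAG lowering.

#include <cassert>
#include <cstdint>
#include <cstring>

// Scalar illustration of the 32-bit masks: 0x7fffffff clears the sign (FABS),
// 0x80000000 flips it (FNEG).
static float fabsBits(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits &= 0x7fffffffu;                 // APInt::getSignedMaxValue(32)
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}
static float fnegBits(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u;                 // APInt::getSignMask(32)
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}

int main() {
  assert(fabsBits(-2.5f) == 2.5f);
  assert(fnegBits(2.5f) == -2.5f);
  return 0;
}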
16614
16615static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
16616 SDValue Mag = Op.getOperand(0);
16617 SDValue Sign = Op.getOperand(1);
16618 SDLoc dl(Op);
16619
16620 // If the sign operand is smaller, extend it first.
16621 MVT VT = Op.getSimpleValueType();
16622 if (Sign.getSimpleValueType().bitsLT(VT))
16623 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
16624
16625 // And if it is bigger, shrink it first.
16626 if (Sign.getSimpleValueType().bitsGT(VT))
16627 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
16628
16629 // At this point the operands and the result should have the same
16630 // type, and that won't be f80 since that is not custom lowered.
16631 bool IsF128 = (VT == MVT::f128);
16632 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
16633 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
16634 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
16635 "Unexpected type in LowerFCOPYSIGN");
16636
16637 MVT EltVT = VT.getScalarType();
16638 const fltSemantics &Sem =
16639 EltVT == MVT::f64 ? APFloat::IEEEdouble()
16640 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16641
16642 // Perform all scalar logic operations as 16-byte vectors because there are no
16643 // scalar FP logic instructions in SSE.
16644 // TODO: This isn't necessary. If we used scalar types, we might avoid some
16645 // unnecessary splats, but we might miss load folding opportunities. Should
16646 // this decision be based on OptimizeForSize?
16647 bool IsFakeVector = !VT.isVector() && !IsF128;
16648 MVT LogicVT = VT;
16649 if (IsFakeVector)
16650 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16651
16652 // The mask constants are automatically splatted for vector types.
16653 unsigned EltSizeInBits = VT.getScalarSizeInBits();
16654 SDValue SignMask = DAG.getConstantFP(
16655 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16656 SDValue MagMask = DAG.getConstantFP(
16657 APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16658
16659 // First, clear all bits but the sign bit from the second operand (sign).
16660 if (IsFakeVector)
16661 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
16662 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
16663
16664 // Next, clear the sign bit from the first operand (magnitude).
16665 // TODO: If we had general constant folding for FP logic ops, this check
16666 // wouldn't be necessary.
16667 SDValue MagBits;
16668 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
16669 APFloat APF = Op0CN->getValueAPF();
16670 APF.clearSign();
16671 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
16672 } else {
16673 // If the magnitude operand wasn't a constant, we need to AND out the sign.
16674 if (IsFakeVector)
16675 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
16676 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
16677 }
16678
16679 // OR the magnitude value with the sign bit.
16680 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
16681 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
16682 DAG.getIntPtrConstant(0, dl));
16683}
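
The FAND/FOR sequence above is the usual bit-level copysign recipe: keep all magnitude bits of Mag and OR in only the sign bit of Sign. A minimal scalar f32 model, again assuming the IEEE-754 binary32 layout and shown for illustration only.

#include <cassert>
#include <cstdint>
#include <cstring>

// Scalar model of (Mag & ~SignMask) | (Sign & SignMask) for 32-bit floats.
static float copySignBits(float Mag, float Sign) {
  uint32_t M, S;
  std::memcpy(&M, &Mag, sizeof(M));
  std::memcpy(&S, &Sign, sizeof(S));
  uint32_t R = (M & 0x7fffffffu) | (S & 0x80000000u);
  float Out;
  std::memcpy(&Out, &R, sizeof(Out));
  return Out;
}

int main() {
  assert(copySignBits(3.0f, -1.0f) == -3.0f);
  assert(copySignBits(-3.0f, 1.0f) == 3.0f);
  return 0;
}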
16684
16685static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
16686 SDValue N0 = Op.getOperand(0);
16687 SDLoc dl(Op);
16688 MVT VT = Op.getSimpleValueType();
16689
16690 MVT OpVT = N0.getSimpleValueType();
16691 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
16692 "Unexpected type for FGETSIGN");
16693
16694 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
16695 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
16696 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
16697 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
16698 Res = DAG.getZExtOrTrunc(Res, dl, VT);
16699 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
16700 return Res;
16701}
16702
16703// Check whether an OR'd tree is PTEST-able.
16704static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
16705 SelectionDAG &DAG) {
16706 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
16707
16708 if (!Subtarget.hasSSE41())
16709 return SDValue();
16710
16711 if (!Op->hasOneUse())
16712 return SDValue();
16713
16714 SDNode *N = Op.getNode();
16715 SDLoc DL(N);
16716
16717 SmallVector<SDValue, 8> Opnds;
16718 DenseMap<SDValue, unsigned> VecInMap;
16719 SmallVector<SDValue, 8> VecIns;
16720 EVT VT = MVT::Other;
16721
16722 // Recognize a special case where a vector is cast into a wide integer to
16723 // test all 0s.
16724 Opnds.push_back(N->getOperand(0));
16725 Opnds.push_back(N->getOperand(1));
16726
16727 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
16728 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
16729 // BFS traverse all OR'd operands.
16730 if (I->getOpcode() == ISD::OR) {
16731 Opnds.push_back(I->getOperand(0));
16732 Opnds.push_back(I->getOperand(1));
16733 // Re-evaluate the number of nodes to be traversed.
16734 e += 2; // 2 more nodes (LHS and RHS) are pushed.
16735 continue;
16736 }
16737
16738 // Quit if this is not an EXTRACT_VECTOR_ELT.
16739 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16740 return SDValue();
16741
16742 // Quit if the index is not a constant.
16743 SDValue Idx = I->getOperand(1);
16744 if (!isa<ConstantSDNode>(Idx))
16745 return SDValue();
16746
16747 SDValue ExtractedFromVec = I->getOperand(0);
16748 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
16749 if (M == VecInMap.end()) {
16750 VT = ExtractedFromVec.getValueType();
16751 // Quit if not 128/256-bit vector.
16752 if (!VT.is128BitVector() && !VT.is256BitVector())
16753 return SDValue();
16754 // Quit if not the same type.
16755 if (VecInMap.begin() != VecInMap.end() &&
16756 VT != VecInMap.begin()->first.getValueType())
16757 return SDValue();
16758 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
16759 VecIns.push_back(ExtractedFromVec);
16760 }
16761 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
16762 }
16763
16764 assert((VT.is128BitVector() || VT.is256BitVector()) &&
16765 "Not extracted from 128-/256-bit vector.");
16766
16767 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
16768
16769 for (DenseMap<SDValue, unsigned>::const_iterator
16770 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
16771 // Quit if not all elements are used.
16772 if (I->second != FullMask)
16773 return SDValue();
16774 }
16775
16776 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
16777
16778 // Cast all vectors into TestVT for PTEST.
16779 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
16780 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
16781
16782 // If more than one full vector is evaluated, OR them first before PTEST.
16783 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
16784 // Each iteration will OR 2 nodes and append the result until there is only
16785 // 1 node left, i.e. the final OR'd value of all vectors.
16786 SDValue LHS = VecIns[Slot];
16787 SDValue RHS = VecIns[Slot + 1];
16788 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
16789 }
16790
16791 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
16792}
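
The transform above works because OR-ing every extracted element and comparing the result with zero asks the same question as PTEST(v, v), whose ZF output is set exactly when every bit of the vector is zero. A small scalar model of that equivalence; orTreeIsZero/ptestAllZero are illustrative helpers, not LLVM APIs.

#include <array>
#include <cassert>
#include <cstdint>

static bool orTreeIsZero(const std::array<uint32_t, 4> &V) {
  uint32_t Acc = 0;
  for (uint32_t E : V)
    Acc |= E;                 // the OR'd tree of extracted elements
  return Acc == 0;
}

static bool ptestAllZero(const std::array<uint32_t, 4> &V) {
  for (uint32_t E : V)
    if (E != 0)
      return false;           // PTEST v,v sets ZF iff every bit is zero
  return true;
}

int main() {
  std::array<uint32_t, 4> Zero = {0, 0, 0, 0};
  std::array<uint32_t, 4> NonZero = {0, 7, 0, 0};
  assert(orTreeIsZero(Zero) == ptestAllZero(Zero));
  assert(orTreeIsZero(NonZero) == ptestAllZero(NonZero));
  return 0;
}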
16793
16794/// \brief return true if \c Op has a use that doesn't just read flags.
16795static bool hasNonFlagsUse(SDValue Op) {
16796 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
16797 ++UI) {
16798 SDNode *User = *UI;
16799 unsigned UOpNo = UI.getOperandNo();
16800 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
16801 // Look past the truncate.
16802 UOpNo = User->use_begin().getOperandNo();
16803 User = *User->use_begin();
16804 }
16805
16806 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
16807 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
16808 return true;
16809 }
16810 return false;
16811}
16812
16813// Emit KTEST instruction for bit vectors on AVX-512
16814static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
16815 const X86Subtarget &Subtarget) {
16816 if (Op.getOpcode() == ISD::BITCAST) {
16817 auto hasKTEST = [&](MVT VT) {
16818 unsigned SizeInBits = VT.getSizeInBits();
16819 return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
16820 (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
16821 };
16822 SDValue Op0 = Op.getOperand(0);
16823 MVT Op0VT = Op0.getValueType().getSimpleVT();
16824 if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
16825 hasKTEST(Op0VT))
16826 return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
16827 }
16828 return SDValue();
16829}
16830
16831/// Emit nodes that will be selected as "test Op0,Op0", or something
16832/// equivalent.
16833SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
16834 SelectionDAG &DAG) const {
16835 if (Op.getValueType() == MVT::i1) {
16836 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
16837 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
16838 DAG.getConstant(0, dl, MVT::i8));
16839 }
16840 // CF and OF aren't always set the way we want. Determine which
16841 // of these we need.
16842 bool NeedCF = false;
16843 bool NeedOF = false;
16844 switch (X86CC) {
16845 default: break;
16846 case X86::COND_A: case X86::COND_AE:
16847 case X86::COND_B: case X86::COND_BE:
16848 NeedCF = true;
16849 break;
16850 case X86::COND_G: case X86::COND_GE:
16851 case X86::COND_L: case X86::COND_LE:
16852 case X86::COND_O: case X86::COND_NO: {
16853 // Check if we really need to set the
16854 // Overflow flag. If NoSignedWrap is present
16855 // that is not actually needed.
16856 switch (Op->getOpcode()) {
16857 case ISD::ADD:
16858 case ISD::SUB:
16859 case ISD::MUL:
16860 case ISD::SHL:
16861 if (Op.getNode()->getFlags().hasNoSignedWrap())
16862 break;
16863 LLVM_FALLTHROUGH;
16864 default:
16865 NeedOF = true;
16866 break;
16867 }
16868 break;
16869 }
16870 }
16871 // See if we can use the EFLAGS value from the operand instead of
16872 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
16873 // we prove that the arithmetic won't overflow, we can't use OF or CF.
16874 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
16875 // Emit KTEST for bit vectors
16876 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16877 return Node;
16878 // Emit a CMP with 0, which is the TEST pattern.
16879 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16880 DAG.getConstant(0, dl, Op.getValueType()));
16881 }
16882 unsigned Opcode = 0;
16883 unsigned NumOperands = 0;
16884
16885 // Truncate operations may prevent the merge of the SETCC instruction
16886 // and the arithmetic instruction before it. Attempt to truncate the operands
16887 // of the arithmetic instruction and use a reduced bit-width instruction.
16888 bool NeedTruncation = false;
16889 SDValue ArithOp = Op;
16890 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
16891 SDValue Arith = Op->getOperand(0);
16892 // Both the trunc and the arithmetic op need to have one user each.
16893 if (Arith->hasOneUse())
16894 switch (Arith.getOpcode()) {
16895 default: break;
16896 case ISD::ADD:
16897 case ISD::SUB:
16898 case ISD::AND:
16899 case ISD::OR:
16900 case ISD::XOR: {
16901 NeedTruncation = true;
16902 ArithOp = Arith;
16903 }
16904 }
16905 }
16906
16907 // Sometimes flags can be set either with an AND or with an SRL/SHL
16908 // instruction. The SRL/SHL variant should be preferred for masks longer than this
16909 // number of bits.
16910 const int ShiftToAndMaxMaskWidth = 32;
16911 const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
16912
16913 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
16914 // which may be the result of a CAST. We use the variable 'Op', which is the
16915 // non-casted variable when we check for possible users.
16916 switch (ArithOp.getOpcode()) {
16917 case ISD::ADD:
16918 // We only want to rewrite this as a target-specific node with attached
16919 // flags if there is a reasonable chance of either using that to do custom
16920 // instructions selection that can fold some of the memory operands, or if
16921 // only the flags are used. If there are other uses, leave the node alone
16922 // and emit a test instruction.
16923 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16924 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16925 if (UI->getOpcode() != ISD::CopyToReg &&
16926 UI->getOpcode() != ISD::SETCC &&
16927 UI->getOpcode() != ISD::STORE)
16928 goto default_case;
16929
16930 if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
16931 // An add of one will be selected as an INC.
16932 if (C->isOne() &&
16933 (!Subtarget.slowIncDec() ||
16934 DAG.getMachineFunction().getFunction()->optForSize())) {
16935 Opcode = X86ISD::INC;
16936 NumOperands = 1;
16937 break;
16938 }
16939
16940 // An add of negative one (subtract of one) will be selected as a DEC.
16941 if (C->isAllOnesValue() &&
16942 (!Subtarget.slowIncDec() ||
16943 DAG.getMachineFunction().getFunction()->optForSize())) {
16944 Opcode = X86ISD::DEC;
16945 NumOperands = 1;
16946 break;
16947 }
16948 }
16949
16950 // Otherwise use a regular EFLAGS-setting add.
16951 Opcode = X86ISD::ADD;
16952 NumOperands = 2;
16953 break;
16954 case ISD::SHL:
16955 case ISD::SRL:
16956 // If we have a constant logical shift that's only used in a comparison
16957 // against zero turn it into an equivalent AND. This allows turning it into
16958 // a TEST instruction later.
16959 if (ZeroCheck && Op->hasOneUse() &&
16960 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
16961 EVT VT = Op.getValueType();
16962 unsigned BitWidth = VT.getSizeInBits();
16963 unsigned ShAmt = Op->getConstantOperandVal(1);
16964 if (ShAmt >= BitWidth) // Avoid undefined shifts.
16965 break;
16966 APInt Mask = ArithOp.getOpcode() == ISD::SRL
16967 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
16968 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
16969 if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16970 break;
16971 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
16972 DAG.getConstant(Mask, dl, VT));
16973 }
16974 break;
16975
16976 case ISD::AND:
16977 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
16978 // because a TEST instruction will be better. However, AND should be
16979 // preferred if the instruction can be combined into ANDN.
16980 if (!hasNonFlagsUse(Op)) {
16981 SDValue Op0 = ArithOp->getOperand(0);
16982 SDValue Op1 = ArithOp->getOperand(1);
16983 EVT VT = ArithOp.getValueType();
16984 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
16985 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
16986 bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
16987
16988 // If we cannot select an ANDN instruction, check if we can replace
16989 // AND+IMM64 with a shift before giving up. This is possible for masks
16990 // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
16991 if (!isProperAndn) {
16992 if (!ZeroCheck)
16993 break;
16994
16995 assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
16996 auto *CN = dyn_cast<ConstantSDNode>(Op1);
16997 if (!CN)
16998 break;
16999
17000 const APInt &Mask = CN->getAPIntValue();
17001 if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
17002 break; // Prefer TEST instruction.
17003
17004 unsigned BitWidth = Mask.getBitWidth();
17005 unsigned LeadingOnes = Mask.countLeadingOnes();
17006 unsigned TrailingZeros = Mask.countTrailingZeros();
17007
17008 if (LeadingOnes + TrailingZeros == BitWidth) {
17009 assert(TrailingZeros < VT.getSizeInBits() &&
17010 "Shift amount should be less than the type width");
17011 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
17012 SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
17013 Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
17014 break;
17015 }
17016
17017 unsigned LeadingZeros = Mask.countLeadingZeros();
17018 unsigned TrailingOnes = Mask.countTrailingOnes();
17019
17020 if (LeadingZeros + TrailingOnes == BitWidth) {
17021 assert(LeadingZeros < VT.getSizeInBits() &&
17022 "Shift amount should be less than the type width");
17023 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
17024 SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
17025 Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
17026 break;
17027 }
17028
17029 break;
17030 }
17031 }
17032 LLVM_FALLTHROUGH;
17033 case ISD::SUB:
17034 case ISD::OR:
17035 case ISD::XOR:
17036 // Similar to ISD::ADD above, check if the uses will preclude useful
17037 // lowering of the target-specific node.
17038 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
17039 UE = Op.getNode()->use_end(); UI != UE; ++UI)
17040 if (UI->getOpcode() != ISD::CopyToReg &&
17041 UI->getOpcode() != ISD::SETCC &&
17042 UI->getOpcode() != ISD::STORE)
17043 goto default_case;
17044
17045 // Otherwise use a regular EFLAGS-setting instruction.
17046 switch (ArithOp.getOpcode()) {
17047 default: llvm_unreachable("unexpected operator!");
17048 case ISD::SUB: Opcode = X86ISD::SUB; break;
17049 case ISD::XOR: Opcode = X86ISD::XOR; break;
17050 case ISD::AND: Opcode = X86ISD::AND; break;
17051 case ISD::OR: {
17052 if (!NeedTruncation && ZeroCheck) {
17053 if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
17054 return EFLAGS;
17055 }
17056 Opcode = X86ISD::OR;
17057 break;
17058 }
17059 }
17060
17061 NumOperands = 2;
17062 break;
17063 case X86ISD::ADD:
17064 case X86ISD::SUB:
17065 case X86ISD::INC:
17066 case X86ISD::DEC:
17067 case X86ISD::OR:
17068 case X86ISD::XOR:
17069 case X86ISD::AND:
17070 return SDValue(Op.getNode(), 1);
17071 default:
17072 default_case:
17073 break;
17074 }
17075
17076 // If we found that truncation is beneficial, perform the truncation and
17077 // update 'Op'.
17078 if (NeedTruncation) {
17079 EVT VT = Op.getValueType();
17080 SDValue WideVal = Op->getOperand(0);
17081 EVT WideVT = WideVal.getValueType();
17082 unsigned ConvertedOp = 0;
17083 // Use a target machine opcode to prevent further DAGCombine
17084 // optimizations that may separate the arithmetic operations
17085 // from the setcc node.
17086 switch (WideVal.getOpcode()) {
17087 default: break;
17088 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
17089 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
17090 case ISD::AND: ConvertedOp = X86ISD::AND; break;
17091 case ISD::OR: ConvertedOp = X86ISD::OR; break;
17092 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
17093 }
17094
17095 if (ConvertedOp) {
17096 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17097 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
17098 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
17099 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
17100 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
17101 }
17102 }
17103 }
17104
17105 if (Opcode == 0) {
17106 // Emit KTEST for bit vectors
17107 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
17108 return Node;
17109
17110 // Emit a CMP with 0, which is the TEST pattern.
17111 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
17112 DAG.getConstant(0, dl, Op.getValueType()));
17113 }
17114 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17115 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
17116
17117 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
17118 DAG.ReplaceAllUsesWith(Op, New);
17119 return SDValue(New.getNode(), 1);
17120}
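
One of the rewrites above (the AND-with-mask case) rests on a simple identity: when the mask is a block of leading ones followed by trailing zeros, such as 0xFF000000, testing (x & Mask) == 0 is equivalent to testing (x >> TrailingZeros) == 0, which is what lets AND with a wide immediate become a shift feeding the zero check. A brute-force sketch of that identity for the SRL direction; the helper names are made up for illustration.

#include <cassert>
#include <cstdint>

static bool andMaskIsZero(uint32_t X, uint32_t Mask) { return (X & Mask) == 0; }
static bool shiftedIsZero(uint32_t X, unsigned TrailingZeros) {
  return (X >> TrailingZeros) == 0;
}

int main() {
  const uint32_t Mask = 0xFF000000u;  // leading ones, then trailing zeros
  const unsigned TZ = 24;             // countTrailingZeros(Mask)
  const uint32_t Tests[] = {0u, 0x00FFFFFFu, 0x01000000u, 0xABCDEF12u};
  for (uint32_t X : Tests)
    assert(andMaskIsZero(X, Mask) == shiftedIsZero(X, TZ));
  return 0;
}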
17121
17122/// Emit nodes that will be selected as "cmp Op0,Op1", or something
17123/// equivalent.
17124SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
17125 const SDLoc &dl, SelectionDAG &DAG) const {
17126 if (isNullConstant(Op1))
17127 return EmitTest(Op0, X86CC, dl, DAG);
17128
17129 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
17130 "Unexpected comparison operation for MVT::i1 operands");
17131
17132 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
17133 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
17134 // Only promote the compare up to I32 if it is a 16 bit operation
17135 // with an immediate. 16 bit immediates are to be avoided.
17136 if ((Op0.getValueType() == MVT::i16 &&
17137 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
17138 !DAG.getMachineFunction().getFunction()->optForMinSize() &&
17139 !Subtarget.isAtom()) {
17140 unsigned ExtendOp =
17141 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
17142 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
17143 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
17144 }
17145 // Use SUB instead of CMP to enable CSE between SUB and CMP.
17146 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
17147 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
17148 return SDValue(Sub.getNode(), 1);
17149 }
17150 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
17151}
17152
17153/// Convert a comparison if required by the subtarget.
17154SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
17155 SelectionDAG &DAG) const {
17156 // If the subtarget does not support the FUCOMI instruction, floating-point
17157 // comparisons have to be converted.
17158 if (Subtarget.hasCMov() ||
17159 Cmp.getOpcode() != X86ISD::CMP ||
17160 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
17161 !Cmp.getOperand(1).getValueType().isFloatingPoint())
17162 return Cmp;
17163
17164 // The instruction selector will select an FUCOM instruction instead of
17165 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
17166 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
17167 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
17168 SDLoc dl(Cmp);
17169 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
17170 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
17171 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
17172 DAG.getConstant(8, dl, MVT::i8));
17173 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
17174
17175 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
17176 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
17177 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
17178}
17179
17180/// Check if replacement of SQRT with RSQRT should be disabled.
17181bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
17182 EVT VT = Op.getValueType();
17183
17184 // We never want to use both SQRT and RSQRT instructions for the same input.
17185 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
17186 return false;
17187
17188 if (VT.isVector())
17189 return Subtarget.hasFastVectorFSQRT();
17190 return Subtarget.hasFastScalarFSQRT();
17191}
17192
17193/// The minimum architected relative accuracy is 2^-12. We need one
17194/// Newton-Raphson step to have a good float result (24 bits of precision).
17195SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
17196 SelectionDAG &DAG, int Enabled,
17197 int &RefinementSteps,
17198 bool &UseOneConstNR,
17199 bool Reciprocal) const {
17200 EVT VT = Op.getValueType();
17201
17202 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
17203 // TODO: Add support for AVX512 (v16f32).
17204 // It is likely not profitable to do this for f64 because a double-precision
17205 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
17206 // instructions: convert to single, rsqrtss, convert back to double, refine
17207 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
17208 // along with FMA, this could be a throughput win.
17209 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
17210 // after legalize types.
17211 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
17212 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
17213 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
17214 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
17215 if (RefinementSteps == ReciprocalEstimate::Unspecified)
17216 RefinementSteps = 1;
17217
17218 UseOneConstNR = false;
17219 return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
17220 }
17221 return SDValue();
17222}
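
The single refinement step referenced above is the standard Newton-Raphson iteration for 1/sqrt(x), y' = y * (1.5 - 0.5 * x * y * y). A scalar sketch showing that one step improves a rough estimate; this models the math only, not the DAG combine that consumes X86ISD::FRSQRT.

#include <cassert>
#include <cmath>

// One Newton-Raphson step for the reciprocal square root.
static float refineRsqrt(float X, float Est) {
  return Est * (1.5f - 0.5f * X * Est * Est);
}

int main() {
  const float X = 2.0f;
  const float Exact = 1.0f / std::sqrt(X);
  float Est = 0.70f;                    // crude initial estimate of 1/sqrt(2)
  float Refined = refineRsqrt(X, Est);
  // Each step roughly doubles the number of correct bits.
  assert(std::fabs(Refined - Exact) < std::fabs(Est - Exact));
  return 0;
}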
17223
17224/// The minimum architected relative accuracy is 2^-12. We need one
17225/// Newton-Raphson step to have a good float result (24 bits of precision).
17226SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
17227 int Enabled,
17228 int &RefinementSteps) const {
17229 EVT VT = Op.getValueType();
17230
17231 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
17232 // TODO: Add support for AVX512 (v16f32).
17233 // It is likely not profitable to do this for f64 because a double-precision
17234 // reciprocal estimate with refinement on x86 prior to FMA requires
17235 // 15 instructions: convert to single, rcpss, convert back to double, refine
17236 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
17237 // along with FMA, this could be a throughput win.
17238
17239 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
17240 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
17241 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
17242 // Enable estimate codegen with 1 refinement step for vector division.
17243 // Scalar division estimates are disabled because they break too much
17244 // real-world code. These defaults are intended to match GCC behavior.
17245 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
17246 return SDValue();
17247
17248 if (RefinementSteps == ReciprocalEstimate::Unspecified)
17249 RefinementSteps = 1;
17250
17251 return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
17252 }
17253 return SDValue();
17254}
17255
17256/// If we have at least two divisions that use the same divisor, convert to
17257/// multiplication by a reciprocal. This may need to be adjusted for a given
17258/// CPU if a division's cost is not at least twice the cost of a multiplication.
17259/// This is because we still need one division to calculate the reciprocal and
17260/// then we need two multiplies by that reciprocal as replacements for the
17261/// original divisions.
17262unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
17263 return 2;
17264}
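
The arithmetic behind returning 2 here: with two divisions that share a divisor, one divide (to form the reciprocal) plus two multiplies replaces two divides, which pays off only when a divide costs at least roughly twice a multiply. A tiny numeric sketch; the tolerance is chosen for illustration, since multiplying by a rounded reciprocal is not bit-identical to dividing.

#include <cassert>
#include <cmath>

int main() {
  const double A = 6.0, B = 9.0, D = 3.0;
  const double Recip = 1.0 / D;        // the single remaining division
  const double Q1 = A * Recip;         // replaces A / D
  const double Q2 = B * Recip;         // replaces B / D
  assert(std::fabs(Q1 - A / D) < 1e-12);
  assert(std::fabs(Q2 - B / D) < 1e-12);
  return 0;
}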
17265
17266/// Helper for creating a X86ISD::SETCC node.
17267static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
17268 SelectionDAG &DAG) {
17269 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17270 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
17271}
17272
17273/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
17274/// according to equal/not-equal condition code \p CC.
17275static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
17276 const SDLoc &dl, SelectionDAG &DAG) {
17277 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
17278 // instruction. Since the shift amount is in-range-or-undefined, we know
17279 // that doing a bittest on the i32 value is ok. We extend to i32 because
17280 // the encoding for the i16 version is larger than the i32 version.
17281 // Also promote i16 to i32 for performance / code size reasons.
17282 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
17283 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
17284
17285 // See if we can use the 32-bit instruction instead of the 64-bit one for a
17286 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
17287 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
17288 // known to be zero.
17289 if (Src.getValueType() == MVT::i64 &&
17290 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
17291 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
17292
17293 // If the operand types disagree, extend the shift amount to match. Since
17294 // BT ignores high bits (like shifts) we can use anyextend.
17295 if (Src.getValueType() != BitNo.getValueType())
17296 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
17297
17298 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
17299 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
17300 return getSETCC(Cond, BT, dl , DAG);
17301}
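
The BT node built above copies bit (BitNo mod operand width) of Src into CF, and the following SETCC with COND_AE or COND_B tests CF == 0 or CF == 1. A scalar model of that behavior for the 32-bit case; bitTest is a hypothetical helper for illustration.

#include <cassert>
#include <cstdint>

// Models BT r32, r32: the bit index is taken modulo 32 and lands in CF.
static bool bitTest(uint32_t Src, unsigned BitNo) {
  return (Src >> (BitNo & 31u)) & 1u;
}

int main() {
  const uint32_t V = 0xA;      // bits 1 and 3 set
  assert(bitTest(V, 1) && bitTest(V, 3));
  assert(!bitTest(V, 0) && !bitTest(V, 2));
  assert(bitTest(V, 33));      // 33 mod 32 == 1
  return 0;
}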
17302
17303/// Result of 'and' is compared against zero. Change to a BT node if possible.
17304static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
17305 const SDLoc &dl, SelectionDAG &DAG) {
17306 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
17307 SDValue Op0 = And.getOperand(0);
17308 SDValue Op1 = And.getOperand(1);
17309 if (Op0.getOpcode() == ISD::TRUNCATE)
17310 Op0 = Op0.getOperand(0);
17311 if (Op1.getOpcode() == ISD::TRUNCATE)
17312 Op1 = Op1.getOperand(0);
17313
17314 SDValue LHS, RHS;
17315 if (Op1.getOpcode() == ISD::SHL)
17316 std::swap(Op0, Op1);
17317 if (Op0.getOpcode() == ISD::SHL) {
17318 if (isOneConstant(Op0.getOperand(0))) {
17319 // If we looked past a truncate, check that it's only truncating away
17320 // known zeros.
17321 unsigned BitWidth = Op0.getValueSizeInBits();
17322 unsigned AndBitWidth = And.getValueSizeInBits();
17323 if (BitWidth > AndBitWidth) {
17324 KnownBits Known;
17325 DAG.computeKnownBits(Op0, Known);
17326 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
17327 return SDValue();
17328 }
17329 LHS = Op1;
17330 RHS = Op0.getOperand(1);
17331 }
17332 } else if (Op1.getOpcode() == ISD::Constant) {
17333 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
17334 uint64_t AndRHSVal = AndRHS->getZExtValue();
17335 SDValue AndLHS = Op0;
17336
17337 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
17338 LHS = AndLHS.getOperand(0);
17339 RHS = AndLHS.getOperand(1);
17340 }
17341
17342 // Use BT if the immediate can't be encoded in a TEST instruction.
17343 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
17344 LHS = AndLHS;
17345 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
17346 }
17347 }
17348
17349 if (LHS.getNode())
17350 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
17351
17352 return SDValue();
17353}
17354
17355/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
17356/// CMPs.
17357static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
17358 SDValue &Op1) {
17359 unsigned SSECC;
17360 bool Swap = false;
17361
17362 // SSE Condition code mapping:
17363 // 0 - EQ
17364 // 1 - LT
17365 // 2 - LE
17366 // 3 - UNORD
17367 // 4 - NEQ
17368 // 5 - NLT
17369 // 6 - NLE
17370 // 7 - ORD
17371 switch (SetCCOpcode) {
17372 default: llvm_unreachable("Unexpected SETCC condition");
17373 case ISD::SETOEQ:
17374 case ISD::SETEQ: SSECC = 0; break;
17375 case ISD::SETOGT:
17376 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
17377 case ISD::SETLT:
17378 case ISD::SETOLT: SSECC = 1; break;
17379 case ISD::SETOGE:
17380 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
17381 case ISD::SETLE:
17382 case ISD::SETOLE: SSECC = 2; break;
17383 case ISD::SETUO: SSECC = 3; break;
17384 case ISD::SETUNE:
17385 case ISD::SETNE: SSECC = 4; break;
17386 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
17387 case ISD::SETUGE: SSECC = 5; break;
17388 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
17389 case ISD::SETUGT: SSECC = 6; break;
17390 case ISD::SETO: SSECC = 7; break;
17391 case ISD::SETUEQ: SSECC = 8; break;
17392 case ISD::SETONE: SSECC = 12; break;
17393 }
17394 if (Swap)
17395 std::swap(Op0, Op1);
17396
17397 return SSECC;
17398}
17399
17400 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
17401/// concatenate the result back.
17402static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
17403 MVT VT = Op.getSimpleValueType();
17404
17405 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
17406 "Unsupported value type for operation");
17407
17408 unsigned NumElems = VT.getVectorNumElements();
17409 SDLoc dl(Op);
17410 SDValue CC = Op.getOperand(2);
17411
17412 // Extract the LHS vectors
17413 SDValue LHS = Op.getOperand(0);
17414 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
17415 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
17416
17417 // Extract the RHS vectors
17418 SDValue RHS = Op.getOperand(1);
17419 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
17420 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
17421
17422 // Issue the operation on the smaller types and concatenate the result back
17423 MVT EltVT = VT.getVectorElementType();
17424 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
17425 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
17426 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
17427 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
17428}
17429
17430static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
17431 SDValue Op0 = Op.getOperand(0);
17432 SDValue Op1 = Op.getOperand(1);
17433 SDValue CC = Op.getOperand(2);
17434 MVT VT = Op.getSimpleValueType();
17435 SDLoc dl(Op);
17436
17437 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
17438 "Unexpected type for boolean compare operation");
17439 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17440 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
17441 DAG.getConstant(-1, dl, VT));
17442 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
17443 DAG.getConstant(-1, dl, VT));
17444 switch (SetCCOpcode) {
17445 default: llvm_unreachable("Unexpected SETCC condition");
17446 case ISD::SETEQ:
17447 // (x == y) -> ~(x ^ y)
17448 return DAG.getNode(ISD::XOR, dl, VT,
17449 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
17450 DAG.getConstant(-1, dl, VT));
17451 case ISD::SETNE:
17452 // (x != y) -> (x ^ y)
17453 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
17454 case ISD::SETUGT:
17455 case ISD::SETGT:
17456 // (x > y) -> (x & ~y)
17457 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
17458 case ISD::SETULT:
17459 case ISD::SETLT:
17460 // (x < y) -> (~x & y)
17461 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
17462 case ISD::SETULE:
17463 case ISD::SETLE:
17464 // (x <= y) -> (~x | y)
17465 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
17466 case ISD::SETUGE:
17467 case ISD::SETGE:
17468 // (x >= y) -> (x | ~y)
17469 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
17470 }
17471}
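
The i1 identities used above can be checked exhaustively over the two boolean values. A brute-force sketch, treating each i1 lane as an unsigned bit (so '>' means true > false); illustration only.

#include <cassert>

int main() {
  for (unsigned X = 0; X <= 1; ++X)
    for (unsigned Y = 0; Y <= 1; ++Y) {
      assert((X == Y) == !(X ^ Y));              // (x == y) -> ~(x ^ y)
      assert((X != Y) == ((X ^ Y) != 0));        // (x != y) -> (x ^ y)
      assert((X > Y)  == ((X & ~Y & 1) != 0));   // (x > y)  -> (x & ~y)
      assert((X < Y)  == ((~X & Y & 1) != 0));   // (x < y)  -> (~x & y)
      assert((X <= Y) == (((~X | Y) & 1) != 0)); // (x <= y) -> (~x | y)
      assert((X >= Y) == (((X | ~Y) & 1) != 0)); // (x >= y) -> (x | ~y)
    }
  return 0;
}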
17472
17473static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
17474
17475 SDValue Op0 = Op.getOperand(0);
17476 SDValue Op1 = Op.getOperand(1);
17477 SDValue CC = Op.getOperand(2);
17478 MVT VT = Op.getSimpleValueType();
17479 SDLoc dl(Op);
17480
17481 assert(VT.getVectorElementType() == MVT::i1 &&
17482 "Cannot set masked compare for this operation");
17483
17484 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17485 unsigned Opc = 0;
17486 bool Unsigned = false;
17487 bool Swap = false;
17488 unsigned SSECC;
17489 switch (SetCCOpcode) {
17490 default: llvm_unreachable("Unexpected SETCC condition");
17491 case ISD::SETNE: SSECC = 4; break;
17492 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
17493 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
17494 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
17495 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
17496 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
17497 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
17498 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
17499 case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
17500 case ISD::SETLE: SSECC = 2; break;
17501 }
17502
17503 if (Swap)
17504 std::swap(Op0, Op1);
17505
17506 // See if it is the case of CMP(EQ|NEQ,AND(A,B),ZERO) and change it to TESTM|NM.
17507 if ((!Opc && SSECC == 4) || Opc == X86ISD::PCMPEQM) {
17508 SDValue A = peekThroughBitcasts(Op0);
17509 if ((A.getOpcode() == ISD::AND || A.getOpcode() == X86ISD::FAND) &&
17510 ISD::isBuildVectorAllZeros(Op1.getNode())) {
17511 MVT VT0 = Op0.getSimpleValueType();
17512 SDValue RHS = DAG.getBitcast(VT0, A.getOperand(0));
17513 SDValue LHS = DAG.getBitcast(VT0, A.getOperand(1));
17514 return DAG.getNode(Opc == X86ISD::PCMPEQM ? X86ISD::TESTNM : X86ISD::TESTM,
17515 dl, VT, RHS, LHS);
17516 }
17517 }
17518
17519 if (Opc)
17520 return DAG.getNode(Opc, dl, VT, Op0, Op1);
17521 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
17522 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17523 DAG.getConstant(SSECC, dl, MVT::i8));
17524}
17525
17526/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
17527/// operand \p Op1. If non-trivial (for example because it's not constant)
17528/// return an empty value.
17529static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
17530 SelectionDAG &DAG) {
17531 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
17532 if (!BV)
17533 return SDValue();
17534
17535 MVT VT = Op1.getSimpleValueType();
17536 MVT EVT = VT.getVectorElementType();
17537 unsigned n = VT.getVectorNumElements();
17538 SmallVector<SDValue, 8> ULTOp1;
17539
17540 for (unsigned i = 0; i < n; ++i) {
17541 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
17542 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
17543 return SDValue();
17544
17545 // Avoid underflow.
17546 APInt Val = Elt->getAPIntValue();
17547 if (Val == 0)
17548 return SDValue();
17549
17550 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
17551 }
17552
17553 return DAG.getBuildVector(VT, dl, ULTOp1);
17554}
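
The rewrite above relies on the identity that, for an unsigned nonzero constant C, x < C is equivalent to x <= C - 1; the underflow check in the loop is what rules out C == 0. A small sketch of that identity with an assumed constant.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C = 100;      // must be nonzero, as the code above enforces
  const uint32_t Tests[] = {0u, 99u, 100u, 101u, UINT32_MAX};
  for (uint32_t X : Tests)
    assert((X < C) == (X <= C - 1));
  return 0;
}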
17555
17556static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
17557 SelectionDAG &DAG) {
17558 SDValue Op0 = Op.getOperand(0);
17559 SDValue Op1 = Op.getOperand(1);
17560 SDValue CC = Op.getOperand(2);
17561 MVT VT = Op.getSimpleValueType();
17562 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
17563 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
17564 SDLoc dl(Op);
17565
17566 if (isFP) {
17567#ifndef NDEBUG
17568 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
17569 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
17570#endif
17571
17572 unsigned Opc;
17573 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
17574 assert(VT.getVectorNumElements() <= 16);
17575 Opc = X86ISD::CMPM;
17576 } else {
17577 Opc = X86ISD::CMPP;
17578 // The SSE/AVX packed FP comparison nodes are defined with a
17579 // floating-point vector result that matches the operand type. This allows
17580 // them to work with an SSE1 target (integer vector types are not legal).
17581 VT = Op0.getSimpleValueType();
17582 }
17583
17584 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
17585 // emit two comparisons and a logic op to tie them together.
17586 SDValue Cmp;
17587 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
17588 if (SSECC >= 8 && !Subtarget.hasAVX()) {
17589 // LLVM predicate is SETUEQ or SETONE.
17590 unsigned CC0, CC1;
17591 unsigned CombineOpc;
17592 if (Cond == ISD::SETUEQ) {
17593 CC0 = 3; // UNORD
17594 CC1 = 0; // EQ
17595 CombineOpc = X86ISD::FOR;
17596 } else {
17597 assert(Cond == ISD::SETONE);
17598 CC0 = 7; // ORD
17599 CC1 = 4; // NEQ
17600 CombineOpc = X86ISD::FAND;
17601 }
17602
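// I.e. SETUEQ is computed as (unord OR eq) and SETONE as (ord AND neq):
// two packed compares whose results are merged with FOR/FAND below.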
17603 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17604 DAG.getConstant(CC0, dl, MVT::i8));
17605 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17606 DAG.getConstant(CC1, dl, MVT::i8));
17607 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
17608 } else {
17609 // Handle all other FP comparisons here.
17610 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
17611 DAG.getConstant(SSECC, dl, MVT::i8));
17612 }
17613
17614 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
17615 // result type of SETCC. The bitcast is expected to be optimized away
17616 // during combining/isel.
17617 if (Opc == X86ISD::CMPP)
17618 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
17619
17620 return Cmp;
17621 }
17622
17623 MVT VTOp0 = Op0.getSimpleValueType();
17624 assert(VTOp0 == Op1.getSimpleValueType() &&
17625        "Expected operands with same type!");
17626 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
17627        "Invalid number of packed elements for source and destination!");
17628
17629 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
17630 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
17631 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
17632 // legalizer first checks whether the first input operand of the setcc has
17633 // a legal type. If so, then it promotes the return type to that same type.
17634 // Otherwise, the return type is promoted to the 'next legal type' which,
17635 // for a vector of MVT::i1 is always a 128-bit integer vector type.
17636 //
17637 // We reach this code only if the following two conditions are met:
17638 // 1. Both return type and operand type have been promoted to wider types
17639 // by the type legalizer.
17640 // 2. The original operand type has been promoted to a 256-bit vector.
17641 //
17642 // Note that condition 2. only applies for AVX targets.
17643 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
17644 return DAG.getZExtOrTrunc(NewOp, dl, VT);
17645 }
17646
17647 // The non-AVX512 code below works under the assumption that source and
17648 // destination types are the same.
17649 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
17650        "Value types for source and destination must be the same!");
17651
17652 // Break 256-bit integer vector compare into smaller ones.
17653 if (VT.is256BitVector() && !Subtarget.hasInt256())
17654 return Lower256IntVSETCC(Op, DAG);
17655
17656 // Operands are boolean (vectors of i1)
17657 MVT OpVT = Op1.getSimpleValueType();
17658 if (OpVT.getVectorElementType() == MVT::i1)
17659 return LowerBoolVSETCC_AVX512(Op, DAG);
17660
17661 // The result is boolean, but operands are int/float
17662 if (VT.getVectorElementType() == MVT::i1) {
17663 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
17664 // but there is no compare instruction for i8 and i16 elements in KNL.
17665 // In that case, use an SSE compare instead.
17666 bool UseAVX512Inst =
17667 (OpVT.is512BitVector() ||
17668 OpVT.getScalarSizeInBits() >= 32 ||
17669 (Subtarget.hasBWI() && Subtarget.hasVLX()));
17670
17671 if (UseAVX512Inst)
17672 return LowerIntVSETCC_AVX512(Op, DAG);
17673
17674 return DAG.getNode(ISD::TRUNCATE, dl, VT,
17675 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
17676 }
17677
17678 // Lower using XOP integer comparisons.
17679 if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
17680 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
17681 // Translate compare code to XOP PCOM compare mode.
17682 unsigned CmpMode = 0;
17683 switch (Cond) {
17684 default: llvm_unreachable("Unexpected SETCC condition");
17685 case ISD::SETULT:
17686 case ISD::SETLT: CmpMode = 0x00; break;
17687 case ISD::SETULE:
17688 case ISD::SETLE: CmpMode = 0x01; break;
17689 case ISD::SETUGT:
17690 case ISD::SETGT: CmpMode = 0x02; break;
17691 case ISD::SETUGE:
17692 case ISD::SETGE: CmpMode = 0x03; break;
17693 case ISD::SETEQ: CmpMode = 0x04; break;
17694 case ISD::SETNE: CmpMode = 0x05; break;
17695 }
17696
17697 // Are we comparing unsigned or signed integers?
17698 unsigned Opc =
17699 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
17700
17701 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17702 DAG.getConstant(CmpMode, dl, MVT::i8));
17703 }
17704
17705 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
17706 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
17707 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
17708 SDValue BC0 = peekThroughBitcasts(Op0);
17709 if (BC0.getOpcode() == ISD::AND) {
17710 APInt UndefElts;
17711 SmallVector<APInt, 64> EltBits;
17712 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
17713 VT.getScalarSizeInBits(), UndefElts,
17714 EltBits, false, false)) {
17715 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
17716 Cond = ISD::SETEQ;
17717 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
17718 }
17719 }
17720 }
17721 }
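// Worked example of the rewrite above: for a power-of-two mask such as 8,
// "(X & 8) != 0" holds exactly when "(X & 8) == 8", so the condition can be
// flipped to SETEQ against the mask itself and the final invert is avoided.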
17722
17723 // We are handling one of the integer comparisons here. Since SSE only has
17724 // GT and EQ comparisons for integer, swapping operands and multiple
17725 // operations may be required for some comparisons.
17726 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
17727 : X86ISD::PCMPGT;
17728 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
17729 Cond == ISD::SETGE || Cond == ISD::SETUGE;
17730 bool Invert = Cond == ISD::SETNE ||
17731 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
17732
17733 // If both operands are known non-negative, then an unsigned compare is the
17734 // same as a signed compare and there's no need to flip signbits.
17735 // TODO: We could check for more general simplifications here since we're
17736 // computing known bits.
17737 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
17738 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
17739
17740 // Special case: Use min/max operations for SETULE/SETUGE
17741 MVT VET = VT.getVectorElementType();
17742 bool HasMinMax =
17743 (Subtarget.hasAVX512() && VET == MVT::i64) ||
17744 (Subtarget.hasSSE41() && (VET == MVT::i16 || VET == MVT::i32)) ||
17745 (Subtarget.hasSSE2() && (VET == MVT::i8));
17746 bool MinMax = false;
17747 if (HasMinMax) {
17748 switch (Cond) {
17749 default: break;
17750 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
17751 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
17752 }
17753
17754 if (MinMax)
17755 Swap = Invert = FlipSigns = false;
17756 }
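// With the min/max form, "a u<= b" is checked as "umin(a, b) == a" and
// "a u>= b" as "umax(a, b) == a"; the PCMPEQ against Op0 is emitted near the
// end of this function once the min/max result has been computed.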
17757
17758 bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
17759 bool Subus = false;
17760 if (!MinMax && HasSubus) {
17761 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
17762 // Op0 u<= Op1:
17763 // t = psubus Op0, Op1
17764 // pcmpeq t, <0..0>
17765 switch (Cond) {
17766 default: break;
17767 case ISD::SETULT: {
17768 // If the comparison is against a constant we can turn this into a
17769 // setule. With psubus, setule does not require a swap. This is
17770 // beneficial because the constant in the register is no longer
17771 // clobbered as the destination, so it can be hoisted out of a loop.
17772 // Only do this pre-AVX since vpcmp* is no longer destructive.
17773 if (Subtarget.hasAVX())
17774 break;
17775 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
17776 Op1 = ULEOp1;
17777 Subus = true; Invert = false; Swap = false;
17778 }
17779 break;
17780 }
17781 // Psubus is better than flip-sign because it requires no inversion.
17782 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
17783 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
17784 }
17785
17786 if (Subus) {
17787 Opc = X86ISD::SUBUS;
17788 FlipSigns = false;
17789 }
17790 }
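// The PSUBUS trick relies on unsigned saturating subtraction: "a -us b"
// saturates to zero exactly when a u<= b, so comparing the PSUBUS result
// against an all-zeros vector (done at the end of this function) produces
// the desired mask.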
17791
17792 if (Swap)
17793 std::swap(Op0, Op1);
17794
17795 // Check that the operation in question is available (most are plain SSE2,
17796 // but PCMPGTQ and PCMPEQQ have different requirements).
17797 if (VT == MVT::v2i64) {
17798 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
17799 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
17800
17801 // First cast everything to the right type.
17802 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17803 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17804
17805 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17806 // bits of the inputs before performing those operations. The lower
17807 // compare is always unsigned.
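// PCMPGTD is a signed compare, so XOR-ing the sign bit into a lane makes the
// signed compare act as an unsigned one for that lane.  The low 32-bit halves
// are always flipped (they must be compared unsigned), while the high halves
// are flipped only when FlipSigns is set, i.e. for an unsigned 64-bit
// predicate whose sign bits are not already known to be zero.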
17808 SDValue SB;
17809 if (FlipSigns) {
17810 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
17811 } else {
17812 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
17813 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
17814 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
17815 }
17816 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
17817 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
17818
17819 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
17820 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
17821 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
17822
17823 // Create masks for only the low parts/high parts of the 64 bit integers.
17824 static const int MaskHi[] = { 1, 1, 3, 3 };
17825 static const int MaskLo[] = { 0, 0, 2, 2 };
17826 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
17827 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
17828 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
17829
17830 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
17831 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
17832
17833 if (Invert)
17834 Result = DAG.getNOT(dl, Result, MVT::v4i32);
17835
17836 return DAG.getBitcast(VT, Result);
17837 }
17838
17839 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
17840 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
17841 // pcmpeqd + pshufd + pand.
17842 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
17843
17844 // First cast everything to the right type.
17845 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17846 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17847
17848 // Do the compare.
17849 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
17850
17851 // Make sure the lower and upper halves are both all-ones.
17852 static const int Mask[] = { 1, 0, 3, 2 };
17853 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
17854 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
17855
17856 if (Invert)
17857 Result = DAG.getNOT(dl, Result, MVT::v4i32);
17858
17859 return DAG.getBitcast(VT, Result);
17860 }
17861 }
17862
17863 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17864 // bits of the inputs before performing those operations.
17865 if (FlipSigns) {
17866 MVT EltVT = VT.getVectorElementType();
17867 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
17868 VT);
17869 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
17870 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
17871 }
17872
17873 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
17874
17875 // If the logical-not of the result is required, perform that now.
17876 if (Invert)
17877 Result = DAG.getNOT(dl, Result, VT);
17878
17879 if (MinMax)
17880 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
17881
17882 if (Subus)
17883 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
17884 getZeroVector(VT, Subtarget, DAG, dl));
17885
17886 return Result;
17887}
17888
17889SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
17890
17891 MVT VT = Op.getSimpleValueType();
17892
17893 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
17894
17895 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
17896 SDValue Op0 = Op.getOperand(0);
17897 SDValue Op1 = Op.getOperand(1);
17898 SDLoc dl(Op);
17899 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
17900
17901 // Optimize to BT if possible.
17902 // Lower (X & (1 << N)) == 0 to BT(X, N).
17903 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
17904 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
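// BT copies the selected bit into the carry flag, so each of these patterns
// collapses to a single bit-test followed by a flag-based SETCC.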
17905 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
17906 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17907 if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG))
17908 return NewSetCC;
17909 }
17910
17911 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
17912 // these.
17913 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
17914 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17915
17916 // If the input is a setcc, then reuse the input setcc or use a new one with
17917 // the inverted condition.
17918 if (Op0.getOpcode() == X86ISD::SETCC) {
17919 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
17920 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
17921 if (!Invert)
17922 return Op0;
17923
17924 CCode = X86::GetOppositeBranchCondition(CCode);
17925 return getSETCC(CCode, Op0.getOperand(1), dl, DAG);
17926 }
17927 }
17928
17929 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
17930 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
17931 if (X86CC == X86::COND_INVALID)
17932 return SDValue();
17933
17934 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
17935 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
17936 return getSETCC(X86CC, EFLAGS, dl, DAG);
17937}
17938
17939SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
17940 SDValue LHS = Op.getOperand(0);
17941 SDValue RHS = Op.getOperand(1);
17942 SDValue Carry = Op.getOperand(2);
17943 SDValue Cond = Op.getOperand(3);
17944 SDLoc DL(Op);
17945
17946 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
17947 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
17948
17949 // Recreate the carry if needed.
17950 EVT CarryVT = Carry.getValueType();
17951 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
17952 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
17953 Carry, DAG.getConstant(NegOne, DL, CarryVT));
17954
17955 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17956 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
17957 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
17958}
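// Note on the carry rematerialization above: adding all-ones to the incoming
// carry value produces a hardware carry-out exactly when that value is
// non-zero, so the flags result of the X86ISD::ADD feeds the SBB with the
// same carry that the SETCCCARRY node described.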
17959
17960/// Return true if opcode is a X86 logical comparison.
17961static bool isX86LogicalCmp(SDValue Op) {
17962 unsigned Opc = Op.getOpcode();
17963 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
17964 Opc == X86ISD::SAHF)
17965 return true;
17966 if (Op.getResNo() == 1 &&
17967 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
17968 Opc == X86ISD::SBB || Opc == X86ISD::SMUL ||
17969 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
17970 Opc == X86ISD::XOR || Opc == X86ISD::AND))
17971 return true;
17972
17973 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
17974 return true;
17975
17976 return false;
17977}
17978
17979static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
17980 if (V.getOpcode() != ISD::TRUNCATE)
17981 return false;
17982
17983 SDValue VOp0 = V.getOperand(0);
17984 unsigned InBits = VOp0.getValueSizeInBits();
17985 unsigned Bits = V.getValueSizeInBits();
17986 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
17987}
17988
17989SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
17990 bool AddTest = true;
17991 SDValue Cond = Op.getOperand(0);
17992 SDValue Op1 = Op.getOperand(1);
17993 SDValue Op2 = Op.getOperand(2);
17994 SDLoc DL(Op);
17995 MVT VT = Op1.getSimpleValueType();
17996 SDValue CC;
17997
17998 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
17999 // are available or VBLENDV if AVX is available.
18000 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
18001 if (Cond.getOpcode() == ISD::SETCC &&
18002 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
18003 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
18004 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
18005 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
18006 unsigned SSECC = translateX86FSETCC(
18007 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
18008
18009 if (Subtarget.hasAVX512()) {
18010 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
18011 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
18012 assert(!VT.isVector() && "Not a scalar type?");
18013 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
18014 }
18015
18016 if (SSECC < 8 || Subtarget.hasAVX()) {
18017 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
18018 DAG.getConstant(SSECC, DL, MVT::i8));
18019
18020 // If we have AVX, we can use a variable vector select (VBLENDV) instead
18021 // of 3 logic instructions for size savings and potentially speed.
18022 // Unfortunately, there is no scalar form of VBLENDV.
18023
18024 // If either operand is a constant, don't try this. We can expect to
18025 // optimize away at least one of the logic instructions later in that
18026 // case, so that sequence would be faster than a variable blend.
18027
18028 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
18029 // uses XMM0 as the selection register. That may need just as many
18030 // instructions as the AND/ANDN/OR sequence due to register moves, so
18031 // don't bother.
18032
18033 if (Subtarget.hasAVX() &&
18034 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
18035
18036 // Convert to vectors, do a VSELECT, and convert back to scalar.
18037 // All of the conversions should be optimized away.
18038
18039 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
18040 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
18041 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
18042 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
18043
18044 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
18045 VCmp = DAG.getBitcast(VCmpVT, VCmp);
18046
18047 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
18048
18049 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
18050 VSel, DAG.getIntPtrConstant(0, DL));
18051 }
18052 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
18053 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
18054 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
18055 }
18056 }
18057
18058 // AVX512 fallback is to lower selects of scalar floats to masked moves.
18059 if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
18060 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
18061 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
18062 }
18063
18064 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
18065 SDValue Op1Scalar;
18066 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
18067 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
18068 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
18069 Op1Scalar = Op1.getOperand(0);
18070 SDValue Op2Scalar;
18071 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
18072 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
18073 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
18074 Op2Scalar = Op2.getOperand(0);
18075 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
18076 SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
18077 Op1Scalar, Op2Scalar);
18078 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
18079 return DAG.getBitcast(VT, newSelect);
18080 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
18081 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
18082 DAG.getIntPtrConstant(0, DL));
18083 }
18084 }
18085
18086 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
18087 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
18088 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18089 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
18090 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18091 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
18092 SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
18093 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
18094 }
18095
18096 if (Cond.getOpcode() == ISD::SETCC) {
18097 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
18098 Cond = NewCond;
18099 // If the condition was updated, it's possible that the operands of the
18100 // select were also updated (for example, EmitTest has a RAUW). Refresh
18101 // the local references to the select operands in case they got stale.
18102 Op1 = Op.getOperand(1);
18103 Op2 = Op.getOperand(2);
18104 }
18105 }
18106
18107 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
18108 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
18109 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
18110 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
18111 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
18112 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
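// The -1/y patterns are realized via flags: comparing x against 1 (i.e.
// computing x - 1) borrows exactly when x == 0, and SETCC_CARRY turns the
// carry flag into an all-zeros/all-ones mask that is optionally inverted and
// OR'ed with y.  The pure -1/0 forms below use NEG instead, which sets the
// carry flag exactly when x is non-zero.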
18113 if (Cond.getOpcode() == X86ISD::SETCC &&
18114 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
18115 isNullConstant(Cond.getOperand(1).getOperand(1))) {
18116 SDValue Cmp = Cond.getOperand(1);
18117 unsigned CondCode =
18118 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
18119
18120 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
18121 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
18122 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
18123 SDValue CmpOp0 = Cmp.getOperand(0);
18124
18125 // Apply further optimizations for special cases
18126 // (select (x != 0), -1, 0) -> neg & sbb
18127 // (select (x == 0), 0, -1) -> neg & sbb
18128 if (isNullConstant(Y) &&
18129 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
18130 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
18131 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
18132 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
18133 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18134 DAG.getConstant(X86::COND_B, DL, MVT::i8),
18135 SDValue(Neg.getNode(), 1));
18136 return Res;
18137 }
18138
18139 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
18140 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
18141 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18142
18143 SDValue Res = // Res = 0 or -1.
18144 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18145 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
18146
18147 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
18148 Res = DAG.getNOT(DL, Res, Res.getValueType());
18149
18150 if (!isNullConstant(Op2))
18151 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
18152 return Res;
18153 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
18154 Cmp.getOperand(0).getOpcode() == ISD::AND &&
18155 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
18156 SDValue CmpOp0 = Cmp.getOperand(0);
18157 SDValue Src1, Src2;
18158 // Returns true if Op2 is an XOR or OR operation and one of its
18159 // operands is equal to Op1, i.e. we have the pattern
18160 // (a, a op b) or (b, a op b).
18161 auto isOrXorPattern = [&]() {
18162 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
18163 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
18164 Src1 =
18165 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
18166 Src2 = Op1;
18167 return true;
18168 }
18169 return false;
18170 };
18171
18172 if (isOrXorPattern()) {
18173 SDValue Neg;
18174 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
18175 // We need a mask of all zeros or all ones with the same size as the
18176 // other operands.
18177 if (CmpSz > VT.getSizeInBits())
18178 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
18179 else if (CmpSz < VT.getSizeInBits())
18180 Neg = DAG.getNode(ISD::AND, DL, VT,
18181 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
18182 DAG.getConstant(1, DL, VT));
18183 else
18184 Neg = CmpOp0;
18185 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
18186 Neg); // -(and (x, 0x1))
18187 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
18188 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
18189 }
18190 }
18191 }
18192
18193 // Look past (and (setcc_carry (cmp ...)), 1).
18194 if (Cond.getOpcode() == ISD::AND &&
18195 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18196 isOneConstant(Cond.getOperand(1)))
18197 Cond = Cond.getOperand(0);
18198
18199 // If condition flag is set by a X86ISD::CMP, then use it as the condition
18200 // setting operand in place of the X86ISD::SETCC.
18201 unsigned CondOpcode = Cond.getOpcode();
18202 if (CondOpcode == X86ISD::SETCC ||
18203 CondOpcode == X86ISD::SETCC_CARRY) {
18204 CC = Cond.getOperand(0);
18205
18206 SDValue Cmp = Cond.getOperand(1);
18207 unsigned Opc = Cmp.getOpcode();
18208 MVT VT = Op.getSimpleValueType();
18209
18210 bool IllegalFPCMov = false;
18211 if (VT.isFloatingPoint() && !VT.isVector() &&
18212 !isScalarFPTypeInSSEReg(VT)) // FPStack?
18213 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
18214
18215 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
18216 Opc == X86ISD::BT) { // FIXME
18217 Cond = Cmp;
18218 AddTest = false;
18219 }
18220 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18221 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18222 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18223 Cond.getOperand(0).getValueType() != MVT::i8)) {
18224 SDValue LHS = Cond.getOperand(0);
18225 SDValue RHS = Cond.getOperand(1);
18226 unsigned X86Opcode;
18227 unsigned X86Cond;
18228 SDVTList VTs;
18229 switch (CondOpcode) {
18230 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18231 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18232 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18233 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18234 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18235 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18236 default: llvm_unreachable("unexpected overflowing operator");
18237 }
18238 if (CondOpcode == ISD::UMULO)
18239 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
18240 MVT::i32);
18241 else
18242 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18243
18244 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
18245
18246 if (CondOpcode == ISD::UMULO)
18247 Cond = X86Op.getValue(2);
18248 else
18249 Cond = X86Op.getValue(1);
18250
18251 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
18252 AddTest = false;
18253 }
18254
18255 if (AddTest) {
18256 // Look past the truncate if the high bits are known zero.
18257 if (isTruncWithZeroHighBitsInput(Cond, DAG))
18258 Cond = Cond.getOperand(0);
18259
18260 // We know the result of AND is compared against zero. Try to match
18261 // it to BT.
18262 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
18263 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
18264 CC = NewSetCC.getOperand(0);
18265 Cond = NewSetCC.getOperand(1);
18266 AddTest = false;
18267 }
18268 }
18269 }
18270
18271 if (AddTest) {
18272 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
18273 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
18274 }
18275
18276 // a < b ? -1 : 0 -> RES = ~setcc_carry
18277 // a < b ? 0 : -1 -> RES = setcc_carry
18278 // a >= b ? -1 : 0 -> RES = setcc_carry
18279 // a >= b ? 0 : -1 -> RES = ~setcc_carry
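// These hold because X86ISD::SUB leaves the carry flag set exactly when the
// unsigned subtraction borrows, and SETCC_CARRY (materialized as an SBB of a
// register with itself) smears the carry flag across every bit, giving 0 or
// -1 directly.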
18280 if (Cond.getOpcode() == X86ISD::SUB) {
18281 Cond = ConvertCmpIfNecessary(Cond, DAG);
18282 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
18283
18284 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
18285 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
18286 (isNullConstant(Op1) || isNullConstant(Op2))) {
18287 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18288 DAG.getConstant(X86::COND_B, DL, MVT::i8),
18289 Cond);
18290 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
18291 return DAG.getNOT(DL, Res, Res.getValueType());
18292 return Res;
18293 }
18294 }
18295
18296 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
18297 // widen the cmov and push the truncate through. This avoids introducing a new
18298 // branch during isel and doesn't add any extensions.
18299 if (Op.getValueType() == MVT::i8 &&
18300 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
18301 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
18302 if (T1.getValueType() == T2.getValueType() &&
18303 // Blacklist CopyFromReg to avoid partial register stalls.
18304 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
18305 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
18306 CC, Cond);
18307 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
18308 }
18309 }
18310
18311 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
18312 // condition is true.
18313 SDValue Ops[] = { Op2, Op1, CC, Cond };
18314 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
18315}
18316
18317static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
18318 const X86Subtarget &Subtarget,
18319 SelectionDAG &DAG) {
18320 MVT VT = Op->getSimpleValueType(0);
18321 SDValue In = Op->getOperand(0);
18322 MVT InVT = In.getSimpleValueType();
18323 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
18324 MVT VTElt = VT.getVectorElementType();
18325 SDLoc dl(Op);
18326
18327 unsigned NumElts = VT.getVectorNumElements();
18328
18329 // Extend VT if the scalar type is v8/v16 and BWI is not supported.
18330 MVT ExtVT = VT;
18331 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)
18332 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
18333
18334 // Widen to 512-bits if VLX is not supported.
18335 MVT WideVT = ExtVT;
18336 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
18337 NumElts *= 512 / ExtVT.getSizeInBits();
18338 InVT = MVT::getVectorVT(MVT::i1, NumElts);
18339 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
18340 In, DAG.getIntPtrConstant(0, dl));
18341 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
18342 }
18343
18344 SDValue V;
18345 MVT WideEltVT = WideVT.getVectorElementType();
18346 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
18347 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
18348 V = getExtendInVec(X86ISD::VSEXT, dl, WideVT, In, DAG);
18349 } else {
18350 SDValue NegOne = getOnesVector(WideVT, DAG, dl);
18351 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
18352 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
18353 }
18354
18355 // Truncate if we had to extend i16/i8 above.
18356 if (VT != ExtVT) {
18357 WideVT = MVT::getVectorVT(VTElt, NumElts);
18358 V = DAG.getNode(X86ISD::VTRUNC, dl, WideVT, V);
18359 }
18360
18361 // Extract back to 128/256-bit if we widened.
18362 if (WideVT != VT)
18363 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
18364 DAG.getIntPtrConstant(0, dl));
18365
18366 return V;
18367}
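// Example of the widening path above: without BWI and VLX, sign-extending
// v8i1 to v8i16 widens the mask to v16i1, extends it to v16i32 (VSEXT with
// DQI, otherwise a select between all-ones and zero), truncates to v16i16,
// and finally extracts the low v8i16 subvector.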
18368
18369// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
18370// For sign extend this needs to handle all vector sizes and SSE4.1 and
18371// non-SSE4.1 targets. For zero extend this should only handle inputs of
18372// MVT::v64i8 when BWI is not supported, but AVX512 is.
18373static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
18374 const X86Subtarget &Subtarget,
18375 SelectionDAG &DAG) {
18376 SDValue In = Op->getOperand(0);
18377 MVT VT = Op->getSimpleValueType(0);
18378 MVT InVT = In.getSimpleValueType();
18379 assert(VT.getSizeInBits() == InVT.getSizeInBits());
18380
18381 MVT SVT = VT.getVectorElementType();
18382 MVT InSVT = InVT.getVectorElementType();
18383 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
18384
18385 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
18386 return SDValue();
18387 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
18388 return SDValue();
18389 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
18390 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
18391 !(VT.is512BitVector() && Subtarget.hasAVX512()))
18392 return SDValue();
18393
18394 SDLoc dl(Op);
18395
18396 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
18397 // For 512-bit vectors, we need 128-bits or 256-bits.
18398 if (VT.getSizeInBits() > 128) {
18399 // Input needs to be at least the same number of elements as output, and
18400 // at least 128-bits.
18401 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
18402 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
18403 }
18404
18405 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
18406         InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
18407
18408 // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
18409 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
18410 // need to be handled here for 256/512-bit results.
18411 if (Subtarget.hasInt256()) {
18412 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
18413 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
18414 X86ISD::VSEXT : X86ISD::VZEXT;
18415 return DAG.getNode(ExtOpc, dl, VT, In);
18416 }
18417
18418 // We should only get here for sign extend.
18419 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
18420        "Unexpected opcode!");
18421
18422 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
18423 SDValue Curr = In;
18424 MVT CurrVT = InVT;
18425
18426 // As SRAI is only available on i16/i32 types, we expand only up to i32
18427 // and handle i64 separately.
18428 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
18429 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
18430 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
18431 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
18432 Curr = DAG.getBitcast(CurrVT, Curr);
18433 }
18434
18435 SDValue SignExt = Curr;
18436 if (CurrVT != InVT) {
18437 unsigned SignExtShift =
18438 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
18439 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
18440 DAG.getConstant(SignExtShift, dl, MVT::i8));
18441 }
18442
18443 if (CurrVT == VT)
18444 return SignExt;
18445
18446 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
18447 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
18448 DAG.getConstant(31, dl, MVT::i8));
18449 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
18450 return DAG.getBitcast(VT, Ext);
18451 }
18452
18453 return SDValue();
18454}
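// Pre-SSE4.1 example of the loop above: for v16i8 -> v8i16, UNPCKL with an
// undef vector places each source byte in the high half of a 16-bit lane, and
// the VSRAI by 8 then shifts it back down arithmetically, completing the sign
// extension without pmovsx.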
18455
18456static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
18457 SelectionDAG &DAG) {
18458 MVT VT = Op->getSimpleValueType(0);
18459 SDValue In = Op->getOperand(0);
18460 MVT InVT = In.getSimpleValueType();
18461 SDLoc dl(Op);
18462
18463 if (InVT.getVectorElementType() == MVT::i1)
18464 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
18465
18466 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
18467 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
18468 (VT != MVT::v16i16 || InVT != MVT::v16i8) &&
18469 (VT != MVT::v8i64 || InVT != MVT::v8i32) &&
18470 (VT != MVT::v8i64 || InVT != MVT::v8i16) &&
18471 (VT != MVT::v16i32 || InVT != MVT::v16i16) &&
18472 (VT != MVT::v16i32 || InVT != MVT::v16i8) &&
18473 (VT != MVT::v32i16 || InVT != MVT::v32i8))
18474 return SDValue();
18475
18476 if (Subtarget.hasInt256())
18477 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
18478
18479 // Optimize vectors in AVX mode:
18480 // Sign extend v8i16 to v8i32 and
18481 // v4i32 to v4i64.
18482 //
18483 // Divide the input vector into two parts;
18484 // for v4i32 the shuffle masks will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 },
18485 // then use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
18486 // and concat the vectors back to the original VT.
18487
18488 unsigned NumElems = InVT.getVectorNumElements();
18489 SDValue Undef = DAG.getUNDEF(InVT);
18490
18491 SmallVector<int,8> ShufMask1(NumElems, -1);
18492 for (unsigned i = 0; i != NumElems/2; ++i)
18493 ShufMask1[i] = i;
18494
18495 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
18496
18497 SmallVector<int,8> ShufMask2(NumElems, -1);
18498 for (unsigned i = 0; i != NumElems/2; ++i)
18499 ShufMask2[i] = i + NumElems/2;
18500
18501 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
18502
18503 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
18504 VT.getVectorNumElements() / 2);
18505
18506 OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
18507 OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
18508
18509 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
18510}
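// Example of the AVX1 split above: sign-extending v8i16 to v8i32 shuffles the
// input into its low and high halves, sign-extends each half in-register to
// v4i32, and concatenates the two results back into a v8i32.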
18511
18512 // Lower a truncating store. We need special lowering for vXi1 vectors.
18513static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
18514 SelectionDAG &DAG) {
18515 StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
18516 SDLoc dl(St);
18517 EVT MemVT = St->getMemoryVT();
18518 assert(St->isTruncatingStore() && "We only custom truncating store.");
18519 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
18520        "Expected truncstore of i1 vector");
18521
18522 SDValue Op = St->getValue();
18523 MVT OpVT = Op.getValueType().getSimpleVT();
18524 unsigned NumElts = OpVT.getVectorNumElements();
18525 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
18526 NumElts == 16) {
18527 // Truncate and store - everything is legal
18528 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
18529 if (MemVT.getSizeInBits() < 8)
18530 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
18531 DAG.getUNDEF(MVT::v8i1), Op,
18532 DAG.getIntPtrConstant(0, dl));
18533 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18534 St->getMemOperand());
18535 }
18536
18537 // A subset, assume that we have only AVX-512F
18538 if (NumElts <= 8) {
18539 if (NumElts < 8) {
18540 // Extend to 8-elts vector
18541 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
18542 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
18543 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
18544 }
18545 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
18546 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18547 St->getMemOperand());
18548 }
18549 // v32i8
18550 assert(OpVT == MVT::v32i8 && "Unexpected operand type");
18551 // Divide the vector into 2 parts and store each part separately
18552 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18553 DAG.getIntPtrConstant(0, dl));
18554 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
18555 SDValue BasePtr = St->getBasePtr();
18556 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
18557 St->getMemOperand());
18558 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18559 DAG.getIntPtrConstant(16, dl));
18560 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
18561
18562 SDValue BasePtrHi =
18563 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18564 DAG.getConstant(2, dl, BasePtr.getValueType()));
18565
18566 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
18567 BasePtrHi, St->getMemOperand());
18568 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
18569}
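// Example of the split above: a truncating store of v32i8 to v32i1 without
// the VLX/BWI/DQI combination becomes two v16i1 mask stores, the second one
// at BasePtr + 2 since each 16-bit mask occupies two bytes in memory.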
18570
18571static SDValue LowerExtended1BitVectorLoad(SDValue Op,
18572 const X86Subtarget &Subtarget,
18573 SelectionDAG &DAG) {
18574
18575 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18576 SDLoc dl(Ld);
18577 EVT MemVT = Ld->getMemoryVT();
18578 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
18579        "Expected i1 vector load");
18580 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
18581 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
18582 MVT VT = Op.getValueType().getSimpleVT();
18583 unsigned NumElts = VT.getVectorNumElements();
18584
18585 if ((Subtarget.hasBWI() && NumElts >= 32) ||
18586 (Subtarget.hasDQI() && NumElts < 16) ||
18587 NumElts == 16) {
18588 // Load and extend - everything is legal
18589 if (NumElts < 8) {
18590 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
18591 Ld->getBasePtr(),
18592 Ld->getMemOperand());
18593 // Replace chain users with the new chain.
18594 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18595 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18596 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18597 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
18598
18599 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18600 DAG.getIntPtrConstant(0, dl));
18601 }
18602 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
18603 Ld->getBasePtr(),
18604 Ld->getMemOperand());
18605 // Replace chain users with the new chain.
18606 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18607 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18608
18609 // Finally, do a normal sign-extend to the desired register.
18610 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
18611 }
18612
18613 if (NumElts <= 8) {
18614 // A subset, assume that we have only AVX-512F
18615 unsigned NumBitsToLoad = 8;
18616 MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
18617 SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
18618 Ld->getBasePtr(),
18619 Ld->getMemOperand());
18620 // Replace chain users with the new chain.
18621 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18622 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18623
18624 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
18625 SDValue BitVec = DAG.getBitcast(MaskVT, Load);
18626
18627 if (NumElts == 8)
18628 return DAG.getNode(ExtOpcode, dl, VT, BitVec);
18629
18630 // We still need to take care of v4i1 and v2i1: extend to 8 elements and extract.
18631
18632 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18633 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
18634 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18635 DAG.getIntPtrConstant(0, dl));
18636 }
18637
18638 assert(VT == MVT::v32i8 && "Unexpected extload type");
18639
18640 SmallVector<SDValue, 2> Chains;
18641
18642 SDValue BasePtr = Ld->getBasePtr();
18643 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18644 Ld->getBasePtr(),
18645 Ld->getMemOperand());
18646 Chains.push_back(LoadLo.getValue(1));
18647
18648 SDValue BasePtrHi =
18649 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18650 DAG.getConstant(2, dl, BasePtr.getValueType()));
18651
18652 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18653 BasePtrHi,
18654 Ld->getMemOperand());
18655 Chains.push_back(LoadHi.getValue(1));
18656 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18657 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
18658
18659 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
18660 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
18661 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
18662}
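// Example of the v32i8 path above: the v32i1 memory value is loaded as two
// v16i1 masks (the second from BasePtr + 2), each mask is extended to v16i8,
// and the two halves are concatenated into the final v32i8 result.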
18663
18664// Lower vector extended loads using a shuffle. If SSSE3 is not available we
18665// may emit an illegal shuffle but the expansion is still better than scalar
18666// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
18667 // we'll emit a shuffle and an arithmetic shift.
18668// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
18669// TODO: It is possible to support ZExt by zeroing the undef values during
18670// the shuffle phase or after the shuffle.
18671static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
18672 SelectionDAG &DAG) {
18673 MVT RegVT = Op.getSimpleValueType();
18674 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
18675 assert(RegVT.isInteger() &&
18676        "We only custom lower integer vector sext loads.");
18677
18678 // Nothing useful we can do without SSE2 shuffles.
18679 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
18680
18681 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18682 SDLoc dl(Ld);
18683 EVT MemVT = Ld->getMemoryVT();
18684 if (MemVT.getScalarType() == MVT::i1)
18685 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
18686
18687 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18688 unsigned RegSz = RegVT.getSizeInBits();
18689
18690 ISD::LoadExtType Ext = Ld->getExtensionType();
18691
18692  assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
18693         && "Only anyext and sext are currently implemented.");
18694  assert(MemVT != RegVT && "Cannot extend to the same type");
18695  assert(MemVT.isVector() && "Must load a vector from memory");
18696
18697 unsigned NumElems = RegVT.getVectorNumElements();
18698 unsigned MemSz = MemVT.getSizeInBits();
18699  assert(RegSz > MemSz && "Register size must be greater than the mem size");
18700
18701 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
18702 // The only way in which we have a legal 256-bit vector result but not the
18703 // integer 256-bit operations needed to directly lower a sextload is if we
18704 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
18705 // a 128-bit vector and a normal sign_extend to 256-bits that should get
18706 // correctly legalized. We do this late to allow the canonical form of
18707 // sextload to persist throughout the rest of the DAG combiner -- it wants
18708 // to fold together any extensions it can, and so will fuse a sign_extend
18709 // of an sextload into a sextload targeting a wider value.
18710 SDValue Load;
18711 if (MemSz == 128) {
18712 // Just switch this to a normal load.
18713      assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
18714                                       "it must be a legal 128-bit vector "
18715                                       "type!");
18716 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
18717 Ld->getPointerInfo(), Ld->getAlignment(),
18718 Ld->getMemOperand()->getFlags());
18719 } else {
18720      assert(MemSz < 128 &&
18721             "Can't extend a type wider than 128 bits to a 256 bit vector!");
18722 // Do an sext load to a 128-bit vector type. We want to use the same
18723 // number of elements, but elements half as wide. This will end up being
18724 // recursively lowered by this routine, but will succeed as we definitely
18725 // have all the necessary features if we're using AVX1.
18726 EVT HalfEltVT =
18727 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
18728 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
18729 Load =
18730 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
18731 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
18732 Ld->getMemOperand()->getFlags());
18733 }
18734
18735 // Replace chain users with the new chain.
18736    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18737 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18738
18739 // Finally, do a normal sign-extend to the desired register.
18740 return DAG.getSExtOrTrunc(Load, dl, RegVT);
18741 }
18742
18743 // All sizes must be a power of two.
18744  assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18745         "Non-power-of-two elements are not custom lowered!");
18746
18747 // Attempt to load the original value using scalar loads.
18748 // Find the largest scalar type that divides the total loaded size.
18749 MVT SclrLoadTy = MVT::i8;
18750 for (MVT Tp : MVT::integer_valuetypes()) {
18751 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18752 SclrLoadTy = Tp;
18753 }
18754 }
18755
18756  // On 32-bit systems we can't use 64-bit integers; try bitcasting to f64.
18757 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18758 (64 <= MemSz))
18759 SclrLoadTy = MVT::f64;
18760
18761 // Calculate the number of scalar loads that we need to perform
18762 // in order to load our vector from memory.
18763 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
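  // Worked example (annotation): for MemVT = v4i8, MemSz is 32, so SclrLoadTy
  // becomes i32 and NumLoads is 1; for MemVT = v8i8 on a 64-bit target,
  // SclrLoadTy is i64 and NumLoads is again 1.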
18764
18765  assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
18766         "Can only lower sext loads with a single scalar load!");
18767
18768 unsigned loadRegZize = RegSz;
18769 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
18770 loadRegZize = 128;
18771
18772 // If we don't have BWI we won't be able to create the shuffle needed for
18773 // v8i8->v8i64.
18774 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
18775 MemVT == MVT::v8i8)
18776 loadRegZize = 128;
18777
18778 // Represent our vector as a sequence of elements which are the
18779 // largest scalar that we can load.
18780 EVT LoadUnitVecVT = EVT::getVectorVT(
18781 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
18782
18783 // Represent the data using the same element type that is stored in
18784  // memory. In practice, we "widen" MemVT.
18785 EVT WideVecVT =
18786 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18787 loadRegZize / MemVT.getScalarSizeInBits());
18788
18789  assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18790         "Invalid vector type");
18791
18792 // We can't shuffle using an illegal type.
18793  assert(TLI.isTypeLegal(WideVecVT) &&
18794         "We only lower types that form legal widened vector types");
18795
18796 SmallVector<SDValue, 8> Chains;
18797 SDValue Ptr = Ld->getBasePtr();
18798 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
18799 TLI.getPointerTy(DAG.getDataLayout()));
18800 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
18801
18802 for (unsigned i = 0; i < NumLoads; ++i) {
18803 // Perform a single load.
18804 SDValue ScalarLoad =
18805 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
18806 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
18807 Chains.push_back(ScalarLoad.getValue(1));
18808    // Build the vector from the first element with SCALAR_TO_VECTOR in order
18809    // to avoid another round of DAG combining.
18810 if (i == 0)
18811 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
18812 else
18813 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
18814 ScalarLoad, DAG.getIntPtrConstant(i, dl));
18815
18816 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18817 }
18818
18819 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18820
18821 // Bitcast the loaded value to a vector of the original element type, in
18822 // the size of the target vector type.
18823 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
18824 unsigned SizeRatio = RegSz / MemSz;
18825
18826 if (Ext == ISD::SEXTLOAD) {
18827 // If we have SSE4.1, we can directly emit a VSEXT node.
18828 if (Subtarget.hasSSE41()) {
18829 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
18830 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18831 return Sext;
18832 }
18833
18834 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
18835 // lanes.
18836    assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
18837           "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
18838
18839 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
18840 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18841 return Shuff;
18842 }
18843
18844 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
18845 MemVT == MVT::v8i8) {
18846 SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
18847 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18848 return Sext;
18849 }
18850
18851 // Redistribute the loaded elements into the different locations.
18852 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
18853 for (unsigned i = 0; i != NumElems; ++i)
18854 ShuffleVec[i * SizeRatio] = i;
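  // For example (annotation), with NumElems = 4 and SizeRatio = 4 this builds
  // the mask <0,-1,-1,-1, 1,-1,-1,-1, 2,-1,-1,-1, 3,-1,-1,-1>, placing each
  // loaded element at the bottom of its widened lane and leaving the rest undef.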
18855
18856 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18857 DAG.getUNDEF(WideVecVT), ShuffleVec);
18858
18859 // Bitcast to the requested type.
18860 Shuff = DAG.getBitcast(RegVT, Shuff);
18861 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18862 return Shuff;
18863}
18864
18865/// Return true if the node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes,
18866/// each of which has no other use apart from the AND / OR.
18867static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
18868 Opc = Op.getOpcode();
18869 if (Opc != ISD::OR && Opc != ISD::AND)
18870 return false;
18871 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18872 Op.getOperand(0).hasOneUse() &&
18873 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
18874 Op.getOperand(1).hasOneUse());
18875}
18876
18877/// Return true if the node is an ISD::XOR of an X86ISD::SETCC and 1, and the
18878/// SETCC node has a single use.
18879static bool isXor1OfSetCC(SDValue Op) {
18880 if (Op.getOpcode() != ISD::XOR)
18881 return false;
18882 if (isOneConstant(Op.getOperand(1)))
18883 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18884 Op.getOperand(0).hasOneUse();
18885 return false;
18886}
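// For instance (annotation), this matches the node produced for IR like
//   %r = xor i1 %c, true
// once %c has been lowered to an X86ISD::SETCC.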
18887
18888SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
18889 bool addTest = true;
18890 SDValue Chain = Op.getOperand(0);
18891 SDValue Cond = Op.getOperand(1);
18892 SDValue Dest = Op.getOperand(2);
18893 SDLoc dl(Op);
18894 SDValue CC;
18895 bool Inverted = false;
18896
18897 if (Cond.getOpcode() == ISD::SETCC) {
18898 // Check for setcc([su]{add,sub,mul}o == 0).
18899 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
18900 isNullConstant(Cond.getOperand(1)) &&
18901 Cond.getOperand(0).getResNo() == 1 &&
18902 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
18903 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
18904 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
18905 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
18906 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
18907 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
18908 Inverted = true;
18909 Cond = Cond.getOperand(0);
18910 } else {
18911 if (SDValue NewCond = LowerSETCC(Cond, DAG))
18912 Cond = NewCond;
18913 }
18914 }
18915#if 0
18916 // FIXME: LowerXALUO doesn't handle these!!
18917 else if (Cond.getOpcode() == X86ISD::ADD ||
18918 Cond.getOpcode() == X86ISD::SUB ||
18919 Cond.getOpcode() == X86ISD::SMUL ||
18920 Cond.getOpcode() == X86ISD::UMUL)
18921 Cond = LowerXALUO(Cond, DAG);
18922#endif
18923
18924  // Look past (and (setcc_carry (cmp ...)), 1).
18925 if (Cond.getOpcode() == ISD::AND &&
18926 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18927 isOneConstant(Cond.getOperand(1)))
18928 Cond = Cond.getOperand(0);
18929
18930  // If the condition flag is set by an X86ISD::CMP, then use it as the
18931  // condition-setting operand in place of the X86ISD::SETCC.
18932 unsigned CondOpcode = Cond.getOpcode();
18933 if (CondOpcode == X86ISD::SETCC ||
18934 CondOpcode == X86ISD::SETCC_CARRY) {
18935 CC = Cond.getOperand(0);
18936
18937 SDValue Cmp = Cond.getOperand(1);
18938 unsigned Opc = Cmp.getOpcode();
18939 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
18940 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
18941 Cond = Cmp;
18942 addTest = false;
18943 } else {
18944 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
18945 default: break;
18946 case X86::COND_O:
18947 case X86::COND_B:
18948 // These can only come from an arithmetic instruction with overflow,
18949 // e.g. SADDO, UADDO.
18950 Cond = Cond.getOperand(1);
18951 addTest = false;
18952 break;
18953 }
18954 }
18955 }
18956 CondOpcode = Cond.getOpcode();
18957 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18958 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18959 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18960 Cond.getOperand(0).getValueType() != MVT::i8)) {
18961 SDValue LHS = Cond.getOperand(0);
18962 SDValue RHS = Cond.getOperand(1);
18963 unsigned X86Opcode;
18964 unsigned X86Cond;
18965 SDVTList VTs;
18966 // Keep this in sync with LowerXALUO, otherwise we might create redundant
18967 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
18968 // X86ISD::INC).
18969 switch (CondOpcode) {
18970 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18971 case ISD::SADDO:
18972 if (isOneConstant(RHS)) {
18973 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
18974 break;
18975 }
18976 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18977 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18978 case ISD::SSUBO:
18979 if (isOneConstant(RHS)) {
18980 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
18981 break;
18982 }
18983 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18984 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18985 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18986 default: llvm_unreachable("unexpected overflowing operator")::llvm::llvm_unreachable_internal("unexpected overflowing operator"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 18986)
;
18987 }
18988 if (Inverted)
18989 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
18990 if (CondOpcode == ISD::UMULO)
18991 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
18992 MVT::i32);
18993 else
18994 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18995
18996 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
18997
18998 if (CondOpcode == ISD::UMULO)
18999 Cond = X86Op.getValue(2);
19000 else
19001 Cond = X86Op.getValue(1);
19002
19003 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
19004 addTest = false;
19005 } else {
19006 unsigned CondOpc;
19007 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
19008 SDValue Cmp = Cond.getOperand(0).getOperand(1);
19009 if (CondOpc == ISD::OR) {
19010 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
19011 // two branches instead of an explicit OR instruction with a
19012 // separate test.
19013 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19014 isX86LogicalCmp(Cmp)) {
19015 CC = Cond.getOperand(0).getOperand(0);
19016 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19017 Chain, Dest, CC, Cmp);
19018 CC = Cond.getOperand(1).getOperand(0);
19019 Cond = Cmp;
19020 addTest = false;
19021 }
19022 } else { // ISD::AND
19023 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
19024 // two branches instead of an explicit AND instruction with a
19025 // separate test. However, we only do this if this block doesn't
19026 // have a fall-through edge, because this requires an explicit
19027 // jmp when the condition is false.
19028 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19029 isX86LogicalCmp(Cmp) &&
19030 Op.getNode()->hasOneUse()) {
19031 X86::CondCode CCode =
19032 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19033 CCode = X86::GetOppositeBranchCondition(CCode);
19034 CC = DAG.getConstant(CCode, dl, MVT::i8);
19035 SDNode *User = *Op.getNode()->use_begin();
19036 // Look for an unconditional branch following this conditional branch.
19037 // We need this because we need to reverse the successors in order
19038 // to implement FCMP_OEQ.
19039 if (User->getOpcode() == ISD::BR) {
19040 SDValue FalseBB = User->getOperand(1);
19041 SDNode *NewBR =
19042 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19043          assert(NewBR == User);
19044 (void)NewBR;
19045 Dest = FalseBB;
19046
19047 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19048 Chain, Dest, CC, Cmp);
19049 X86::CondCode CCode =
19050 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
19051 CCode = X86::GetOppositeBranchCondition(CCode);
19052 CC = DAG.getConstant(CCode, dl, MVT::i8);
19053 Cond = Cmp;
19054 addTest = false;
19055 }
19056 }
19057 }
19058 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
19059    // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
19060    // It should be transformed during DAG combining, except when the condition
19061    // is set by an arithmetic-with-overflow node.
19062 X86::CondCode CCode =
19063 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19064 CCode = X86::GetOppositeBranchCondition(CCode);
19065 CC = DAG.getConstant(CCode, dl, MVT::i8);
19066 Cond = Cond.getOperand(0).getOperand(1);
19067 addTest = false;
19068 } else if (Cond.getOpcode() == ISD::SETCC &&
19069 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
19070 // For FCMP_OEQ, we can emit
19071 // two branches instead of an explicit AND instruction with a
19072 // separate test. However, we only do this if this block doesn't
19073 // have a fall-through edge, because this requires an explicit
19074 // jmp when the condition is false.
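    // Roughly (annotation), for a brcond on (setoeq f32 %x, %y) this becomes:
    //   ucomiss %y, %x
    //   jne  <false>      ; ordered and not equal
    //   jp   <false>      ; unordered
    //   jmp  <true>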
19075 if (Op.getNode()->hasOneUse()) {
19076 SDNode *User = *Op.getNode()->use_begin();
19077 // Look for an unconditional branch following this conditional branch.
19078 // We need this because we need to reverse the successors in order
19079 // to implement FCMP_OEQ.
19080 if (User->getOpcode() == ISD::BR) {
19081 SDValue FalseBB = User->getOperand(1);
19082 SDNode *NewBR =
19083 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19084        assert(NewBR == User);
19085 (void)NewBR;
19086 Dest = FalseBB;
19087
19088 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19089 Cond.getOperand(0), Cond.getOperand(1));
19090 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19091 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19092 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19093 Chain, Dest, CC, Cmp);
19094 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
19095 Cond = Cmp;
19096 addTest = false;
19097 }
19098 }
19099 } else if (Cond.getOpcode() == ISD::SETCC &&
19100 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
19101 // For FCMP_UNE, we can emit
19102 // two branches instead of an explicit AND instruction with a
19103 // separate test. However, we only do this if this block doesn't
19104 // have a fall-through edge, because this requires an explicit
19105 // jmp when the condition is false.
19106 if (Op.getNode()->hasOneUse()) {
19107 SDNode *User = *Op.getNode()->use_begin();
19108 // Look for an unconditional branch following this conditional branch.
19109 // We need this because we need to reverse the successors in order
19110 // to implement FCMP_UNE.
19111 if (User->getOpcode() == ISD::BR) {
19112 SDValue FalseBB = User->getOperand(1);
19113 SDNode *NewBR =
19114 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19115        assert(NewBR == User);
19116 (void)NewBR;
19117
19118 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19119 Cond.getOperand(0), Cond.getOperand(1));
19120 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19121 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19122 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19123 Chain, Dest, CC, Cmp);
19124 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
19125 Cond = Cmp;
19126 addTest = false;
19127 Dest = FalseBB;
19128 }
19129 }
19130 }
19131 }
19132
19133 if (addTest) {
19134    // Look past the truncate if the high bits are known zero.
19135 if (isTruncWithZeroHighBitsInput(Cond, DAG))
19136 Cond = Cond.getOperand(0);
19137
19138 // We know the result of AND is compared against zero. Try to match
19139 // it to BT.
19140 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
19141 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {
19142 CC = NewSetCC.getOperand(0);
19143 Cond = NewSetCC.getOperand(1);
19144 addTest = false;
19145 }
19146 }
19147 }
19148
19149 if (addTest) {
19150 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
19151 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
19152 Cond = EmitTest(Cond, X86Cond, dl, DAG);
19153 }
19154 Cond = ConvertCmpIfNecessary(Cond, DAG);
19155 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19156 Chain, Dest, CC, Cond);
19157}
19158
19159// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
19160// Calls to _alloca are needed to probe the stack when allocating more than 4k
19161// bytes in one go. Touching the stack at 4K increments is necessary to ensure
19162// that the guard pages used by the OS virtual memory manager are allocated in
19163// the correct sequence.
19164SDValue
19165X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
19166 SelectionDAG &DAG) const {
19167 MachineFunction &MF = DAG.getMachineFunction();
19168 bool SplitStack = MF.shouldSplitStack();
19169 bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
19170 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
19171 SplitStack || EmitStackProbe;
19172 SDLoc dl(Op);
19173
19174 // Get the inputs.
19175 SDNode *Node = Op.getNode();
19176 SDValue Chain = Op.getOperand(0);
19177 SDValue Size = Op.getOperand(1);
19178 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
19179 EVT VT = Node->getValueType(0);
19180
19181 // Chain the dynamic stack allocation so that it doesn't modify the stack
19182 // pointer when other instructions are using the stack.
19183 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19184
19185 bool Is64Bit = Subtarget.is64Bit();
19186 MVT SPTy = getPointerTy(DAG.getDataLayout());
19187
19188 SDValue Result;
19189 if (!Lower) {
19190 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19191 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
19192    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
19193                    " not tell us which reg is the stack pointer!");
19194
19195 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
19196 Chain = SP.getValue(1);
19197 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
19198 unsigned StackAlign = TFI.getStackAlignment();
19199 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
19200 if (Align > StackAlign)
19201 Result = DAG.getNode(ISD::AND, dl, VT, Result,
19202 DAG.getConstant(-(uint64_t)Align, dl, VT));
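    // E.g. (annotation) for an alloca whose 32-byte alignment request exceeds
    // the default stack alignment, this computes Result = (SP - Size) & -32
    // before it is written back to the stack pointer below.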
19203 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
19204 } else if (SplitStack) {
19205 MachineRegisterInfo &MRI = MF.getRegInfo();
19206
19207 if (Is64Bit) {
19208      // The 64-bit implementation of segmented stacks needs to clobber both r10
19209      // and r11. This makes it impossible to use it along with nested parameters.
19210 const Function *F = MF.getFunction();
19211 for (const auto &A : F->args()) {
19212 if (A.hasNestAttr())
19213 report_fatal_error("Cannot use segmented stacks with functions that "
19214 "have nested arguments.");
19215 }
19216 }
19217
19218 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
19219 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
19220 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
19221 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
19222 DAG.getRegister(Vreg, SPTy));
19223 } else {
19224 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19225 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
19226 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
19227
19228 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19229 unsigned SPReg = RegInfo->getStackRegister();
19230 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
19231 Chain = SP.getValue(1);
19232
19233 if (Align) {
19234 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
19235 DAG.getConstant(-(uint64_t)Align, dl, VT));
19236 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
19237 }
19238
19239 Result = SP;
19240 }
19241
19242 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
19243 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
19244
19245 SDValue Ops[2] = {Result, Chain};
19246 return DAG.getMergeValues(Ops, dl);
19247}
19248
19249SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
19250 MachineFunction &MF = DAG.getMachineFunction();
19251 auto PtrVT = getPointerTy(MF.getDataLayout());
19252 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
19253
19254 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
19255 SDLoc DL(Op);
19256
19257 if (!Subtarget.is64Bit() ||
19258 Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
19259 // vastart just stores the address of the VarArgsFrameIndex slot into the
19260 // memory location argument.
19261 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
19262 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
19263 MachinePointerInfo(SV));
19264 }
19265
19266 // __va_list_tag:
19267 // gp_offset (0 - 6 * 8)
19268 // fp_offset (48 - 48 + 8 * 16)
19269 // overflow_arg_area (point to parameters coming in memory).
19270 // reg_save_area
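  // As a sketch (annotation; field names follow the SysV x86-64 ABI), the
  // stores below fill in:
  //   struct __va_list_tag {
  //     unsigned gp_offset;          // written at offset 0
  //     unsigned fp_offset;          // written at offset 4
  //     void    *overflow_arg_area;  // written at offset 8
  //     void    *reg_save_area;      // written at offset 16 (12 on x32)
  //   };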
19271 SmallVector<SDValue, 8> MemOps;
19272 SDValue FIN = Op.getOperand(1);
19273 // Store gp_offset
19274 SDValue Store = DAG.getStore(
19275 Op.getOperand(0), DL,
19276 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
19277 MachinePointerInfo(SV));
19278 MemOps.push_back(Store);
19279
19280 // Store fp_offset
19281 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
19282 Store = DAG.getStore(
19283 Op.getOperand(0), DL,
19284 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
19285 MachinePointerInfo(SV, 4));
19286 MemOps.push_back(Store);
19287
19288 // Store ptr to overflow_arg_area
19289 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
19290 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
19291 Store =
19292 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
19293 MemOps.push_back(Store);
19294
19295 // Store ptr to reg_save_area.
19296 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
19297 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
19298 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
19299 Store = DAG.getStore(
19300 Op.getOperand(0), DL, RSFIN, FIN,
19301 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
19302 MemOps.push_back(Store);
19303 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
19304}
19305
19306SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
19307  assert(Subtarget.is64Bit() &&
19308         "LowerVAARG only handles 64-bit va_arg!");
19309  assert(Op.getNumOperands() == 4);
19310
19311 MachineFunction &MF = DAG.getMachineFunction();
19312 if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
19313 // The Win64 ABI uses char* instead of a structure.
19314 return DAG.expandVAArg(Op.getNode());
19315
19316 SDValue Chain = Op.getOperand(0);
19317 SDValue SrcPtr = Op.getOperand(1);
19318 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
19319 unsigned Align = Op.getConstantOperandVal(3);
19320 SDLoc dl(Op);
19321
19322 EVT ArgVT = Op.getNode()->getValueType(0);
19323 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19324 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
19325 uint8_t ArgMode;
19326
19327 // Decide which area this value should be read from.
19328 // TODO: Implement the AMD64 ABI in its entirety. This simple
19329 // selection mechanism works only for the basic types.
19330 if (ArgVT == MVT::f80) {
19331 llvm_unreachable("va_arg for f80 not yet implemented")::llvm::llvm_unreachable_internal("va_arg for f80 not yet implemented"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19331)
;
19332 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
19333 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
19334 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
19335 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
19336 } else {
19337 llvm_unreachable("Unhandled argument type in LowerVAARG")::llvm::llvm_unreachable_internal("Unhandled argument type in LowerVAARG"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19337)
;
19338 }
19339
19340 if (ArgMode == 2) {
19341 // Sanity Check: Make sure using fp_offset makes sense.
19342    assert(!Subtarget.useSoftFloat() &&
19343           !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
19344           Subtarget.hasSSE1());
19345 }
19346
19347 // Insert VAARG_64 node into the DAG
19348 // VAARG_64 returns two values: Variable Argument Address, Chain
19349 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
19350 DAG.getConstant(ArgMode, dl, MVT::i8),
19351 DAG.getConstant(Align, dl, MVT::i32)};
19352 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
19353 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
19354 VTs, InstOps, MVT::i64,
19355 MachinePointerInfo(SV),
19356 /*Align=*/0,
19357 /*Volatile=*/false,
19358 /*ReadMem=*/true,
19359 /*WriteMem=*/true);
19360 Chain = VAARG.getValue(1);
19361
19362 // Load the next argument and return it
19363 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
19364}
19365
19366static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
19367 SelectionDAG &DAG) {
19368 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
19369 // where a va_list is still an i8*.
19370 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!")(static_cast <bool> (Subtarget.is64Bit() && "This code only handles 64-bit va_copy!"
) ? void (0) : __assert_fail ("Subtarget.is64Bit() && \"This code only handles 64-bit va_copy!\""
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19370, __extension__ __PRETTY_FUNCTION__))
;
19371 if (Subtarget.isCallingConvWin64(
19372 DAG.getMachineFunction().getFunction()->getCallingConv()))
19373 // Probably a Win64 va_copy.
19374 return DAG.expandVACopy(Op.getNode());
19375
19376 SDValue Chain = Op.getOperand(0);
19377 SDValue DstPtr = Op.getOperand(1);
19378 SDValue SrcPtr = Op.getOperand(2);
19379 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
19380 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
19381 SDLoc DL(Op);
19382
19383 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
19384 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
19385 false, false,
19386 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
19387}
19388
19389/// Handle vector element shifts where the shift amount is a constant.
19390/// Takes immediate version of shift as input.
19391static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
19392 SDValue SrcOp, uint64_t ShiftAmt,
19393 SelectionDAG &DAG) {
19394 MVT ElementType = VT.getVectorElementType();
19395
19396  // Bitcast the source vector to the output type; this is mainly necessary for
19397 // vXi8/vXi64 shifts.
19398 if (VT != SrcOp.getSimpleValueType())
19399 SrcOp = DAG.getBitcast(VT, SrcOp);
19400
19401 // Fold this packed shift into its first operand if ShiftAmt is 0.
19402 if (ShiftAmt == 0)
19403 return SrcOp;
19404
19405 // Check for ShiftAmt >= element width
19406 if (ShiftAmt >= ElementType.getSizeInBits()) {
19407 if (Opc == X86ISD::VSRAI)
19408 ShiftAmt = ElementType.getSizeInBits() - 1;
19409 else
19410 return DAG.getConstant(0, dl, VT);
19411 }
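  // For example (annotation), a VSRAI by 40 on v4i32 elements is clamped to a
  // shift by 31, while a VSHLI or VSRLI by 40 folds to the all-zeros vector.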
19412
19413  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
19414         && "Unknown target vector shift-by-constant node");
19415
19416 // Fold this packed vector shift into a build vector if SrcOp is a
19417 // vector of Constants or UNDEFs.
19418 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
19419 SmallVector<SDValue, 8> Elts;
19420 unsigned NumElts = SrcOp->getNumOperands();
19421 ConstantSDNode *ND;
19422
19423 switch(Opc) {
19424 default: llvm_unreachable("Unknown opcode!")::llvm::llvm_unreachable_internal("Unknown opcode!", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19424)
;
19425 case X86ISD::VSHLI:
19426 for (unsigned i=0; i!=NumElts; ++i) {
19427 SDValue CurrentOp = SrcOp->getOperand(i);
19428 if (CurrentOp->isUndef()) {
19429 Elts.push_back(CurrentOp);
19430 continue;
19431 }
19432 ND = cast<ConstantSDNode>(CurrentOp);
19433 const APInt &C = ND->getAPIntValue();
19434 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
19435 }
19436 break;
19437 case X86ISD::VSRLI:
19438 for (unsigned i=0; i!=NumElts; ++i) {
19439 SDValue CurrentOp = SrcOp->getOperand(i);
19440 if (CurrentOp->isUndef()) {
19441 Elts.push_back(CurrentOp);
19442 continue;
19443 }
19444 ND = cast<ConstantSDNode>(CurrentOp);
19445 const APInt &C = ND->getAPIntValue();
19446 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
19447 }
19448 break;
19449 case X86ISD::VSRAI:
19450 for (unsigned i=0; i!=NumElts; ++i) {
19451 SDValue CurrentOp = SrcOp->getOperand(i);
19452 if (CurrentOp->isUndef()) {
19453 Elts.push_back(CurrentOp);
19454 continue;
19455 }
19456 ND = cast<ConstantSDNode>(CurrentOp);
19457 const APInt &C = ND->getAPIntValue();
19458 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
19459 }
19460 break;
19461 }
19462
19463 return DAG.getBuildVector(VT, dl, Elts);
19464 }
19465
19466 return DAG.getNode(Opc, dl, VT, SrcOp,
19467 DAG.getConstant(ShiftAmt, dl, MVT::i8));
19468}
19469
19470/// Handle vector element shifts where the shift amount may or may not be a
19471/// constant. Takes immediate version of shift as input.
19472static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
19473 SDValue SrcOp, SDValue ShAmt,
19474 const X86Subtarget &Subtarget,
19475 SelectionDAG &DAG) {
19476 MVT SVT = ShAmt.getSimpleValueType();
19477  assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
19478
19479 // Catch shift-by-constant.
19480 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
19481 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
19482 CShAmt->getZExtValue(), DAG);
19483
19484 // Change opcode to non-immediate version
19485 switch (Opc) {
19486 default: llvm_unreachable("Unknown target vector shift node")::llvm::llvm_unreachable_internal("Unknown target vector shift node"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 19486)
;
19487 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
19488 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
19489 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
19490 }
19491
19492  // Need to build a vector containing the shift amount.
19493  // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
19494 // +=================+============+=======================================+
19495 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
19496 // +=================+============+=======================================+
19497 // | i64 | Yes, No | Use ShAmt as lowest elt |
19498 // | i32 | Yes | zero-extend in-reg |
19499 // | (i32 zext(i16)) | Yes | zero-extend in-reg |
19500 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
19501 // +=================+============+=======================================+
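  // For instance (annotation), on an SSE2-only target an i32 shift amount is
  // materialized as build_vector(ShAmt, 0, undef, undef) : v4i32 and then
  // bitcast to the 128-bit count type expected by the VSHL/VSRL/VSRA nodes.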
19502
19503 if (SVT == MVT::i64)
19504 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
19505 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
19506 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
19507 ShAmt = ShAmt.getOperand(0);
19508 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
19509 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19510 } else if (Subtarget.hasSSE41() &&
19511 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
19512 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
19513 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19514 } else {
19515 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
19516 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
19517 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
19518 }
19519
19520 // The return type has to be a 128-bit type with the same element
19521 // type as the input type.
19522 MVT EltVT = VT.getVectorElementType();
19523 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
19524
19525 ShAmt = DAG.getBitcast(ShVT, ShAmt);
19526 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
19527}
19528
19529/// \brief Return Mask with the necessary casting or extending
19530/// for \p Mask according to \p MaskVT when lowering masking intrinsics
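/// For example (annotation), an MVT::i8 mask paired with MaskVT = v4i1 is
/// bitcast to v8i1 and its low four elements are then taken with an
/// EXTRACT_SUBVECTOR.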
19531static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
19532 const X86Subtarget &Subtarget, SelectionDAG &DAG,
19533 const SDLoc &dl) {
19534
19535 if (isAllOnesConstant(Mask))
19536 return DAG.getConstant(1, dl, MaskVT);
19537 if (X86::isZeroNode(Mask))
19538 return DAG.getConstant(0, dl, MaskVT);
19539
19540 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
19541 // Mask should be extended
19542 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
19543 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
19544 }
19545
19546 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
19547 if (MaskVT == MVT::v64i1) {
19548      assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
19549      // In 32-bit mode a bitcast of i64 is illegal; extend/split it instead.
19550 SDValue Lo, Hi;
19551 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19552 DAG.getConstant(0, dl, MVT::i32));
19553 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19554 DAG.getConstant(1, dl, MVT::i32));
19555
19556 Lo = DAG.getBitcast(MVT::v32i1, Lo);
19557 Hi = DAG.getBitcast(MVT::v32i1, Hi);
19558
19559 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
19560 } else {
19561      // MaskVT requires < 64 bits. Truncate the mask (should succeed in any
19562      // case) and bitcast.
19563 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
19564 return DAG.getBitcast(MaskVT,
19565 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
19566 }
19567
19568 } else {
19569 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19570 Mask.getSimpleValueType().getSizeInBits());
19571    // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
19572    // are extracted by EXTRACT_SUBVECTOR.
19573 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
19574 DAG.getBitcast(BitcastVT, Mask),
19575 DAG.getIntPtrConstant(0, dl));
19576 }
19577}
19578
19579/// \brief Return (and \p Op, \p Mask) for compare instructions or
19580/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
19581/// necessary casting or extending for \p Mask when lowering masking intrinsics
19582static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
19583 SDValue PreservedSrc,
19584 const X86Subtarget &Subtarget,
19585 SelectionDAG &DAG) {
19586 MVT VT = Op.getSimpleValueType();
19587 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19588 unsigned OpcodeSelect = ISD::VSELECT;
19589 SDLoc dl(Op);
19590
19591 if (isAllOnesConstant(Mask))
19592 return Op;
19593
19594 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19595
19596 switch (Op.getOpcode()) {
19597 default: break;
19598 case X86ISD::CMPM:
19599 case X86ISD::CMPM_RND:
19600 case X86ISD::CMPMU:
19601 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
19602 case X86ISD::VFPCLASS:
19603 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
19604 case X86ISD::VTRUNC:
19605 case X86ISD::VTRUNCS:
19606 case X86ISD::VTRUNCUS:
19607 case X86ISD::CVTPS2PH:
19608    // We can't use ISD::VSELECT here because it is not always "Legal"
19609    // for the destination type. For example, vpmovqb requires only AVX512,
19610    // while a vselect that operates on byte elements requires BWI.
19611 OpcodeSelect = X86ISD::SELECT;
19612 break;
19613 }
19614 if (PreservedSrc.isUndef())
19615 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19616 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
19617}
19618
19619/// \brief Creates an SDNode for a predicated scalar operation.
19620/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
19621/// The mask comes in as MVT::i8 and should be transformed
19622/// to MVT::v1i1 while lowering masking intrinsics.
19623/// The main difference between ScalarMaskingNode and VectorMaskingNode is that
19624/// the former uses "X86select" instead of "vselect": we simply can't create a
19625/// "vselect" node for a scalar instruction.
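/// For instance (annotation), a constant mask with bit 0 set returns \p Op
/// unmasked, while an undef \p PreservedSrc is replaced by a zero vector
/// before building the X86ISD::SELECTS node.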
19626static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
19627 SDValue PreservedSrc,
19628 const X86Subtarget &Subtarget,
19629 SelectionDAG &DAG) {
19630
19631 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
19632 if (MaskConst->getZExtValue() & 0x1)
19633 return Op;
19634
19635 MVT VT = Op.getSimpleValueType();
19636 SDLoc dl(Op);
19637
19638 SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
19639 if (Op.getOpcode() == X86ISD::FSETCCM ||
19640 Op.getOpcode() == X86ISD::FSETCCM_RND)
19641 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
19642 if (Op.getOpcode() == X86ISD::VFPCLASSS)
19643 return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
19644
19645 if (PreservedSrc.isUndef())
19646 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19647 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
19648}
19649
19650static int getSEHRegistrationNodeSize(const Function *Fn) {
19651 if (!Fn->hasPersonalityFn())
19652 report_fatal_error(
19653 "querying registration node size for function without personality");
19654 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
19655 // WinEHStatePass for the full struct definition.
19656 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
19657 case EHPersonality::MSVC_X86SEH: return 24;
19658 case EHPersonality::MSVC_CXX: return 16;
19659 default: break;
19660 }
19661 report_fatal_error(
19662 "can only recover FP for 32-bit MSVC EH personality functions");
19663}
19664
19665/// When the MSVC runtime transfers control to us, either to an outlined
19666/// function or when returning to a parent frame after catching an exception, we
19667/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
19668/// Here's the math:
19669/// RegNodeBase = EntryEBP - RegNodeSize
19670/// ParentFP = RegNodeBase - ParentFrameOffset
19671/// Subtracting RegNodeSize takes us to the offset of the registration node, and
19672/// subtracting the offset (negative on x86) takes us back to the parent FP.
19673static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
19674 SDValue EntryEBP) {
19675 MachineFunction &MF = DAG.getMachineFunction();
19676 SDLoc dl;
19677
19678 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19679 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19680
19681 // It's possible that the parent function no longer has a personality function
19682 // if the exceptional code was optimized away, in which case we just return
19683 // the incoming EBP.
19684 if (!Fn->hasPersonalityFn())
19685 return EntryEBP;
19686
19687 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
19688 // registration, or the .set_setframe offset.
19689 MCSymbol *OffsetSym =
19690 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
19691 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
19692 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
19693 SDValue ParentFrameOffset =
19694 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
19695
19696 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
19697 // prologue to RBP in the parent function.
19698 const X86Subtarget &Subtarget =
19699 static_cast<const X86Subtarget &>(DAG.getSubtarget());
19700 if (Subtarget.is64Bit())
19701 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
19702
19703 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
19704 // RegNodeBase = EntryEBP - RegNodeSize
19705 // ParentFP = RegNodeBase - ParentFrameOffset
19706 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
19707 DAG.getConstant(RegNodeSize, dl, PtrVT));
19708 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
19709}
19710
19711SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
19712 SelectionDAG &DAG) const {
19713 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
19714 auto isRoundModeCurDirection = [](SDValue Rnd) {
19715 if (!isa<ConstantSDNode>(Rnd))
19716 return false;
19717
19718 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
19719 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
19720 };
19721
19722 SDLoc dl(Op);
19723 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19724 MVT VT = Op.getSimpleValueType();
19725 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
19726 if (IntrData) {
19727 switch(IntrData->Type) {
19728 case INTR_TYPE_1OP:
19729 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
19730 case INTR_TYPE_2OP:
19731 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19732 Op.getOperand(2));
19733 case INTR_TYPE_3OP:
19734 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19735 Op.getOperand(2), Op.getOperand(3));
19736 case INTR_TYPE_4OP:
19737 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19738 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
19739 case INTR_TYPE_1OP_MASK_RM: {
19740 SDValue Src = Op.getOperand(1);
19741 SDValue PassThru = Op.getOperand(2);
19742 SDValue Mask = Op.getOperand(3);
19743 SDValue RoundingMode;
19744 // We always add rounding mode to the Node.
19745 // If the rounding mode is not specified, we add the
19746 // "current direction" mode.
19747 if (Op.getNumOperands() == 4)
19748 RoundingMode =
19749 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19750 else
19751 RoundingMode = Op.getOperand(4);
19752 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
19753 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19754 RoundingMode),
19755 Mask, PassThru, Subtarget, DAG);
19756 }
19757 case INTR_TYPE_1OP_MASK: {
19758 SDValue Src = Op.getOperand(1);
19759 SDValue PassThru = Op.getOperand(2);
19760 SDValue Mask = Op.getOperand(3);
19761 // We add rounding mode to the Node when
19762 // - RM Opcode is specified and
19763 // - RM is not "current direction".
19764 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19765 if (IntrWithRoundingModeOpcode != 0) {
19766 SDValue Rnd = Op.getOperand(4);
19767 if (!isRoundModeCurDirection(Rnd)) {
19768 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19769 dl, Op.getValueType(),
19770 Src, Rnd),
19771 Mask, PassThru, Subtarget, DAG);
19772 }
19773 }
19774 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19775 Mask, PassThru, Subtarget, DAG);
19776 }
19777 case INTR_TYPE_SCALAR_MASK: {
19778 SDValue Src1 = Op.getOperand(1);
19779 SDValue Src2 = Op.getOperand(2);
19780 SDValue passThru = Op.getOperand(3);
19781 SDValue Mask = Op.getOperand(4);
19782 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19783 // There are 2 kinds of intrinsics in this group:
19784 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
19785 // (2) With rounding mode and sae - 7 operands.
19786 bool HasRounding = IntrWithRoundingModeOpcode != 0;
19787 if (Op.getNumOperands() == (5U + HasRounding)) {
19788 if (HasRounding) {
19789 SDValue Rnd = Op.getOperand(5);
19790 if (!isRoundModeCurDirection(Rnd))
19791 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19792 dl, VT, Src1, Src2, Rnd),
19793 Mask, passThru, Subtarget, DAG);
19794 }
19795 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
19796 Src2),
19797 Mask, passThru, Subtarget, DAG);
19798 }
19799
19800 assert(Op.getNumOperands() == (6U + HasRounding) &&
19801 "Unexpected intrinsic form");
19802 SDValue RoundingMode = Op.getOperand(5);
19803 if (HasRounding) {
19804 SDValue Sae = Op.getOperand(6);
19805 if (!isRoundModeCurDirection(Sae))
19806 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19807 dl, VT, Src1, Src2,
19808 RoundingMode, Sae),
19809 Mask, passThru, Subtarget, DAG);
19810 }
19811 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
19812 Src2, RoundingMode),
19813 Mask, passThru, Subtarget, DAG);
19814 }
19815 case INTR_TYPE_SCALAR_MASK_RM: {
19816 SDValue Src1 = Op.getOperand(1);
19817 SDValue Src2 = Op.getOperand(2);
19818 SDValue Src0 = Op.getOperand(3);
19819 SDValue Mask = Op.getOperand(4);
19820 // There are 2 kinds of intrinsics in this group:
19821 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
19822 // (2) With rounding mode and sae - 7 operands.
19823 if (Op.getNumOperands() == 6) {
19824 SDValue Sae = Op.getOperand(5);
19825 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19826 Sae),
19827 Mask, Src0, Subtarget, DAG);
19828 }
19829 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
19830 SDValue RoundingMode = Op.getOperand(5);
19831 SDValue Sae = Op.getOperand(6);
19832 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19833 RoundingMode, Sae),
19834 Mask, Src0, Subtarget, DAG);
19835 }
19836 case INTR_TYPE_2OP_MASK:
19837 case INTR_TYPE_2OP_IMM8_MASK: {
19838 SDValue Src1 = Op.getOperand(1);
19839 SDValue Src2 = Op.getOperand(2);
19840 SDValue PassThru = Op.getOperand(3);
19841 SDValue Mask = Op.getOperand(4);
19842
19843 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
19844 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
19845
19846 // We specify 2 possible opcodes for intrinsics with rounding modes.
19847 // First, we check if the intrinsic may have non-default rounding mode,
19848 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19849 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19850 if (IntrWithRoundingModeOpcode != 0) {
19851 SDValue Rnd = Op.getOperand(5);
19852 if (!isRoundModeCurDirection(Rnd)) {
19853 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19854 dl, Op.getValueType(),
19855 Src1, Src2, Rnd),
19856 Mask, PassThru, Subtarget, DAG);
19857 }
19858 }
19859 // TODO: Intrinsics should have fast-math-flags to propagate.
19860 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
19861 Mask, PassThru, Subtarget, DAG);
19862 }
19863 case INTR_TYPE_2OP_MASK_RM: {
19864 SDValue Src1 = Op.getOperand(1);
19865 SDValue Src2 = Op.getOperand(2);
19866 SDValue PassThru = Op.getOperand(3);
19867 SDValue Mask = Op.getOperand(4);
19868 // We specify 2 possible modes for intrinsics, with/without rounding
19869 // modes.
19870 // First, we check if the intrinsic has a rounding mode (6 operands);
19871 // if not, we set the rounding mode to "current".
19872 SDValue Rnd;
19873 if (Op.getNumOperands() == 6)
19874 Rnd = Op.getOperand(5);
19875 else
19876 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19877 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19878 Src1, Src2, Rnd),
19879 Mask, PassThru, Subtarget, DAG);
19880 }
19881 case INTR_TYPE_3OP_SCALAR_MASK: {
19882 SDValue Src1 = Op.getOperand(1);
19883 SDValue Src2 = Op.getOperand(2);
19884 SDValue Src3 = Op.getOperand(3);
19885 SDValue PassThru = Op.getOperand(4);
19886 SDValue Mask = Op.getOperand(5);
19887
19888 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19889 if (IntrWithRoundingModeOpcode != 0) {
19890 SDValue Rnd = Op.getOperand(6);
19891 if (!isRoundModeCurDirection(Rnd))
19892 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19893 dl, VT, Src1, Src2, Src3, Rnd),
19894 Mask, PassThru, Subtarget, DAG);
19895 }
19896 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
19897 Src2, Src3),
19898 Mask, PassThru, Subtarget, DAG);
19899 }
19900 case INTR_TYPE_3OP_MASK_RM: {
19901 SDValue Src1 = Op.getOperand(1);
19902 SDValue Src2 = Op.getOperand(2);
19903 SDValue Imm = Op.getOperand(3);
19904 SDValue PassThru = Op.getOperand(4);
19905 SDValue Mask = Op.getOperand(5);
19906 // We specify 2 possible modes for intrinsics, with/without rounding
19907 // modes.
19908 // First, we check if the intrinsic has a rounding mode (7 operands);
19909 // if not, we set the rounding mode to "current".
19910 SDValue Rnd;
19911 if (Op.getNumOperands() == 7)
19912 Rnd = Op.getOperand(6);
19913 else
19914 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19915 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19916 Src1, Src2, Imm, Rnd),
19917 Mask, PassThru, Subtarget, DAG);
19918 }
19919 case INTR_TYPE_3OP_IMM8_MASK:
19920 case INTR_TYPE_3OP_MASK: {
19921 SDValue Src1 = Op.getOperand(1);
19922 SDValue Src2 = Op.getOperand(2);
19923 SDValue Src3 = Op.getOperand(3);
19924 SDValue PassThru = Op.getOperand(4);
19925 SDValue Mask = Op.getOperand(5);
19926
19927 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
19928 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
19929
19930 // We specify 2 possible opcodes for intrinsics with rounding modes.
19931 // First, we check if the intrinsic may have non-default rounding mode,
19932 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19933 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19934 if (IntrWithRoundingModeOpcode != 0) {
19935 SDValue Rnd = Op.getOperand(6);
19936 if (!isRoundModeCurDirection(Rnd)) {
19937 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19938 dl, Op.getValueType(),
19939 Src1, Src2, Src3, Rnd),
19940 Mask, PassThru, Subtarget, DAG);
19941 }
19942 }
19943 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19944 Src1, Src2, Src3),
19945 Mask, PassThru, Subtarget, DAG);
19946 }
19947 case VPERM_2OP_MASK : {
19948 SDValue Src1 = Op.getOperand(1);
19949 SDValue Src2 = Op.getOperand(2);
19950 SDValue PassThru = Op.getOperand(3);
19951 SDValue Mask = Op.getOperand(4);
19952
19953 // Swap Src1 and Src2 in the node creation
19954 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
19955 Mask, PassThru, Subtarget, DAG);
19956 }
19957 case VPERM_3OP_MASKZ:
19958 case VPERM_3OP_MASK:{
19959 MVT VT = Op.getSimpleValueType();
19960 // Src2 is the PassThru
19961 SDValue Src1 = Op.getOperand(1);
19962 // PassThru needs to be the same type as the destination in order
19963 // to pattern match correctly.
19964 SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
19965 SDValue Src3 = Op.getOperand(3);
19966 SDValue Mask = Op.getOperand(4);
19967 SDValue PassThru = SDValue();
19968
19969 // set PassThru element
19970 if (IntrData->Type == VPERM_3OP_MASKZ)
19971 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19972 else
19973 PassThru = Src2;
19974
19975 // Swap Src1 and Src2 in the node creation
19976 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19977 dl, Op.getValueType(),
19978 Src2, Src1, Src3),
19979 Mask, PassThru, Subtarget, DAG);
19980 }
19981 case FMA_OP_MASK3:
19982 case FMA_OP_MASKZ:
19983 case FMA_OP_MASK: {
19984 SDValue Src1 = Op.getOperand(1);
19985 SDValue Src2 = Op.getOperand(2);
19986 SDValue Src3 = Op.getOperand(3);
19987 SDValue Mask = Op.getOperand(4);
19988 MVT VT = Op.getSimpleValueType();
19989 SDValue PassThru = SDValue();
19990
19991 // set PassThru element
19992 if (IntrData->Type == FMA_OP_MASKZ)
19993 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19994 else if (IntrData->Type == FMA_OP_MASK3)
19995 PassThru = Src3;
19996 else
19997 PassThru = Src1;
19998
19999 // We specify 2 possible opcodes for intrinsics with rounding modes.
20000 // First, we check if the intrinsic may have non-default rounding mode,
20001 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20002 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20003 if (IntrWithRoundingModeOpcode != 0) {
20004 SDValue Rnd = Op.getOperand(5);
20005 if (!isRoundModeCurDirection(Rnd))
20006 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20007 dl, Op.getValueType(),
20008 Src1, Src2, Src3, Rnd),
20009 Mask, PassThru, Subtarget, DAG);
20010 }
20011 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
20012 dl, Op.getValueType(),
20013 Src1, Src2, Src3),
20014 Mask, PassThru, Subtarget, DAG);
20015 }
20016 case FMA_OP_SCALAR_MASK:
20017 case FMA_OP_SCALAR_MASK3:
20018 case FMA_OP_SCALAR_MASKZ: {
20019 SDValue Src1 = Op.getOperand(1);
20020 SDValue Src2 = Op.getOperand(2);
20021 SDValue Src3 = Op.getOperand(3);
20022 SDValue Mask = Op.getOperand(4);
20023 MVT VT = Op.getSimpleValueType();
20024 SDValue PassThru = SDValue();
20025
20026 // set PassThru element
20027 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
20028 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20029 else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
20030 PassThru = Src3;
20031 else
20032 PassThru = Src1;
20033
20034 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20035 if (IntrWithRoundingModeOpcode != 0) {
20036 SDValue Rnd = Op.getOperand(5);
20037 if (!isRoundModeCurDirection(Rnd))
20038 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
20039 Op.getValueType(), Src1, Src2,
20040 Src3, Rnd),
20041 Mask, PassThru, Subtarget, DAG);
20042 }
20043
20044 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
20045 Op.getValueType(), Src1, Src2,
20046 Src3),
20047 Mask, PassThru, Subtarget, DAG);
20048 }
20049 case IFMA_OP_MASKZ:
20050 case IFMA_OP_MASK: {
20051 SDValue Src1 = Op.getOperand(1);
20052 SDValue Src2 = Op.getOperand(2);
20053 SDValue Src3 = Op.getOperand(3);
20054 SDValue Mask = Op.getOperand(4);
20055 MVT VT = Op.getSimpleValueType();
20056 SDValue PassThru = Src1;
20057
20058 // set PassThru element
20059 if (IntrData->Type == IFMA_OP_MASKZ)
20060 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20061
20062 // Note: we need to swizzle the operands to pass the multiply operands
20063 // first.
20064 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
20065 dl, Op.getValueType(),
20066 Src2, Src3, Src1),
20067 Mask, PassThru, Subtarget, DAG);
20068 }
20069 case TERLOG_OP_MASK:
20070 case TERLOG_OP_MASKZ: {
20071 SDValue Src1 = Op.getOperand(1);
20072 SDValue Src2 = Op.getOperand(2);
20073 SDValue Src3 = Op.getOperand(3);
20074 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
20075 SDValue Mask = Op.getOperand(5);
20076 MVT VT = Op.getSimpleValueType();
20077 SDValue PassThru = Src1;
20078 // Set PassThru element.
20079 if (IntrData->Type == TERLOG_OP_MASKZ)
20080 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20081
20082 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20083 Src1, Src2, Src3, Src4),
20084 Mask, PassThru, Subtarget, DAG);
20085 }
20086 case CVTPD2PS:
20087 // ISD::FP_ROUND has a second argument that indicates if the truncation
20088 // does not change the value. Set it to 0 since it can change.
20089 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
20090 DAG.getIntPtrConstant(0, dl));
20091 case CVTPD2PS_MASK: {
20092 SDValue Src = Op.getOperand(1);
20093 SDValue PassThru = Op.getOperand(2);
20094 SDValue Mask = Op.getOperand(3);
20095 // We add rounding mode to the Node when
20096 // - RM Opcode is specified and
20097 // - RM is not "current direction".
20098 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20099 if (IntrWithRoundingModeOpcode != 0) {
20100 SDValue Rnd = Op.getOperand(4);
20101 if (!isRoundModeCurDirection(Rnd)) {
20102 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20103 dl, Op.getValueType(),
20104 Src, Rnd),
20105 Mask, PassThru, Subtarget, DAG);
20106 }
20107 }
20108 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
20109 // ISD::FP_ROUND has a second argument that indicates if the truncation
20110 // does not change the value. Set it to 0 since it can change.
20111 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
20112 DAG.getIntPtrConstant(0, dl)),
20113 Mask, PassThru, Subtarget, DAG);
20114 }
20115 case FPCLASS: {
20116 // FPclass intrinsics with mask
20117 SDValue Src1 = Op.getOperand(1);
20118 MVT VT = Src1.getSimpleValueType();
20119 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20120 SDValue Imm = Op.getOperand(2);
20121 SDValue Mask = Op.getOperand(3);
20122 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20123 Mask.getSimpleValueType().getSizeInBits());
20124 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
20125 SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
20126 DAG.getConstant(0, dl, MaskVT),
20127 Subtarget, DAG);
20128 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
20129 DAG.getUNDEF(BitcastVT), FPclassMask,
20130 DAG.getIntPtrConstant(0, dl));
20131 return DAG.getBitcast(Op.getValueType(), Res);
20132 }
20133 case FPCLASSS: {
20134 SDValue Src1 = Op.getOperand(1);
20135 SDValue Imm = Op.getOperand(2);
20136 SDValue Mask = Op.getOperand(3);
20137 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
20138 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
20139 DAG.getConstant(0, dl, MVT::i1), Subtarget, DAG);
20140 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
20141 DAG.getIntPtrConstant(0, dl));
20142 }
20143 case CMP_MASK:
20144 case CMP_MASK_CC: {
20145 // Comparison intrinsics with masks.
20146 // Example of transformation:
20147 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
20148 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
20149 // (i8 (bitcast
20150 // (v8i1 (insert_subvector undef,
20151 // (v2i1 (and (PCMPEQM %a, %b),
20152 // (extract_subvector
20153 // (v8i1 (bitcast %mask)), 0))), 0))))
20154 MVT VT = Op.getOperand(1).getSimpleValueType();
20155 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20156 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
20157 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20158 Mask.getSimpleValueType().getSizeInBits());
20159 SDValue Cmp;
20160 if (IntrData->Type == CMP_MASK_CC) {
20161 SDValue CC = Op.getOperand(3);
20162 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
20163 // We specify 2 possible opcodes for intrinsics with rounding modes.
20164 // First, we check if the intrinsic may have non-default rounding mode,
20165 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20166 if (IntrData->Opc1 != 0) {
20167 SDValue Rnd = Op.getOperand(5);
20168 if (!isRoundModeCurDirection(Rnd))
20169 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
20170 Op.getOperand(2), CC, Rnd);
20171 }
20172 //default rounding mode
20173 if(!Cmp.getNode())
20174 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
20175 Op.getOperand(2), CC);
20176
20177 } else {
20178 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
20179 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
20180 Op.getOperand(2));
20181 }
20182 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
20183 DAG.getConstant(0, dl, MaskVT),
20184 Subtarget, DAG);
20185 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
20186 DAG.getUNDEF(BitcastVT), CmpMask,
20187 DAG.getIntPtrConstant(0, dl));
20188 return DAG.getBitcast(Op.getValueType(), Res);
20189 }
20190 case CMP_MASK_SCALAR_CC: {
20191 SDValue Src1 = Op.getOperand(1);
20192 SDValue Src2 = Op.getOperand(2);
20193 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
20194 SDValue Mask = Op.getOperand(4);
20195
20196 SDValue Cmp;
20197 if (IntrData->Opc1 != 0) {
20198 SDValue Rnd = Op.getOperand(5);
20199 if (!isRoundModeCurDirection(Rnd))
20200 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
20201 }
20202 //default rounding mode
20203 if(!Cmp.getNode())
20204 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
20205
20206 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
20207 DAG.getConstant(0, dl, MVT::i1),
20208 Subtarget, DAG);
20209 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
20210 DAG.getIntPtrConstant(0, dl));
20211 }
20212 case COMI: { // Comparison intrinsics
20213 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
20214 SDValue LHS = Op.getOperand(1);
20215 SDValue RHS = Op.getOperand(2);
20216 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
20217 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
20218 SDValue SetCC;
20219 switch (CC) {
20220 case ISD::SETEQ: { // (ZF = 1 and PF = 0)
20221 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
20222 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
20223 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
20224 break;
20225 }
20226 case ISD::SETNE: { // (ZF = 0 or PF = 1)
20227 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
20228 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
20229 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
20230 break;
20231 }
20232 case ISD::SETGT: // (CF = 0 and ZF = 0)
20233 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
20234 break;
20235 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
20236 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
20237 break;
20238 }
20239 case ISD::SETGE: // CF = 0
20240 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
20241 break;
20242 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
20243 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
20244 break;
20245 default:
20246 llvm_unreachable("Unexpected illegal condition!")::llvm::llvm_unreachable_internal("Unexpected illegal condition!"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 20246)
;
20247 }
20248 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20249 }
20250 case COMI_RM: { // Comparison intrinsics with Sae
20251 SDValue LHS = Op.getOperand(1);
20252 SDValue RHS = Op.getOperand(2);
20253 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
20254 SDValue Sae = Op.getOperand(4);
20255
20256 SDValue FCmp;
20257 if (isRoundModeCurDirection(Sae))
20258 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
20259 DAG.getConstant(CondVal, dl, MVT::i8));
20260 else
20261 FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
20262 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
20263 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
20264 DAG.getIntPtrConstant(0, dl));
20265 }
20266 case VSHIFT:
20267 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
20268 Op.getOperand(1), Op.getOperand(2), Subtarget,
20269 DAG);
20270 case COMPRESS_EXPAND_IN_REG: {
20271 SDValue Mask = Op.getOperand(3);
20272 SDValue DataToCompress = Op.getOperand(1);
20273 SDValue PassThru = Op.getOperand(2);
20274 if (isAllOnesConstant(Mask)) // return data as is
20275 return Op.getOperand(1);
20276
20277 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20278 DataToCompress),
20279 Mask, PassThru, Subtarget, DAG);
20280 }
20281 case BROADCASTM: {
20282 SDValue Mask = Op.getOperand(1);
20283 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20284 Mask.getSimpleValueType().getSizeInBits());
20285 Mask = DAG.getBitcast(MaskVT, Mask);
20286 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
20287 }
20288 case KUNPCK: {
20289 MVT VT = Op.getSimpleValueType();
20290 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
20291
20292 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
20293 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
20294 // Arguments should be swapped.
20295 SDValue Res = DAG.getNode(IntrData->Opc0, dl,
20296 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
20297 Src2, Src1);
20298 return DAG.getBitcast(VT, Res);
20299 }
20300 case MASK_BINOP: {
20301 MVT VT = Op.getSimpleValueType();
20302 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
20303
20304 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
20305 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
20306 SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
20307 return DAG.getBitcast(VT, Res);
20308 }
20309 case FIXUPIMMS:
20310 case FIXUPIMMS_MASKZ:
20311 case FIXUPIMM:
20312 case FIXUPIMM_MASKZ:{
20313 SDValue Src1 = Op.getOperand(1);
20314 SDValue Src2 = Op.getOperand(2);
20315 SDValue Src3 = Op.getOperand(3);
20316 SDValue Imm = Op.getOperand(4);
20317 SDValue Mask = Op.getOperand(5);
20318 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
20319 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
20320 // We specify 2 possible modes for intrinsics, with/without rounding
20321 // modes.
20322 // First, we check if the intrinsic has a rounding mode (7 operands);
20323 // if not, we set the rounding mode to "current".
20324 SDValue Rnd;
20325 if (Op.getNumOperands() == 7)
20326 Rnd = Op.getOperand(6);
20327 else
20328 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20329 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
20330 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20331 Src1, Src2, Src3, Imm, Rnd),
20332 Mask, Passthru, Subtarget, DAG);
20333 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
20334 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20335 Src1, Src2, Src3, Imm, Rnd),
20336 Mask, Passthru, Subtarget, DAG);
20337 }
20338 case CONVERT_TO_MASK: {
20339 MVT SrcVT = Op.getOperand(1).getSimpleValueType();
20340 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
20341 MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
20342
20343 SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
20344 Op.getOperand(1));
20345 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
20346 DAG.getUNDEF(BitcastVT), CvtMask,
20347 DAG.getIntPtrConstant(0, dl));
20348 return DAG.getBitcast(Op.getValueType(), Res);
20349 }
20350 case ROUNDP: {
20351 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
20352 // Clear the upper bits of the rounding immediate so that the legacy
20353 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
20354 SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
20355 Op.getOperand(2),
20356 DAG.getConstant(0xf, dl, MVT::i32));
20357 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20358 Op.getOperand(1), RoundingMode);
20359 }
20360 case ROUNDS: {
20361 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
20362 // Clear the upper bits of the rounding immediate so that the legacy
20363 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
20364 SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
20365 Op.getOperand(3),
20366 DAG.getConstant(0xf, dl, MVT::i32));
20367 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20368 Op.getOperand(1), Op.getOperand(2), RoundingMode);
20369 }
20370 default:
20371 break;
20372 }
20373 }
20374
20375 switch (IntNo) {
20376 default: return SDValue(); // Don't custom lower most intrinsics.
20377
20378 case Intrinsic::x86_avx2_permd:
20379 case Intrinsic::x86_avx2_permps:
20380 // Operands intentionally swapped. Mask is last operand to intrinsic,
20381 // but second operand for node/instruction.
20382 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
20383 Op.getOperand(2), Op.getOperand(1));
20384
20385 // ptest and testp intrinsics. The intrinsics these come from are designed to
20386 // return an integer value, not just an instruction, so lower them to the ptest
20387 // or testp pattern and a setcc for the result.
20388 case Intrinsic::x86_sse41_ptestz:
20389 case Intrinsic::x86_sse41_ptestc:
20390 case Intrinsic::x86_sse41_ptestnzc:
20391 case Intrinsic::x86_avx_ptestz_256:
20392 case Intrinsic::x86_avx_ptestc_256:
20393 case Intrinsic::x86_avx_ptestnzc_256:
20394 case Intrinsic::x86_avx_vtestz_ps:
20395 case Intrinsic::x86_avx_vtestc_ps:
20396 case Intrinsic::x86_avx_vtestnzc_ps:
20397 case Intrinsic::x86_avx_vtestz_pd:
20398 case Intrinsic::x86_avx_vtestc_pd:
20399 case Intrinsic::x86_avx_vtestnzc_pd:
20400 case Intrinsic::x86_avx_vtestz_ps_256:
20401 case Intrinsic::x86_avx_vtestc_ps_256:
20402 case Intrinsic::x86_avx_vtestnzc_ps_256:
20403 case Intrinsic::x86_avx_vtestz_pd_256:
20404 case Intrinsic::x86_avx_vtestc_pd_256:
20405 case Intrinsic::x86_avx_vtestnzc_pd_256: {
20406 bool IsTestPacked = false;
20407 X86::CondCode X86CC;
20408 switch (IntNo) {
20409 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.")::llvm::llvm_unreachable_internal("Bad fallthrough in Intrinsic lowering."
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 20409)
;
20410 case Intrinsic::x86_avx_vtestz_ps:
20411 case Intrinsic::x86_avx_vtestz_pd:
20412 case Intrinsic::x86_avx_vtestz_ps_256:
20413 case Intrinsic::x86_avx_vtestz_pd_256:
20414 IsTestPacked = true;
20415 LLVM_FALLTHROUGH;
20416 case Intrinsic::x86_sse41_ptestz:
20417 case Intrinsic::x86_avx_ptestz_256:
20418 // ZF = 1
20419 X86CC = X86::COND_E;
20420 break;
20421 case Intrinsic::x86_avx_vtestc_ps:
20422 case Intrinsic::x86_avx_vtestc_pd:
20423 case Intrinsic::x86_avx_vtestc_ps_256:
20424 case Intrinsic::x86_avx_vtestc_pd_256:
20425 IsTestPacked = true;
20426 LLVM_FALLTHROUGH;
20427 case Intrinsic::x86_sse41_ptestc:
20428 case Intrinsic::x86_avx_ptestc_256:
20429 // CF = 1
20430 X86CC = X86::COND_B;
20431 break;
20432 case Intrinsic::x86_avx_vtestnzc_ps:
20433 case Intrinsic::x86_avx_vtestnzc_pd:
20434 case Intrinsic::x86_avx_vtestnzc_ps_256:
20435 case Intrinsic::x86_avx_vtestnzc_pd_256:
20436 IsTestPacked = true;
20437 LLVM_FALLTHROUGH;
20438 case Intrinsic::x86_sse41_ptestnzc:
20439 case Intrinsic::x86_avx_ptestnzc_256:
20440 // ZF and CF = 0
20441 X86CC = X86::COND_A;
20442 break;
20443 }
20444
20445 SDValue LHS = Op.getOperand(1);
20446 SDValue RHS = Op.getOperand(2);
20447 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
20448 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
20449 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
20450 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20451 }
20452 case Intrinsic::x86_avx512_kortestz_w:
20453 case Intrinsic::x86_avx512_kortestc_w: {
20454 X86::CondCode X86CC =
20455 (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
20456 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20457 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20458 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
20459 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
20460 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20461 }
20462
20463 case Intrinsic::x86_avx512_knot_w: {
20464 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20465 SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
20466 SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
20467 return DAG.getBitcast(MVT::i16, Res);
20468 }
20469
20470 case Intrinsic::x86_avx512_kandn_w: {
20471 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20472 // Invert LHS for the not.
20473 LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
20474 DAG.getConstant(1, dl, MVT::v16i1));
20475 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20476 SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
20477 return DAG.getBitcast(MVT::i16, Res);
20478 }
20479
20480 case Intrinsic::x86_avx512_kxnor_w: {
20481 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20482 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20483 SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
20484 // Invert result for the not.
20485 Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
20486 DAG.getConstant(1, dl, MVT::v16i1));
20487 return DAG.getBitcast(MVT::i16, Res);
20488 }
20489
20490 case Intrinsic::x86_sse42_pcmpistria128:
20491 case Intrinsic::x86_sse42_pcmpestria128:
20492 case Intrinsic::x86_sse42_pcmpistric128:
20493 case Intrinsic::x86_sse42_pcmpestric128:
20494 case Intrinsic::x86_sse42_pcmpistrio128:
20495 case Intrinsic::x86_sse42_pcmpestrio128:
20496 case Intrinsic::x86_sse42_pcmpistris128:
20497 case Intrinsic::x86_sse42_pcmpestris128:
20498 case Intrinsic::x86_sse42_pcmpistriz128:
20499 case Intrinsic::x86_sse42_pcmpestriz128: {
20500 unsigned Opcode;
20501 X86::CondCode X86CC;
20502 switch (IntNo) {
20503 default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 20503)
; // Can't reach here.
20504 case Intrinsic::x86_sse42_pcmpistria128:
20505 Opcode = X86ISD::PCMPISTRI;
20506 X86CC = X86::COND_A;
20507 break;
20508 case Intrinsic::x86_sse42_pcmpestria128:
20509 Opcode = X86ISD::PCMPESTRI;
20510 X86CC = X86::COND_A;
20511 break;
20512 case Intrinsic::x86_sse42_pcmpistric128:
20513 Opcode = X86ISD::PCMPISTRI;
20514 X86CC = X86::COND_B;
20515 break;
20516 case Intrinsic::x86_sse42_pcmpestric128:
20517 Opcode = X86ISD::PCMPESTRI;
20518 X86CC = X86::COND_B;
20519 break;
20520 case Intrinsic::x86_sse42_pcmpistrio128:
20521 Opcode = X86ISD::PCMPISTRI;
20522 X86CC = X86::COND_O;
20523 break;
20524 case Intrinsic::x86_sse42_pcmpestrio128:
20525 Opcode = X86ISD::PCMPESTRI;
20526 X86CC = X86::COND_O;
20527 break;
20528 case Intrinsic::x86_sse42_pcmpistris128:
20529 Opcode = X86ISD::PCMPISTRI;
20530 X86CC = X86::COND_S;
20531 break;
20532 case Intrinsic::x86_sse42_pcmpestris128:
20533 Opcode = X86ISD::PCMPESTRI;
20534 X86CC = X86::COND_S;
20535 break;
20536 case Intrinsic::x86_sse42_pcmpistriz128:
20537 Opcode = X86ISD::PCMPISTRI;
20538 X86CC = X86::COND_E;
20539 break;
20540 case Intrinsic::x86_sse42_pcmpestriz128:
20541 Opcode = X86ISD::PCMPESTRI;
20542 X86CC = X86::COND_E;
20543 break;
20544 }
20545 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20546 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20547 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
20548 SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
20549 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20550 }
20551
20552 case Intrinsic::x86_sse42_pcmpistri128:
20553 case Intrinsic::x86_sse42_pcmpestri128: {
20554 unsigned Opcode;
20555 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
20556 Opcode = X86ISD::PCMPISTRI;
20557 else
20558 Opcode = X86ISD::PCMPESTRI;
20559
20560 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20561 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20562 return DAG.getNode(Opcode, dl, VTs, NewOps);
20563 }
20564
20565 case Intrinsic::eh_sjlj_lsda: {
20566 MachineFunction &MF = DAG.getMachineFunction();
20567 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20568 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
20569 auto &Context = MF.getMMI().getContext();
20570 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
20571 Twine(MF.getFunctionNumber()));
20572 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
20573 DAG.getMCSymbol(S, PtrVT));
20574 }
20575
20576 case Intrinsic::x86_seh_lsda: {
20577 // Compute the symbol for the LSDA. We know it'll get emitted later.
20578 MachineFunction &MF = DAG.getMachineFunction();
20579 SDValue Op1 = Op.getOperand(1);
20580 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
20581 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
20582 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
20583
20584 // Generate a simple absolute symbol reference. This intrinsic is only
20585 // supported on 32-bit Windows, which isn't PIC.
20586 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
20587 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
20588 }
20589
20590 case Intrinsic::x86_seh_recoverfp: {
20591 SDValue FnOp = Op.getOperand(1);
20592 SDValue IncomingFPOp = Op.getOperand(2);
20593 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
20594 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
20595 if (!Fn)
20596 report_fatal_error(
20597 "llvm.x86.seh.recoverfp must take a function as the first argument");
20598 return recoverFramePointer(DAG, Fn, IncomingFPOp);
20599 }
20600
20601 case Intrinsic::localaddress: {
20602 // Returns one of the stack, base, or frame pointer registers, depending on
20603 // which is used to reference local variables.
20604 MachineFunction &MF = DAG.getMachineFunction();
20605 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20606 unsigned Reg;
20607 if (RegInfo->hasBasePointer(MF))
20608 Reg = RegInfo->getBaseRegister();
20609 else // This function handles the SP or FP case.
20610 Reg = RegInfo->getPtrSizedFrameRegister(MF);
20611 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
20612 }
20613 }
20614}
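
For context, the COMI case handled earlier in this function is reachable from ordinary intrinsic-using source. A hedged example follows; the exact generated instruction sequence depends on the compiler and options.

#include <xmmintrin.h>

// Example user code expected to reach the COMI lowering above: _mm_comieq_ss
// maps to the llvm.x86.sse.comieq.ss intrinsic, which the SETEQ branch lowers
// to a COMISS followed by SETE/SETNP and an AND.
int lowFloatsEqual(__m128 a, __m128 b) {
  return _mm_comieq_ss(a, b); // 1 iff the low elements compare equal (ordered)
}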
20615
20616static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20617 SDValue Src, SDValue Mask, SDValue Base,
20618 SDValue Index, SDValue ScaleOp, SDValue Chain,
20619 const X86Subtarget &Subtarget) {
20620 SDLoc dl(Op);
20621 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20622 // Scale must be constant.
20623 if (!C)
20624 return SDValue();
20625 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20626 EVT MaskVT = Mask.getValueType();
20627 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20628 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20629 SDValue Segment = DAG.getRegister(0, MVT::i32);
20630 // If source is undef or we know it won't be used, use a zero vector
20631 // to break register dependency.
20632 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20633 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
20634 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20635 SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
20636 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20637 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20638 return DAG.getMergeValues(RetOps, dl);
20639}
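
The machine node built above implements a masked gather; its per-lane behavior is roughly the following scalar reference (illustrative only, assuming i32 elements; the zero-source substitution above merely breaks a false register dependency and does not change these semantics).

#include <cstddef>
#include <cstdint>
#include <cstring>

// Illustrative per-lane reference for a masked gather, not the actual
// lowering: active lanes load from Base + Index[i] * Scale, inactive lanes
// keep the pass-through source.
static void gatherRef(int32_t *Dst, const int32_t *Src, const uint8_t *Base,
                      const int32_t *Index, int Scale, const bool *Mask,
                      size_t N) {
  for (size_t i = 0; i < N; ++i) {
    if (Mask[i])
      std::memcpy(&Dst[i], Base + (int64_t)Index[i] * Scale, sizeof(int32_t));
    else
      Dst[i] = Src[i];
  }
}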
20640
20641static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20642 SDValue Src, SDValue Mask, SDValue Base,
20643 SDValue Index, SDValue ScaleOp, SDValue Chain,
20644 const X86Subtarget &Subtarget) {
20645 SDLoc dl(Op);
20646 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20647 // Scale must be constant.
20648 if (!C)
20649 return SDValue();
20650 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20651 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20652 Index.getSimpleValueType().getVectorNumElements());
20653
20654 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20655 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20656 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20657 SDValue Segment = DAG.getRegister(0, MVT::i32);
20658 // If source is undef or we know it won't be used, use a zero vector
20659 // to break register dependency.
20660 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20661 if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
20662 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20663 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
20664 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20665 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20666 return DAG.getMergeValues(RetOps, dl);
20667}
20668
20669static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20670 SDValue Src, SDValue Mask, SDValue Base,
20671 SDValue Index, SDValue ScaleOp, SDValue Chain,
20672 const X86Subtarget &Subtarget) {
20673 SDLoc dl(Op);
20674 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20675 // Scale must be constant.
20676 if (!C)
20677 return SDValue();
20678 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20679 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20680 SDValue Segment = DAG.getRegister(0, MVT::i32);
20681 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20682 Index.getSimpleValueType().getVectorNumElements());
20683
20684 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20685 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
20686 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
20687 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20688 return SDValue(Res, 1);
20689}
20690
20691static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20692 SDValue Mask, SDValue Base, SDValue Index,
20693 SDValue ScaleOp, SDValue Chain,
20694 const X86Subtarget &Subtarget) {
20695 SDLoc dl(Op);
20696 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20697 // Scale must be constant.
20698 if (!C)
20699 return SDValue();
20700 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20701 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20702 SDValue Segment = DAG.getRegister(0, MVT::i32);
20703 MVT MaskVT =
20704 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
20705 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20706 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
20707 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
20708 return SDValue(Res, 0);
20709}
20710
20711 /// Handles the lowering of builtin intrinsics that return the value
20712 /// of the extended control register.
20713static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
20714 SelectionDAG &DAG,
20715 const X86Subtarget &Subtarget,
20716 SmallVectorImpl<SDValue> &Results) {
20717 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20718 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20719 SDValue LO, HI;
20720
20721 // The ECX register is used to select the index of the XCR register to
20722 // return.
20723 SDValue Chain =
20724 DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
20725 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
20726 Chain = SDValue(N1, 0);
20727
20728 // Reads the content of XCR and returns it in registers EDX:EAX.
20729 if (Subtarget.is64Bit()) {
20730 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
20731 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20732 LO.getValue(2));
20733 } else {
20734 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
20735 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20736 LO.getValue(2));
20737 }
20738 Chain = HI.getValue(1);
20739
20740 if (Subtarget.is64Bit()) {
20741 // Merge the two 32-bit values into a 64-bit one.
20742 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20743 DAG.getConstant(32, DL, MVT::i8));
20744 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20745 Results.push_back(Chain);
20746 return;
20747 }
20748
20749 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20750 SDValue Ops[] = { LO, HI };
20751 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20752 Results.push_back(Pair);
20753 Results.push_back(Chain);
20754}
20755
20756/// Handles the lowering of builtin intrinsics that read performance monitor
20757/// counters (x86_rdpmc).
20758static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
20759 SelectionDAG &DAG,
20760 const X86Subtarget &Subtarget,
20761 SmallVectorImpl<SDValue> &Results) {
20762 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20763 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20764 SDValue LO, HI;
20765
20766 // The ECX register is used to select the index of the performance counter
20767 // to read.
20768 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
20769 N->getOperand(2));
20770 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
20771
20772 // Reads the content of a 64-bit performance counter and returns it in the
20773 // registers EDX:EAX.
20774 if (Subtarget.is64Bit()) {
20775 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20776 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20777 LO.getValue(2));
20778 } else {
20779 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20780 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20781 LO.getValue(2));
20782 }
20783 Chain = HI.getValue(1);
20784
20785 if (Subtarget.is64Bit()) {
20786 // The EAX register is loaded with the low-order 32 bits. The EDX register
20787 // is loaded with the supported high-order bits of the counter.
20788 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20789 DAG.getConstant(32, DL, MVT::i8));
20790 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20791 Results.push_back(Chain);
20792 return;
20793 }
20794
20795 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20796 SDValue Ops[] = { LO, HI };
20797 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20798 Results.push_back(Pair);
20799 Results.push_back(Chain);
20800}
20801
20802/// Handles the lowering of builtin intrinsics that read the time stamp counter
20803/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
20804/// READCYCLECOUNTER nodes.
20805static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
20806 SelectionDAG &DAG,
20807 const X86Subtarget &Subtarget,
20808 SmallVectorImpl<SDValue> &Results) {
20809 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20810 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
20811 SDValue LO, HI;
20812
20813 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
20814 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
20815 // and the EAX register is loaded with the low-order 32 bits.
20816 if (Subtarget.is64Bit()) {
20817 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20818 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20819 LO.getValue(2));
20820 } else {
20821 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20822 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20823 LO.getValue(2));
20824 }
20825 SDValue Chain = HI.getValue(1);
20826
20827 if (Opcode == X86ISD::RDTSCP_DAG) {
20828 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20829
20830 // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
20831 // the ECX register. Add 'ecx' explicitly to the chain.
20832 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
20833 HI.getValue(2));
20834 // Explicitly store the content of ECX at the location passed as input
20835 // to the 'rdtscp' intrinsic.
20836 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
20837 MachinePointerInfo());
20838 }
20839
20840 if (Subtarget.is64Bit()) {
20841 // The EDX register is loaded with the high-order 32 bits of the MSR, and
20842 // the EAX register is loaded with the low-order 32 bits.
20843 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20844 DAG.getConstant(32, DL, MVT::i8));
20845 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20846 Results.push_back(Chain);
20847 return;
20848 }
20849
20850 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20851 SDValue Ops[] = { LO, HI };
20852 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20853 Results.push_back(Pair);
20854 Results.push_back(Chain);
20855}
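
On 64-bit targets the helper above merges the two halves with a shift and an OR, and on 32-bit targets the BUILD_PAIR expresses the same value. A minimal sketch of that combine, with an illustrative helper name:

#include <cstdint>

// Illustrative only: how the 32-bit halves read from EAX (low) and EDX (high)
// form the 64-bit counter value, matching the SHL-by-32 / OR sequence above.
static uint64_t combineEdxEax(uint32_t LO, uint32_t HI) {
  return (static_cast<uint64_t>(HI) << 32) | LO;
}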
20856
20857static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
20858 SelectionDAG &DAG) {
20859 SmallVector<SDValue, 2> Results;
20860 SDLoc DL(Op);
20861 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
20862 Results);
20863 return DAG.getMergeValues(Results, DL);
20864}
20865
20866static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
20867 MachineFunction &MF = DAG.getMachineFunction();
20868 SDValue Chain = Op.getOperand(0);
20869 SDValue RegNode = Op.getOperand(2);
20870 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20871 if (!EHInfo)
20872 report_fatal_error("EH registrations only live in functions using WinEH");
20873
20874 // Cast the operand to an alloca, and remember the frame index.
20875 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
20876 if (!FINode)
20877 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
20878 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
20879
20880 // Return the chain operand without making any DAG nodes.
20881 return Chain;
20882}
20883
20884static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
20885 MachineFunction &MF = DAG.getMachineFunction();
20886 SDValue Chain = Op.getOperand(0);
20887 SDValue EHGuard = Op.getOperand(2);
20888 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20889 if (!EHInfo)
20890 report_fatal_error("EHGuard only live in functions using WinEH");
20891
20892 // Cast the operand to an alloca, and remember the frame index.
20893 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
20894 if (!FINode)
20895 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
20896 EHInfo->EHGuardFrameIndex = FINode->getIndex();
20897
20898 // Return the chain operand without making any DAG nodes.
20899 return Chain;
20900}
20901
20902/// Emit Truncating Store with signed or unsigned saturation.
20903static SDValue
20904EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
20905 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
20906 SelectionDAG &DAG) {
20907
20908 SDVTList VTs = DAG.getVTList(MVT::Other);
20909 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
20910 SDValue Ops[] = { Chain, Val, Ptr, Undef };
20911 return SignedSat ?
20912 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20913 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
20914}
20915
20916/// Emit Masked Truncating Store with signed or unsigned saturation.
20917static SDValue
20918EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
20919 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
20920 MachineMemOperand *MMO, SelectionDAG &DAG) {
20921
20922 SDVTList VTs = DAG.getVTList(MVT::Other);
20923 SDValue Ops[] = { Chain, Ptr, Mask, Val };
20924 return SignedSat ?
20925 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20926 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
20927}
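
Element-wise, the saturating truncating stores emitted by the helpers above clamp the wide value into the narrow type's range before storing it. A minimal sketch of the signed i32-to-i8 case (illustrative only; the real nodes operate on whole vectors and, in the masked form, skip inactive lanes):

#include <algorithm>
#include <cstdint>

// Illustrative element-wise semantics of a signed saturating truncating
// store (i32 -> i8): clamp to [-128, 127], then store the low byte.
static void truncSStoreElt(int8_t *Ptr, int32_t Val) {
  *Ptr = static_cast<int8_t>(std::min(127, std::max(-128, Val)));
}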
20928
20929static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
20930 SelectionDAG &DAG) {
20931 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
20932
20933 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
20934 if (!IntrData) {
20935 switch (IntNo) {
20936 case llvm::Intrinsic::x86_seh_ehregnode:
20937 return MarkEHRegistrationNode(Op, DAG);
20938 case llvm::Intrinsic::x86_seh_ehguard:
20939 return MarkEHGuard(Op, DAG);
20940 case llvm::Intrinsic::x86_flags_read_u32:
20941 case llvm::Intrinsic::x86_flags_read_u64:
20942 case llvm::Intrinsic::x86_flags_write_u32:
20943 case llvm::Intrinsic::x86_flags_write_u64: {
20944 // We need a frame pointer because this will get lowered to a PUSH/POP
20945 // sequence.
20946 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20947 MFI.setHasCopyImplyingStackAdjustment(true);
20948 // Don't do anything here, we will expand these intrinsics out later
20949 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
20950 return SDValue();
20951 }
20952 case Intrinsic::x86_lwpins32:
20953 case Intrinsic::x86_lwpins64: {
20954 SDLoc dl(Op);
20955 SDValue Chain = Op->getOperand(0);
20956 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
20957 SDValue LwpIns =
20958 DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
20959 Op->getOperand(3), Op->getOperand(4));
20960 SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
20961 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
20962 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
20963 LwpIns.getValue(1));
20964 }
20965 }
20966 return SDValue();
20967 }
20968
20969 SDLoc dl(Op);
20970 switch(IntrData->Type) {
20971 default: llvm_unreachable("Unknown Intrinsic Type")::llvm::llvm_unreachable_internal("Unknown Intrinsic Type", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 20971)
;
20972 case RDSEED:
20973 case RDRAND: {
20974 // Emit the node with the right value type.
20975 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
20976 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20977
20978 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
20979 // Otherwise return the value from Rand, which is always 0, casted to i32.
20980 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
20981 DAG.getConstant(1, dl, Op->getValueType(1)),
20982 DAG.getConstant(X86::COND_B, dl, MVT::i8),
20983 SDValue(Result.getNode(), 1) };
20984 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
20985
20986 // Return { result, isValid, chain }.
20987 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
20988 SDValue(Result.getNode(), 2));
20989 }
20990 case GATHER_AVX2: {
20991 SDValue Chain = Op.getOperand(0);
20992 SDValue Src = Op.getOperand(2);
20993 SDValue Base = Op.getOperand(3);
20994 SDValue Index = Op.getOperand(4);
20995 SDValue Mask = Op.getOperand(5);
20996 SDValue Scale = Op.getOperand(6);
20997 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20998 Scale, Chain, Subtarget);
20999 }
21000 case GATHER: {
21001 //gather(v1, mask, index, base, scale);
21002 SDValue Chain = Op.getOperand(0);
21003 SDValue Src = Op.getOperand(2);
21004 SDValue Base = Op.getOperand(3);
21005 SDValue Index = Op.getOperand(4);
21006 SDValue Mask = Op.getOperand(5);
21007 SDValue Scale = Op.getOperand(6);
21008 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
21009 Chain, Subtarget);
21010 }
21011 case SCATTER: {
21012 //scatter(base, mask, index, v1, scale);
21013 SDValue Chain = Op.getOperand(0);
21014 SDValue Base = Op.getOperand(2);
21015 SDValue Mask = Op.getOperand(3);
21016 SDValue Index = Op.getOperand(4);
21017 SDValue Src = Op.getOperand(5);
21018 SDValue Scale = Op.getOperand(6);
21019 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
21020 Scale, Chain, Subtarget);
21021 }
21022 case PREFETCH: {
21023 SDValue Hint = Op.getOperand(6);
21024 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
21025 assert((HintVal == 2 || HintVal == 3) &&
21026        "Wrong prefetch hint in intrinsic: should be 2 or 3");
21027 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
21028 SDValue Chain = Op.getOperand(0);
21029 SDValue Mask = Op.getOperand(2);
21030 SDValue Index = Op.getOperand(3);
21031 SDValue Base = Op.getOperand(4);
21032 SDValue Scale = Op.getOperand(5);
21033 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
21034 Subtarget);
21035 }
21036 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
21037 case RDTSC: {
21038 SmallVector<SDValue, 2> Results;
21039 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
21040 Results);
21041 return DAG.getMergeValues(Results, dl);
21042 }
21043 // Read Performance Monitoring Counters.
21044 case RDPMC: {
21045 SmallVector<SDValue, 2> Results;
21046 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
21047 return DAG.getMergeValues(Results, dl);
21048 }
21049 // Get Extended Control Register.
21050 case XGETBV: {
21051 SmallVector<SDValue, 2> Results;
21052 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
21053 return DAG.getMergeValues(Results, dl);
21054 }
21055 // XTEST intrinsics.
21056 case XTEST: {
21057 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
21058 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
21059
21060 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
21061 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
21062 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
21063 Ret, SDValue(InTrans.getNode(), 1));
21064 }
21065 // ADC/ADCX/SBB
21066 case ADX: {
21067 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
21068 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::i32);
21069 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
21070 DAG.getConstant(-1, dl, MVT::i8));
21071 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
21072 Op.getOperand(4), GenCF.getValue(1));
21073 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
21074 Op.getOperand(5), MachinePointerInfo());
21075 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
21076 SDValue Results[] = { SetCC, Store };
21077 return DAG.getMergeValues(Results, dl);
21078 }
21079 case COMPRESS_TO_MEM: {
21080 SDValue Mask = Op.getOperand(4);
21081 SDValue DataToCompress = Op.getOperand(3);
21082 SDValue Addr = Op.getOperand(2);
21083 SDValue Chain = Op.getOperand(0);
21084 MVT VT = DataToCompress.getSimpleValueType();
21085
21086 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
21087 assert(MemIntr && "Expected MemIntrinsicSDNode!");
21088
21089 if (isAllOnesConstant(Mask)) // return just a store
21090 return DAG.getStore(Chain, dl, DataToCompress, Addr,
21091 MemIntr->getMemOperand());
21092
21093 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
21094 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21095
21096 return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
21097 MemIntr->getMemOperand(),
21098 false /* truncating */, true /* compressing */);
21099 }
21100 case TRUNCATE_TO_MEM_VI8:
21101 case TRUNCATE_TO_MEM_VI16:
21102 case TRUNCATE_TO_MEM_VI32: {
21103 SDValue Mask = Op.getOperand(4);
21104 SDValue DataToTruncate = Op.getOperand(3);
21105 SDValue Addr = Op.getOperand(2);
21106 SDValue Chain = Op.getOperand(0);
21107
21108 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
21109 assert(MemIntr && "Expected MemIntrinsicSDNode!");
21110
21111 EVT MemVT = MemIntr->getMemoryVT();
21112
21113 uint16_t TruncationOp = IntrData->Opc0;
21114 switch (TruncationOp) {
21115 case X86ISD::VTRUNC: {
21116 if (isAllOnesConstant(Mask)) // return just a truncate store
21117 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
21118 MemIntr->getMemOperand());
21119
21120 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
21121 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21122
21123 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
21124 MemIntr->getMemOperand(), true /* truncating */);
21125 }
21126 case X86ISD::VTRUNCUS:
21127 case X86ISD::VTRUNCS: {
21128 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
21129 if (isAllOnesConstant(Mask))
21130 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
21131 MemIntr->getMemOperand(), DAG);
21132
21133 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
21134 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21135
21136 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
21137 VMask, MemVT, MemIntr->getMemOperand(), DAG);
21138 }
21139 default:
21140 llvm_unreachable("Unsupported truncstore intrinsic");
21141 }
21142 }
21143
21144 case EXPAND_FROM_MEM: {
21145 SDValue Mask = Op.getOperand(4);
21146 SDValue PassThru = Op.getOperand(3);
21147 SDValue Addr = Op.getOperand(2);
21148 SDValue Chain = Op.getOperand(0);
21149 MVT VT = Op.getSimpleValueType();
21150
21151 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
21152 assert(MemIntr && "Expected MemIntrinsicSDNode!");
21153
21154 if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
21155 return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
21156 if (X86::isZeroNode(Mask))
21157 return DAG.getUNDEF(VT);
21158
21159 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
21160 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21161 return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
21162 MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
21163 true /* expanding */);
21164 }
21165 }
21166}
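// Illustrative sketch (added; not part of the original source): the carry
// re-materialization used in the ADX case above. Adding -1 (all ones) to the
// incoming carry byte sets CF exactly when that byte is non-zero, so the
// following ADC/ADCX consumes the caller's carry. Helper name is hypothetical.
#include <cstdint>

static uint8_t addCarry32Model(uint8_t CarryIn, uint32_t A, uint32_t B,
                               uint32_t *Out) {
  uint64_t Sum = (uint64_t)A + B + (CarryIn ? 1 : 0); // ADC with restored CF
  *Out = (uint32_t)Sum;                               // stored via operand 5
  return (uint8_t)(Sum >> 32);                        // SETB of the result
}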
21167
21168SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
21169 SelectionDAG &DAG) const {
21170 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21171 MFI.setReturnAddressIsTaken(true);
21172
21173 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
21174 return SDValue();
21175
21176 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21177 SDLoc dl(Op);
21178 EVT PtrVT = getPointerTy(DAG.getDataLayout());
21179
21180 if (Depth > 0) {
21181 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
21182 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21183 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
21184 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
21185 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
21186 MachinePointerInfo());
21187 }
21188
21189 // Just load the return address.
21190 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
21191 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
21192 MachinePointerInfo());
21193}
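// Illustrative sketch (added; not part of the original source): the Depth > 0
// path above assumes the conventional frame layout in which the return
// address sits one slot above the frame address, so llvm.returnaddress(d) is
// simply a load from frameaddress(d) + SlotSize. Hypothetical helper; the
// model breaks when frame pointers are omitted.
#include <cstdint>

static uintptr_t returnAddressAtFrame(uintptr_t FrameAddr, unsigned SlotSize) {
  return *reinterpret_cast<const uintptr_t *>(FrameAddr + SlotSize);
}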
21194
21195SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
21196 SelectionDAG &DAG) const {
21197 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
21198 return getReturnAddressFrameIndex(DAG);
21199}
21200
21201SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
21202 MachineFunction &MF = DAG.getMachineFunction();
21203 MachineFrameInfo &MFI = MF.getFrameInfo();
21204 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
21205 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21206 EVT VT = Op.getValueType();
21207
21208 MFI.setFrameAddressIsTaken(true);
21209
21210 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
21211 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
21212 // is not possible to crawl up the stack without looking at the unwind codes
21213 // simultaneously.
21214 int FrameAddrIndex = FuncInfo->getFAIndex();
21215 if (!FrameAddrIndex) {
21216 // Set up a frame object for the return address.
21217 unsigned SlotSize = RegInfo->getSlotSize();
21218 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
21219 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
21220 FuncInfo->setFAIndex(FrameAddrIndex);
21221 }
21222 return DAG.getFrameIndex(FrameAddrIndex, VT);
21223 }
21224
21225 unsigned FrameReg =
21226 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
21227 SDLoc dl(Op); // FIXME probably not meaningful
21228 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21229 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
21230         (FrameReg == X86::EBP && VT == MVT::i32)) &&
21231        "Invalid Frame Register!");
21232 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
21233 while (Depth--)
21234 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
21235 MachinePointerInfo());
21236 return FrameAddr;
21237}
21238
21239// FIXME? Maybe this could be a TableGen attribute on some registers and
21240// this table could be generated automatically from RegInfo.
21241unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
21242 SelectionDAG &DAG) const {
21243 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21244 const MachineFunction &MF = DAG.getMachineFunction();
21245
21246 unsigned Reg = StringSwitch<unsigned>(RegName)
21247 .Case("esp", X86::ESP)
21248 .Case("rsp", X86::RSP)
21249 .Case("ebp", X86::EBP)
21250 .Case("rbp", X86::RBP)
21251 .Default(0);
21252
21253 if (Reg == X86::EBP || Reg == X86::RBP) {
21254 if (!TFI.hasFP(MF))
21255 report_fatal_error("register " + StringRef(RegName) +
21256 " is allocatable: function has no frame pointer");
21257#ifndef NDEBUG
21258 else {
21259 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21260 unsigned FrameReg =
21261 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
21262 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
21263        "Invalid Frame Register!");
21264 }
21265#endif
21266 }
21267
21268 if (Reg)
21269 return Reg;
21270
21271 report_fatal_error("Invalid register name global variable");
21272}
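// Illustrative usage sketch (added; not part of the original source): reading
// the stack pointer through a GNU-style global register variable, which Clang
// lowers to llvm.read_register and which getRegisterByName above resolves to
// X86::RSP. Assumes an x86-64 target and a toolchain that accepts this
// extension; the variable and function names are hypothetical.
register unsigned long IllustrativeCurrentSP asm("rsp");

static unsigned long readStackPointerSketch() {
  return IllustrativeCurrentSP; // "rsp" -> X86::RSP via the StringSwitch above
}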
21273
21274SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
21275 SelectionDAG &DAG) const {
21276 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21277 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
21278}
21279
21280unsigned X86TargetLowering::getExceptionPointerRegister(
21281 const Constant *PersonalityFn) const {
21282 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
21283 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
21284
21285 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
21286}
21287
21288unsigned X86TargetLowering::getExceptionSelectorRegister(
21289 const Constant *PersonalityFn) const {
21290 // Funclet personalities don't use selectors (the runtime does the selection).
21291 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
21292 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
21293}
21294
21295bool X86TargetLowering::needsFixedCatchObjects() const {
21296 return Subtarget.isTargetWin64();
21297}
21298
21299SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
21300 SDValue Chain = Op.getOperand(0);
21301 SDValue Offset = Op.getOperand(1);
21302 SDValue Handler = Op.getOperand(2);
21303 SDLoc dl (Op);
21304
21305 EVT PtrVT = getPointerTy(DAG.getDataLayout());
21306 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21307 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
21308 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
21309         (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
21310        "Invalid Frame Register!");
21311 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
21312 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
21313
21314 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
21315 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
21316 dl));
21317 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
21318 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
21319 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
21320
21321 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
21322 DAG.getRegister(StoreAddrReg, PtrVT));
21323}
21324
21325SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
21326 SelectionDAG &DAG) const {
21327 SDLoc DL(Op);
21328 // If the subtarget is not 64-bit, we may need the global base reg
21329 // after isel pseudo expansion, i.e., after the CGBR pass has run.
21330 // Therefore, ask for the GlobalBaseReg now, so that the pass
21331 // inserts the code for us in case we need it.
21332 // Otherwise, we would end up referencing a virtual register
21333 // that is never defined!
21334 if (!Subtarget.is64Bit()) {
21335 const X86InstrInfo *TII = Subtarget.getInstrInfo();
21336 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
21337 }
21338 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
21339 DAG.getVTList(MVT::i32, MVT::Other),
21340 Op.getOperand(0), Op.getOperand(1));
21341}
21342
21343SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
21344 SelectionDAG &DAG) const {
21345 SDLoc DL(Op);
21346 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
21347 Op.getOperand(0), Op.getOperand(1));
21348}
21349
21350SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
21351 SelectionDAG &DAG) const {
21352 SDLoc DL(Op);
21353 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
21354 Op.getOperand(0));
21355}
21356
21357static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
21358 return Op.getOperand(0);
21359}
21360
21361SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
21362 SelectionDAG &DAG) const {
21363 SDValue Root = Op.getOperand(0);
21364 SDValue Trmp = Op.getOperand(1); // trampoline
21365 SDValue FPtr = Op.getOperand(2); // nested function
21366 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
21367 SDLoc dl (Op);
21368
21369 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
21370 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
21371
21372 if (Subtarget.is64Bit()) {
21373 SDValue OutChains[6];
21374
21375 // Large code-model.
21376 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
21377 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
21378
21379 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
21380 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
21381
21382 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
21383
21384 // Load the pointer to the nested function into R11.
21385 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
21386 SDValue Addr = Trmp;
21387 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21388 Addr, MachinePointerInfo(TrmpAddr));
21389
21390 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21391 DAG.getConstant(2, dl, MVT::i64));
21392 OutChains[1] =
21393 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
21394 /* Alignment = */ 2);
21395
21396 // Load the 'nest' parameter value into R10.
21397 // R10 is specified in X86CallingConv.td
21398 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
21399 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21400 DAG.getConstant(10, dl, MVT::i64));
21401 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21402 Addr, MachinePointerInfo(TrmpAddr, 10));
21403
21404 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21405 DAG.getConstant(12, dl, MVT::i64));
21406 OutChains[3] =
21407 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
21408 /* Alignment = */ 2);
21409
21410 // Jump to the nested function.
21411 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
21412 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21413 DAG.getConstant(20, dl, MVT::i64));
21414 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21415 Addr, MachinePointerInfo(TrmpAddr, 20));
21416
21417 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
21418 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21419 DAG.getConstant(22, dl, MVT::i64));
21420 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
21421 Addr, MachinePointerInfo(TrmpAddr, 22));
21422
21423 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21424 } else {
21425 const Function *Func =
21426 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
21427 CallingConv::ID CC = Func->getCallingConv();
21428 unsigned NestReg;
21429
21430 switch (CC) {
21431 default:
21432 llvm_unreachable("Unsupported calling convention");
21433 case CallingConv::C:
21434 case CallingConv::X86_StdCall: {
21435 // Pass 'nest' parameter in ECX.
21436 // Must be kept in sync with X86CallingConv.td
21437 NestReg = X86::ECX;
21438
21439 // Check that ECX wasn't needed by an 'inreg' parameter.
21440 FunctionType *FTy = Func->getFunctionType();
21441 const AttributeList &Attrs = Func->getAttributes();
21442
21443 if (!Attrs.isEmpty() && !Func->isVarArg()) {
21444 unsigned InRegCount = 0;
21445 unsigned Idx = 1;
21446
21447 for (FunctionType::param_iterator I = FTy->param_begin(),
21448 E = FTy->param_end(); I != E; ++I, ++Idx)
21449 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
21450 auto &DL = DAG.getDataLayout();
21451 // FIXME: should only count parameters that are lowered to integers.
21452 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
21453 }
21454
21455 if (InRegCount > 2) {
21456 report_fatal_error("Nest register in use - reduce number of inreg"
21457 " parameters!");
21458 }
21459 }
21460 break;
21461 }
21462 case CallingConv::X86_FastCall:
21463 case CallingConv::X86_ThisCall:
21464 case CallingConv::Fast:
21465 // Pass 'nest' parameter in EAX.
21466 // Must be kept in sync with X86CallingConv.td
21467 NestReg = X86::EAX;
21468 break;
21469 }
21470
21471 SDValue OutChains[4];
21472 SDValue Addr, Disp;
21473
21474 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21475 DAG.getConstant(10, dl, MVT::i32));
21476 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
21477
21478 // This is storing the opcode for MOV32ri.
21479 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
21480 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
21481 OutChains[0] =
21482 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
21483 Trmp, MachinePointerInfo(TrmpAddr));
21484
21485 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21486 DAG.getConstant(1, dl, MVT::i32));
21487 OutChains[1] =
21488 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
21489 /* Alignment = */ 1);
21490
21491 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
21492 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21493 DAG.getConstant(5, dl, MVT::i32));
21494 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
21495 Addr, MachinePointerInfo(TrmpAddr, 5),
21496 /* Alignment = */ 1);
21497
21498 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21499 DAG.getConstant(6, dl, MVT::i32));
21500 OutChains[3] =
21501 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
21502 /* Alignment = */ 1);
21503
21504 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21505 }
21506}
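// Illustrative sketch (added; not part of the original source): the byte image
// that the six 64-bit-path stores above assemble, using the same opcodes and
// offsets. Assumes a little-endian host for the immediate copies; the helper
// name and the 24-byte buffer size are illustrative assumptions.
#include <cstdint>
#include <cstring>

static void buildX86_64TrampolineImage(uint8_t Buf[24], uint64_t FPtr,
                                       uint64_t Nest) {
  Buf[0] = 0x49; Buf[1] = 0xBB;                   // movabsq $FPtr, %r11
  std::memcpy(Buf + 2, &FPtr, 8);                 //   imm64 at offset 2
  Buf[10] = 0x49; Buf[11] = 0xBA;                 // movabsq $Nest, %r10
  std::memcpy(Buf + 12, &Nest, 8);                //   imm64 at offset 12
  Buf[20] = 0x49; Buf[21] = 0xFF; Buf[22] = 0xE3; // rex.WB jmpq *%r11
  Buf[23] = 0;                                    // unused pad byte
}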
21507
21508SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
21509 SelectionDAG &DAG) const {
21510 /*
21511 The rounding mode is in bits 11:10 of FPSR, and has the following
21512 settings:
21513 00 Round to nearest
21514 01 Round to -inf
21515 10 Round to +inf
21516 11 Round to 0
21517
21518 FLT_ROUNDS, on the other hand, expects the following:
21519 -1 Undefined
21520 0 Round to 0
21521 1 Round to nearest
21522 2 Round to +inf
21523 3 Round to -inf
21524
21525 To perform the conversion, we do:
21526 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
21527 */
21528
21529 MachineFunction &MF = DAG.getMachineFunction();
21530 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21531 unsigned StackAlignment = TFI.getStackAlignment();
21532 MVT VT = Op.getSimpleValueType();
21533 SDLoc DL(Op);
21534
21535 // Save FP Control Word to stack slot
21536 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
21537 SDValue StackSlot =
21538 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
21539
21540 MachineMemOperand *MMO =
21541 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
21542 MachineMemOperand::MOStore, 2, 2);
21543
21544 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
21545 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
21546 DAG.getVTList(MVT::Other),
21547 Ops, MVT::i16, MMO);
21548
21549 // Load FP Control Word from stack slot
21550 SDValue CWD =
21551 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
21552
21553 // Transform as necessary
21554 SDValue CWD1 =
21555 DAG.getNode(ISD::SRL, DL, MVT::i16,
21556 DAG.getNode(ISD::AND, DL, MVT::i16,
21557 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
21558 DAG.getConstant(11, DL, MVT::i8));
21559 SDValue CWD2 =
21560 DAG.getNode(ISD::SRL, DL, MVT::i16,
21561 DAG.getNode(ISD::AND, DL, MVT::i16,
21562 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
21563 DAG.getConstant(9, DL, MVT::i8));
21564
21565 SDValue RetVal =
21566 DAG.getNode(ISD::AND, DL, MVT::i16,
21567 DAG.getNode(ISD::ADD, DL, MVT::i16,
21568 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
21569 DAG.getConstant(1, DL, MVT::i16)),
21570 DAG.getConstant(3, DL, MVT::i16));
21571
21572 return DAG.getNode((VT.getSizeInBits() < 16 ?
21573 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
21574}
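// Illustrative sketch (added; not part of the original source): the conversion
// formula from the comment above, evaluated as a scalar. Hypothetical helper
// name.
static int fltRoundsFromControlWord(unsigned FPSR) {
  return (int)(((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3);
}
// Bits 11:10 = 00 (nearest) -> 1, 01 (-inf) -> 3, 10 (+inf) -> 2, 11 (zero) -> 0.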
21575
21576// Split a unary integer op into two half-sized ops.
21577static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
21578 MVT VT = Op.getSimpleValueType();
21579 unsigned NumElems = VT.getVectorNumElements();
21580 unsigned SizeInBits = VT.getSizeInBits();
21581
21582 // Extract the Lo/Hi vectors
21583 SDLoc dl(Op);
21584 SDValue Src = Op.getOperand(0);
21585 SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
21586 SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
21587
21588 MVT EltVT = VT.getVectorElementType();
21589 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
21590 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21591 DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
21592 DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
21593}
21594
21595// Decompose 256-bit ops into smaller 128-bit ops.
21596static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
21597 assert(Op.getSimpleValueType().is256BitVector() &&
21598        Op.getSimpleValueType().isInteger() &&
21599        "Only handle AVX 256-bit vector integer operation");
21600 return LowerVectorIntUnary(Op, DAG);
21601}
21602
21603// Decompose 512-bit ops into smaller 256-bit ops.
21604static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
21605 assert(Op.getSimpleValueType().is512BitVector() &&
21606        Op.getSimpleValueType().isInteger() &&
21607        "Only handle AVX 512-bit vector integer operation");
21608 return LowerVectorIntUnary(Op, DAG);
21609}
21610
21611/// \brief Lower a vector CTLZ using the natively supported vector CTLZ instruction.
21612//
21613// i8/i16 vectors are implemented using the dword LZCNT vector instruction
21614// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
21615// split the vector, perform the operation on its Lo and Hi parts, and
21616// concatenate the results.
21617static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
21618 assert(Op.getOpcode() == ISD::CTLZ);
21619 SDLoc dl(Op);
21620 MVT VT = Op.getSimpleValueType();
21621 MVT EltVT = VT.getVectorElementType();
21622 unsigned NumElems = VT.getVectorNumElements();
21623
21624 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
21625        "Unsupported element type");
21626
21627 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
21628 if (16 < NumElems)
21629 return LowerVectorIntUnary(Op, DAG);
21630
21631 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
21632 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
21633        "Unsupported value type for operation");
21634
21635 // Use the natively supported vector instruction vplzcntd.
21636 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
21637 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
21638 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
21639 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
21640
21641 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
21642}
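// Illustrative sketch (added; not part of the original source): the
// per-element identity used above -- widen to i32, take the native 32-bit
// leading-zero count, then subtract the bits introduced by the zero
// extension. Hypothetical helper name; it uses the GCC/Clang __builtin_clz
// builtin, which is undefined for 0, hence the explicit guard (vplzcntd
// itself returns 32 for a zero element).
#include <cstdint>

static uint8_t ctlz8ViaLzcnt32(uint8_t X) {
  unsigned Wide = X;                                // zext32(x)
  unsigned Lz32 = Wide ? (unsigned)__builtin_clz(Wide) : 32u;
  return (uint8_t)(Lz32 - (32 - 8));                // sub(trunc(lzcnt), 24)
}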
21643
21644// Lower CTLZ using a PSHUFB lookup table implementation.
21645static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
21646 const X86Subtarget &Subtarget,
21647 SelectionDAG &DAG) {
21648 MVT VT = Op.getSimpleValueType();
21649 int NumElts = VT.getVectorNumElements();
21650 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
21651 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
21652
21653 // Per-nibble leading zero PSHUFB lookup table.
21654 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
21655 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
21656 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
21657 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
21658
21659 SmallVector<SDValue, 64> LUTVec;
21660 for (int i = 0; i < NumBytes; ++i)
21661 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
21662 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
21663
21664 // Begin by bitcasting the input to a byte vector, then split those bytes
21665 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
21666 // If the hi input nibble is zero then we add both results together, otherwise
21667 // we just take the hi result (by masking the lo result to zero before the
21668 // add).
21669 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
21670 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
21671
21672 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
21673 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
21674 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
21675 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
21676 SDValue HiZ;
21677 if (CurrVT.is512BitVector()) {
21678 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
21679 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
21680 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
21681 } else {
21682 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
21683 }
21684
21685 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
21686 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
21687 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
21688 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
21689
21690 // Merge the result back from vXi8 to VT, working on the lo/hi halves
21691 // of the current vector width in the same way we did for the nibbles.
21692 // If the upper half of the input element is zero then add the halves'
21693 // leading zero counts together, otherwise just use the upper half's.
21694 // Double the width of the result until we are at target width.
21695 while (CurrVT != VT) {
21696 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
21697 int CurrNumElts = CurrVT.getVectorNumElements();
21698 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
21699 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
21700 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
21701
21702 // Check if the upper half of the input element is zero.
21703 if (CurrVT.is512BitVector()) {
21704 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
21705 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
21706 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
21707 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
21708 } else {
21709 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
21710 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
21711 }
21712 HiZ = DAG.getBitcast(NextVT, HiZ);
21713
21714 // Move the upper/lower halves to the lower bits as we'll be extending to
21715 // NextVT. Mask the lower result to zero if HiZ is true and add the results
21716 // together.
21717 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
21718 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
21719 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
21720 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
21721 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
21722 CurrVT = NextVT;
21723 }
21724
21725 return Res;
21726}
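// Illustrative sketch (added; not part of the original source): the per-byte
// logic of the PSHUFB lookup above -- take the leading-zero count of each
// nibble from the table and only add the low-nibble count when the high
// nibble is zero. Hypothetical helper name.
#include <cstdint>

static uint8_t ctlz8ViaNibbleLUT(uint8_t X) {
  static const uint8_t LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                  0, 0, 0, 0, 0, 0, 0, 0};
  uint8_t Hi = LUT[X >> 4];
  uint8_t Lo = LUT[X & 0xF];
  return (X >> 4) == 0 ? (uint8_t)(Hi + Lo) : Hi;   // Hi is 4 when X < 16
}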
21727
21728static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
21729 const X86Subtarget &Subtarget,
21730 SelectionDAG &DAG) {
21731 MVT VT = Op.getSimpleValueType();
21732
21733 if (Subtarget.hasCDI())
21734 return LowerVectorCTLZ_AVX512CDI(Op, DAG);
21735
21736 // Decompose 256-bit ops into smaller 128-bit ops.
21737 if (VT.is256BitVector() && !Subtarget.hasInt256())
21738 return Lower256IntUnary(Op, DAG);
21739
21740 // Decompose 512-bit ops into smaller 256-bit ops.
21741 if (VT.is512BitVector() && !Subtarget.hasBWI())
21742 return Lower512IntUnary(Op, DAG);
21743
21744 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
21745 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
21746}
21747
21748static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
21749 SelectionDAG &DAG) {
21750 MVT VT = Op.getSimpleValueType();
21751 MVT OpVT = VT;
21752 unsigned NumBits = VT.getSizeInBits();
21753 SDLoc dl(Op);
21754 unsigned Opc = Op.getOpcode();
21755
21756 if (VT.isVector())
21757 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
21758
21759 Op = Op.getOperand(0);
21760 if (VT == MVT::i8) {
21761 // Zero extend to i32 since there is no i8 bsr.
21762 OpVT = MVT::i32;
21763 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
21764 }
21765
21766 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
21767 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
21768 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
21769
21770 if (Opc == ISD::CTLZ) {
21771 // If src is zero (i.e. bsr sets ZF), returns NumBits.
21772 SDValue Ops[] = {
21773 Op,
21774 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
21775 DAG.getConstant(X86::COND_E, dl, MVT::i8),
21776 Op.getValue(1)
21777 };
21778 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
21779 }
21780
21781 // Finally xor with NumBits-1.
21782 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
21783 DAG.getConstant(NumBits - 1, dl, OpVT));
21784
21785 if (VT == MVT::i8)
21786 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
21787 return Op;
21788}
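// Illustrative sketch (added; not part of the original source): the scalar
// trick used in LowerCTLZ above. BSR returns the index of the highest set
// bit, so XOR-ing it with NumBits-1 gives the leading-zero count; for a zero
// input the CMOV first substitutes 2*NumBits-1, which the same XOR turns into
// NumBits. Hypothetical helper name; BSR is modelled with a plain loop.
#include <cstdint>

static uint32_t ctlz32ViaBSR(uint32_t X) {
  const unsigned NumBits = 32;
  unsigned BSR = 0;
  for (unsigned I = 0; I < NumBits; ++I)
    if (X & (1u << I))
      BSR = I;                                           // highest set bit index
  unsigned Sel = (X == 0) ? (NumBits + NumBits - 1) : BSR; // the CMOV above
  return Sel ^ (NumBits - 1);                            // the final XOR
}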
21789
21790static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
21791 MVT VT = Op.getSimpleValueType();
21792 unsigned NumBits = VT.getScalarSizeInBits();
21793 SDLoc dl(Op);
21794
21795 if (VT.isVector()) {
21796 SDValue N0 = Op.getOperand(0);
21797 SDValue Zero = DAG.getConstant(0, dl, VT);
21798
21799 // lsb(x) = (x & -x)
21800 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
21801 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
21802
21803 // cttz_undef(x) = (width - 1) - ctlz(lsb)
21804 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
21805 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
21806 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
21807 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
21808 }
21809
21810 // cttz(x) = ctpop(lsb - 1)
21811 SDValue One = DAG.getConstant(1, dl, VT);
21812 return DAG.getNode(ISD::CTPOP, dl, VT,
21813 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
21814 }
21815
21816 assert(Op.getOpcode() == ISD::CTTZ &&
21817        "Only scalar CTTZ requires custom lowering");
21818
21819 // Issue a bsf (scan bits forward) which also sets EFLAGS.
21820 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
21821 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
21822
21823 // If src is zero (i.e. bsf sets ZF), returns NumBits.
21824 SDValue Ops[] = {
21825 Op,
21826 DAG.getConstant(NumBits, dl, VT),
21827 DAG.getConstant(X86::COND_E, dl, MVT::i8),
21828 Op.getValue(1)
21829 };
21830 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
21831}
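// Illustrative sketch (added; not part of the original source): the identities
// used in the vector path of LowerCTTZ above. Isolating the lowest set bit
// with (x & -x) lets cttz be computed as ctpop(lsb - 1); the zero-undef form
// uses (width - 1) - ctlz(lsb) instead. Hypothetical helper name; relies on
// the GCC/Clang __builtin_popcount builtin.
#include <cstdint>

static uint32_t cttz32ViaCtpop(uint32_t X) {
  uint32_t LSB = X & (0u - X);                   // lowest set bit, 0 if X == 0
  return (uint32_t)__builtin_popcount(LSB - 1u); // ctpop(lsb - 1); 32 for X == 0
}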
21832
21833/// Break a 256-bit integer operation into two new 128-bit ones and then
21834/// concatenate the result back.
21835static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
21836 MVT VT = Op.getSimpleValueType();
21837
21838 assert(VT.is256BitVector() && VT.isInteger() &&
21839        "Unsupported value type for operation");
21840
21841 unsigned NumElems = VT.getVectorNumElements();
21842 SDLoc dl(Op);
21843
21844 // Extract the LHS vectors
21845 SDValue LHS = Op.getOperand(0);
21846 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
21847 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
21848
21849 // Extract the RHS vectors
21850 SDValue RHS = Op.getOperand(1);
21851 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
21852 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
21853
21854 MVT EltVT = VT.getVectorElementType();
21855 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21856
21857 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21858 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21859 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21860}
21861
21862/// Break a 512-bit integer operation into two new 256-bit ones and then
21863/// concatenate the result back.
21864static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
21865 MVT VT = Op.getSimpleValueType();
21866
21867 assert(VT.is512BitVector() && VT.isInteger() &&
21868        "Unsupported value type for operation");
21869
21870 unsigned NumElems = VT.getVectorNumElements();
21871 SDLoc dl(Op);
21872
21873 // Extract the LHS vectors
21874 SDValue LHS = Op.getOperand(0);
21875 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
21876 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
21877
21878 // Extract the RHS vectors
21879 SDValue RHS = Op.getOperand(1);
21880 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
21881 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
21882
21883 MVT EltVT = VT.getVectorElementType();
21884 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21885
21886 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21887 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21888 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21889}
21890
21891static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
21892 MVT VT = Op.getSimpleValueType();
21893 if (VT.getScalarType() == MVT::i1)
21894 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
21895 Op.getOperand(0), Op.getOperand(1));
21896 assert(Op.getSimpleValueType().is256BitVector() &&
21897        Op.getSimpleValueType().isInteger() &&
21898        "Only handle AVX 256-bit vector integer operation");
21899 return Lower256IntArith(Op, DAG);
21900}
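// Illustrative note (added; not part of the original source): over i1, both
// addition and subtraction are arithmetic modulo 2, which is exactly XOR, so
// the i1 case in LowerADD_SUB above needs no real adder. Hypothetical helper
// name.
static bool addOrSubI1Model(bool A, bool B) { return A != B; /* a xor b */ }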
21901
21902static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
21903 MVT VT = Op.getSimpleValueType();
21904 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
21905 // Since X86 does not have CMOV for 8-bit integer, we don't convert
21906 // 8-bit integer abs to NEG and CMOV.
21907 SDLoc DL(Op);
21908 SDValue N0 = Op.getOperand(0);
21909 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
21910 DAG.getConstant(0, DL, VT), N0);
21911 SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
21912 SDValue(Neg.getNode(), 1)};
21913 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
21914 }
21915
21916 assert(Op.getSimpleValueType().is256BitVector() &&
21917        Op.getSimpleValueType().isInteger() &&
21918        "Only handle AVX 256-bit vector integer operation");
21919 return Lower256IntUnary(Op, DAG);
21920}
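// Illustrative sketch (added; not part of the original source): the scalar
// pattern emitted by LowerABS above -- compute 0 - x, then use the flags of
// that subtraction to select either the negation (when it is non-negative) or
// the original value. Hypothetical helper name; the unsigned subtraction
// mirrors the hardware's wrapping behaviour.
#include <cstdint>

static int32_t absViaNegCmov(int32_t X) {
  int32_t Neg = (int32_t)(0u - (uint32_t)X); // X86ISD::SUB above, sets EFLAGS
  return (Neg >= 0) ? Neg : X;               // CMOV on COND_GE of that sub
}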
21921
21922static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
21923 assert(Op.getSimpleValueType().is256BitVector() &&
21924        Op.getSimpleValueType().isInteger() &&
21925        "Only handle AVX 256-bit vector integer operation");
21926 return Lower256IntArith(Op, DAG);
21927}
21928
21929static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
21930 SelectionDAG &DAG) {
21931 SDLoc dl(Op);
21932 MVT VT = Op.getSimpleValueType();
21933
21934 if (VT.getScalarType() == MVT::i1)
21935 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
21936
21937 // Decompose 256-bit ops into smaller 128-bit ops.
21938 if (VT.is256BitVector() && !Subtarget.hasInt256())
21939 return Lower256IntArith(Op, DAG);
21940
21941 SDValue A = Op.getOperand(0);
21942 SDValue B = Op.getOperand(1);
21943
21944 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
21945 // vector pairs, multiply and truncate.
21946 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
21947 if (Subtarget.hasInt256()) {
21948 // For 512-bit vectors, split into 256-bit vectors to allow the
21949 // sign-extension to occur.
21950 if (VT == MVT::v64i8)
21951 return Lower512IntArith(Op, DAG);
21952
21953 // For 256-bit vectors, split into 128-bit vectors to allow the
21954 // sign-extension to occur. We don't need this on AVX512BW as we can
21955 // safely sign-extend to v32i16.
21956 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
21957 return Lower256IntArith(Op, DAG);
21958
21959 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
21960 return DAG.getNode(
21961 ISD::TRUNCATE, dl, VT,
21962 DAG.getNode(ISD::MUL, dl, ExVT,
21963 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
21964 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
21965 }
21966
21967 assert(VT == MVT::v16i8 &&
21968        "Pre-AVX2 support only supports v16i8 multiplication");
21969 MVT ExVT = MVT::v8i16;
21970
21971 // Extract the lo parts and sign extend to i16
21972 SDValue ALo, BLo;
21973 if (Subtarget.hasSSE41()) {
21974 ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
21975 BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
21976 } else {
21977 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21978 -1, 4, -1, 5, -1, 6, -1, 7};
21979 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21980 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21981 ALo = DAG.getBitcast(ExVT, ALo);
21982 BLo = DAG.getBitcast(ExVT, BLo);
21983 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21984 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21985 }
21986
21987 // Extract the hi parts and sign extend to i16
21988 SDValue AHi, BHi;
21989 if (Subtarget.hasSSE41()) {
21990 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21991 -1, -1, -1, -1, -1, -1, -1, -1};
21992 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21993 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21994 AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
21995 BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
21996 } else {
21997 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21998 -1, 12, -1, 13, -1, 14, -1, 15};
21999 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22000 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22001 AHi = DAG.getBitcast(ExVT, AHi);
22002 BHi = DAG.getBitcast(ExVT, BHi);
22003 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
22004 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
22005 }
22006
22007    // Multiply, mask the lower 8 bits of the lo/hi results and pack
22008 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
22009 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
22010 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
22011 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
22012 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22013 }
22014
22015 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
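  // (In short: PMULUDQ forms full 64-bit products of the even i32 lanes, a
  // shuffle moves the odd lanes into even positions for a second PMULUDQ, and
  // the low 32 bits of each product are then interleaved back into lane order.)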
22016 if (VT == MVT::v4i32) {
22017    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
22018           "Should not custom lower when pmuldq is available!");
22019
22020 // Extract the odd parts.
22021 static const int UnpackMask[] = { 1, -1, 3, -1 };
22022 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
22023 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
22024
22025 // Multiply the even parts.
22026 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
22027 // Now multiply odd parts.
22028 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
22029
22030 Evens = DAG.getBitcast(VT, Evens);
22031 Odds = DAG.getBitcast(VT, Odds);
22032
22033 // Merge the two vectors back together with a shuffle. This expands into 2
22034 // shuffles.
22035 static const int ShufMask[] = { 0, 4, 2, 6 };
22036 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
22037 }
22038
22039  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
22040         "Only know how to lower V2I64/V4I64/V8I64 multiply");
22041
22042 // 32-bit vector types used for MULDQ/MULUDQ.
22043 MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
22044
22045 // MULDQ returns the 64-bit result of the signed multiplication of the lower
22046 // 32-bits. We can lower with this if the sign bits stretch that far.
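  // (ComputeNumSignBits(X) > 32 means every 64-bit lane of X is the
  // sign-extension of its low 32 bits, so the signed 64-bit product of the low
  // halves that PMULDQ computes already equals the full i64 product.)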
22047 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
22048 DAG.ComputeNumSignBits(B) > 32) {
22049 return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
22050 DAG.getBitcast(MulVT, B));
22051 }
22052
22053 // Ahi = psrlqi(a, 32);
22054 // Bhi = psrlqi(b, 32);
22055 //
22056 // AloBlo = pmuludq(a, b);
22057 // AloBhi = pmuludq(a, Bhi);
22058 // AhiBlo = pmuludq(Ahi, b);
22059 //
22060 // Hi = psllqi(AloBhi + AhiBlo, 32);
22061 // return AloBlo + Hi;
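  //
  // This follows from splitting each 64-bit lane as a = 2^32*Ahi + Alo and
  // b = 2^32*Bhi + Blo:
  //   a*b = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo) + 2^64*Ahi*Bhi
  // The last term vanishes modulo 2^64, so three PMULUDQs are enough.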
22062 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
22063 bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
22064 bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
22065
22066 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
22067 bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
22068 bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
22069
22070 // Bit cast to 32-bit vectors for MULUDQ.
22071 SDValue Alo = DAG.getBitcast(MulVT, A);
22072 SDValue Blo = DAG.getBitcast(MulVT, B);
22073
22074 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22075
22076 // Only multiply lo/hi halves that aren't known to be zero.
22077 SDValue AloBlo = Zero;
22078 if (!ALoIsZero && !BLoIsZero)
22079 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
22080
22081 SDValue AloBhi = Zero;
22082 if (!ALoIsZero && !BHiIsZero) {
22083 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
22084 Bhi = DAG.getBitcast(MulVT, Bhi);
22085 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
22086 }
22087
22088 SDValue AhiBlo = Zero;
22089 if (!AHiIsZero && !BLoIsZero) {
22090 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
22091 Ahi = DAG.getBitcast(MulVT, Ahi);
22092 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
22093 }
22094
22095 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
22096 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
22097
22098 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
22099}
22100
22101static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
22102 SelectionDAG &DAG) {
22103 SDLoc dl(Op);
22104 MVT VT = Op.getSimpleValueType();
22105
22106 // Decompose 256-bit ops into smaller 128-bit ops.
22107 if (VT.is256BitVector() && !Subtarget.hasInt256())
22108 return Lower256IntArith(Op, DAG);
22109
22110 // Only i8 vectors should need custom lowering after this.
22111  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
22112          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
22113         "Unsupported vector type");
22114
22115 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
22116 // logical shift down the upper half and pack back to i8.
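  // (The high half of an i8 product is exactly bits [15:8] of the widened i16
  // product, so extend, multiply, shift right by 8 and pack/truncate recovers
  // MULHU/MULHS precisely.)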
22117 SDValue A = Op.getOperand(0);
22118 SDValue B = Op.getOperand(1);
22119
22120 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
22121 // and then ashr/lshr the upper bits down to the lower bits before multiply.
22122 unsigned Opcode = Op.getOpcode();
22123 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
22124 unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);
22125
22126 // For 512-bit vectors, split into 256-bit vectors to allow the
22127 // sign-extension to occur.
22128 if (VT == MVT::v64i8)
22129 return Lower512IntArith(Op, DAG);
22130
22131 // AVX2 implementations - extend xmm subvectors to ymm.
22132 if (Subtarget.hasInt256()) {
22133 unsigned NumElems = VT.getVectorNumElements();
22134 SDValue Lo = DAG.getIntPtrConstant(0, dl);
22135 SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);
22136
22137 if (VT == MVT::v32i8) {
22138 if (Subtarget.hasBWI()) {
22139 SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
22140 SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
22141 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
22142 Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
22143 DAG.getConstant(8, dl, MVT::v32i16));
22144 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
22145 }
22146 SDValue ALo = extract128BitVector(A, 0, DAG, dl);
22147 SDValue BLo = extract128BitVector(B, 0, DAG, dl);
22148 SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
22149 SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
22150 ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
22151 BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
22152 AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
22153 BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
22154 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
22155 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
22156 DAG.getConstant(8, dl, MVT::v16i16));
22157 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
22158 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
22159 DAG.getConstant(8, dl, MVT::v16i16));
22160 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
22161 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
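      // (The ymm PACKUS builds its low xmm half from lane 0 of both inputs and
      // its high xmm half from lane 1, so without this permute the packed bytes
      // would come out in per-128-bit-lane order rather than element order.)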
22162 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
22163 16, 17, 18, 19, 20, 21, 22, 23};
22164 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22165 24, 25, 26, 27, 28, 29, 30, 31};
22166 return DAG.getNode(X86ISD::PACKUS, dl, VT,
22167 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
22168 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
22169 }
22170
22171 SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
22172 SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
22173 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
22174 Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
22175 DAG.getConstant(8, dl, MVT::v16i16));
22176    // If we have BWI we can use the truncate instruction.
22177 if (Subtarget.hasBWI())
22178 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
22179 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
22180 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
22181 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
22182 }
22183
22184  assert(VT == MVT::v16i8 &&
22185         "Pre-AVX2 support only supports v16i8 multiplication");
22186 MVT ExVT = MVT::v8i16;
22187 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
22188
22189 // Extract the lo parts and zero/sign extend to i16.
22190 SDValue ALo, BLo;
22191 if (Subtarget.hasSSE41()) {
22192 ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
22193 BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
22194 } else {
22195 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
22196 -1, 4, -1, 5, -1, 6, -1, 7};
22197 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22198 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22199 ALo = DAG.getBitcast(ExVT, ALo);
22200 BLo = DAG.getBitcast(ExVT, BLo);
22201 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
22202 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
22203 }
22204
22205 // Extract the hi parts and zero/sign extend to i16.
22206 SDValue AHi, BHi;
22207 if (Subtarget.hasSSE41()) {
22208 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22209 -1, -1, -1, -1, -1, -1, -1, -1};
22210 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22211 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22212 AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
22213 BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
22214 } else {
22215 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
22216 -1, 12, -1, 13, -1, 14, -1, 15};
22217 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22218 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22219 AHi = DAG.getBitcast(ExVT, AHi);
22220 BHi = DAG.getBitcast(ExVT, BHi);
22221 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
22222 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
22223 }
22224
22225  // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
22226 // pack back to v16i8.
22227 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
22228 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
22229 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
22230 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
22231 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22232}
22233
22234SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
22235  assert(Subtarget.isTargetWin64() && "Unexpected target");
22236 EVT VT = Op.getValueType();
22237  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
22238         "Unexpected return type for lowering");
22239
22240 RTLIB::Libcall LC;
22241 bool isSigned;
22242 switch (Op->getOpcode()) {
22243  default: llvm_unreachable("Unexpected request for libcall!");
22244 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
22245 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
22246 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
22247 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
22248 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
22249 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
22250 }
22251
22252 SDLoc dl(Op);
22253 SDValue InChain = DAG.getEntryNode();
22254
22255 TargetLowering::ArgListTy Args;
22256 TargetLowering::ArgListEntry Entry;
22257 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
22258 EVT ArgVT = Op->getOperand(i).getValueType();
22259    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
22260           "Unexpected argument type for lowering");
22261 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
22262 Entry.Node = StackPtr;
22263 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
22264 MachinePointerInfo(), /* Alignment = */ 16);
22265 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
22266 Entry.Ty = PointerType::get(ArgTy,0);
22267 Entry.IsSExt = false;
22268 Entry.IsZExt = false;
22269 Args.push_back(Entry);
22270 }
22271
22272 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
22273 getPointerTy(DAG.getDataLayout()));
22274
22275 TargetLowering::CallLoweringInfo CLI(DAG);
22276 CLI.setDebugLoc(dl)
22277 .setChain(InChain)
22278 .setLibCallee(
22279 getLibcallCallingConv(LC),
22280 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
22281 std::move(Args))
22282 .setInRegister()
22283 .setSExtResult(isSigned)
22284 .setZExtResult(!isSigned);
22285
22286 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
22287 return DAG.getBitcast(VT, CallInfo.first);
22288}
22289
22290static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
22291 SelectionDAG &DAG) {
22292 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
22293 MVT VT = Op0.getSimpleValueType();
22294 SDLoc dl(Op);
22295
22296 // Decompose 256-bit ops into smaller 128-bit ops.
22297 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
22298 unsigned Opcode = Op.getOpcode();
22299 unsigned NumElems = VT.getVectorNumElements();
22300 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
22301 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
22302 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
22303 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
22304 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
22305 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
22306 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
22307 SDValue Ops[] = {
22308 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
22309 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
22310 };
22311 return DAG.getMergeValues(Ops, dl);
22312 }
22313
22314  assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
22315         (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
22316         (VT == MVT::v16i32 && Subtarget.hasAVX512()));
22317
22318 int NumElts = VT.getVectorNumElements();
22319
22320 // PMULxD operations multiply each even value (starting at 0) of LHS with
22321  // the related value of RHS and produce a widened result.
22322 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
22323 // => <2 x i64> <ae|cg>
22324 //
22325  // In other words, to have all the results, we need to perform two PMULxD:
22326  // 1. one with the even values.
22327  // 2. one with the odd values.
22328  // To achieve #2, we need to place the odd values at an even position.
22329 //
22330 // Place the odd value at an even position (basically, shift all values 1
22331 // step to the left):
22332 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
22333 // <a|b|c|d> => <b|undef|d|undef>
22334 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
22335 makeArrayRef(&Mask[0], NumElts));
22336 // <e|f|g|h> => <f|undef|h|undef>
22337 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
22338 makeArrayRef(&Mask[0], NumElts));
22339
22340 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
22341 // ints.
22342 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
22343 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
22344 unsigned Opcode =
22345 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
22346 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
22347 // => <2 x i64> <ae|cg>
22348 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
22349 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
22350 // => <2 x i64> <bf|dh>
22351 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
22352
22353 // Shuffle it back into the right order.
22354 SmallVector<int, 16> HighMask(NumElts);
22355 SmallVector<int, 16> LowMask(NumElts);
22356 for (int i = 0; i != NumElts; ++i) {
22357 HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
22358 LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
22359 }
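  // For the v4i32 case this yields LowMask = {0, 4, 2, 6} and
  // HighMask = {1, 5, 3, 7}, i.e. the low and high 32-bit halves of
  // <ae|bf|cg|dh> in the original lane order.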
22360
22361 SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
22362 SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
22363
22364  // If we have a signed multiply but no PMULDQ, fix up the high parts of an
22365  // unsigned multiply.
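  // (The fixup uses mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0);
  // the arithmetic shifts by 31 below build the two conditional terms.)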
22366 if (IsSigned && !Subtarget.hasSSE41()) {
22367 SDValue ShAmt = DAG.getConstant(
22368 31, dl,
22369 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
22370 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
22371 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
22372 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
22373 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
22374
22375 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
22376 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
22377 }
22378
22379 // The first result of MUL_LOHI is actually the low value, followed by the
22380 // high value.
22381 SDValue Ops[] = {Lows, Highs};
22382 return DAG.getMergeValues(Ops, dl);
22383}
22384
22385// Return true if the required (according to Opcode) shift-imm form is natively
22386// supported by the Subtarget
22387static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
22388 unsigned Opcode) {
22389 if (VT.getScalarSizeInBits() < 16)
22390 return false;
22391
22392 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
22393 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
22394 return true;
22395
22396 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
22397 (VT.is256BitVector() && Subtarget.hasInt256());
22398
22399 bool AShift = LShift && (Subtarget.hasAVX512() ||
22400 (VT != MVT::v2i64 && VT != MVT::v4i64));
22401 return (Opcode == ISD::SRA) ? AShift : LShift;
22402}
22403
22404// The shift amount is a variable, but it is the same for all vector lanes.
22405// These instructions are defined together with shift-immediate.
22406static
22407bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
22408 unsigned Opcode) {
22409 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
22410}
22411
22412// Return true if the required (according to Opcode) variable-shift form is
22413// natively supported by the Subtarget
22414static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
22415 unsigned Opcode) {
22416
22417 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
22418 return false;
22419
22420 // vXi16 supported only on AVX-512, BWI
22421 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
22422 return false;
22423
22424 if (Subtarget.hasAVX512())
22425 return true;
22426
22427 bool LShift = VT.is128BitVector() || VT.is256BitVector();
22428 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
22429 return (Opcode == ISD::SRA) ? AShift : LShift;
22430}
22431
22432static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
22433 const X86Subtarget &Subtarget) {
22434 MVT VT = Op.getSimpleValueType();
22435 SDLoc dl(Op);
22436 SDValue R = Op.getOperand(0);
22437 SDValue Amt = Op.getOperand(1);
22438
22439 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22440 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22441
22442 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
22443    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
22444 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
22445 SDValue Ex = DAG.getBitcast(ExVT, R);
22446
22447 // ashr(R, 63) === cmp_slt(R, 0)
22448 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
22449      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
22450             "Unsupported PCMPGT op");
22451 return DAG.getNode(X86ISD::PCMPGT, dl, VT,
22452 getZeroVector(VT, Subtarget, DAG, dl), R);
22453 }
22454
22455 if (ShiftAmt >= 32) {
22456 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
22457 SDValue Upper =
22458 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
22459 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22460 ShiftAmt - 32, DAG);
22461 if (VT == MVT::v2i64)
22462 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
22463 if (VT == MVT::v4i64)
22464 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22465 {9, 1, 11, 3, 13, 5, 15, 7});
22466 } else {
22467 // SRA upper i32, SHL whole i64 and select lower i32.
22468 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22469 ShiftAmt, DAG);
22470 SDValue Lower =
22471 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
22472 Lower = DAG.getBitcast(ExVT, Lower);
22473 if (VT == MVT::v2i64)
22474 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
22475 if (VT == MVT::v4i64)
22476 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22477 {8, 1, 10, 3, 12, 5, 14, 7});
22478 }
22479 return DAG.getBitcast(VT, Ex);
22480 };
22481
22482 // Optimize shl/srl/sra with constant shift amount.
22483 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22484 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
22485 uint64_t ShiftAmt = ShiftConst->getZExtValue();
22486
22487 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22488 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22489
22490 // i64 SRA needs to be performed as partial shifts.
22491 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
22492 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
22493 Op.getOpcode() == ISD::SRA)
22494 return ArithmeticShiftRight64(ShiftAmt);
22495
22496 if (VT == MVT::v16i8 ||
22497 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
22498 VT == MVT::v64i8) {
22499 unsigned NumElts = VT.getVectorNumElements();
22500 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
22501
22502 // Simple i8 add case
22503 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
22504 return DAG.getNode(ISD::ADD, dl, VT, R, R);
22505
22506 // ashr(R, 7) === cmp_slt(R, 0)
22507 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
22508 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22509 if (VT.is512BitVector()) {
22510            assert(VT == MVT::v64i8 && "Unexpected element type!");
22511 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
22512 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
22513 }
22514 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
22515 }
22516
22517 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
22518 if (VT == MVT::v16i8 && Subtarget.hasXOP())
22519 return SDValue();
22520
22521 if (Op.getOpcode() == ISD::SHL) {
22522 // Make a large shift.
22523 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
22524 R, ShiftAmt, DAG);
22525 SHL = DAG.getBitcast(VT, SHL);
22526 // Zero out the rightmost bits.
22527 return DAG.getNode(ISD::AND, dl, VT, SHL,
22528 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
22529 }
22530 if (Op.getOpcode() == ISD::SRL) {
22531 // Make a large shift.
22532 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
22533 R, ShiftAmt, DAG);
22534 SRL = DAG.getBitcast(VT, SRL);
22535 // Zero out the leftmost bits.
22536 return DAG.getNode(ISD::AND, dl, VT, SRL,
22537 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
22538 }
22539 if (Op.getOpcode() == ISD::SRA) {
22540 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
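        // (Worked example: R = 0xF0 (-16), ShiftAmt = 4: lshr gives 0x0F, Mask
        // is 128 >> 4 = 0x08, the xor gives 0x07, and subtracting 0x08 yields
        // 0xFF, i.e. -16 >> 4 == -1 as expected.)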
22541 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22542
22543 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
22544 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
22545 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
22546 return Res;
22547 }
22548        llvm_unreachable("Unknown shift opcode.");
22549 }
22550 }
22551 }
22552
22553 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
22554 // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
22555 if (!Subtarget.hasXOP() &&
22556 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
22557 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
22558
22559    // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
22560 unsigned SubVectorScale = 1;
22561 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22562 SubVectorScale =
22563 Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
22564 Amt = Amt.getOperand(0);
22565 }
22566
22567 // Peek through any splat that was introduced for i64 shift vectorization.
22568 int SplatIndex = -1;
22569 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
22570 if (SVN->isSplat()) {
22571 SplatIndex = SVN->getSplatIndex();
22572 Amt = Amt.getOperand(0);
22573        assert(SplatIndex < (int)VT.getVectorNumElements() &&
22574               "Splat shuffle referencing second operand");
22575 }
22576
22577 if (Amt.getOpcode() != ISD::BITCAST ||
22578 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
22579 return SDValue();
22580
22581 Amt = Amt.getOperand(0);
22582 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22583 (SubVectorScale * VT.getVectorNumElements());
22584 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
22585 uint64_t ShiftAmt = 0;
22586 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
22587 for (unsigned i = 0; i != Ratio; ++i) {
22588 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
22589 if (!C)
22590 return SDValue();
22591 // 6 == Log2(64)
22592 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
22593 }
22594
22595 // Check remaining shift amounts (if not a splat).
22596 if (SplatIndex < 0) {
22597 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22598 uint64_t ShAmt = 0;
22599 for (unsigned j = 0; j != Ratio; ++j) {
22600 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
22601 if (!C)
22602 return SDValue();
22603 // 6 == Log2(64)
22604 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
22605 }
22606 if (ShAmt != ShiftAmt)
22607 return SDValue();
22608 }
22609 }
22610
22611 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22612 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22613
22614 if (Op.getOpcode() == ISD::SRA)
22615 return ArithmeticShiftRight64(ShiftAmt);
22616 }
22617
22618 return SDValue();
22619}
22620
22621static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
22622 const X86Subtarget &Subtarget) {
22623 MVT VT = Op.getSimpleValueType();
22624 SDLoc dl(Op);
22625 SDValue R = Op.getOperand(0);
22626 SDValue Amt = Op.getOperand(1);
22627
22628 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22629 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22630
22631 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
22632 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
22633
22634 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
22635 SDValue BaseShAmt;
22636 MVT EltVT = VT.getVectorElementType();
22637
22638 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
22639 // Check if this build_vector node is doing a splat.
22640 // If so, then set BaseShAmt equal to the splat value.
22641 BaseShAmt = BV->getSplatValue();
22642 if (BaseShAmt && BaseShAmt.isUndef())
22643 BaseShAmt = SDValue();
22644 } else {
22645 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
22646 Amt = Amt.getOperand(0);
22647
22648 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
22649 if (SVN && SVN->isSplat()) {
22650 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
22651 SDValue InVec = Amt.getOperand(0);
22652 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
22653          assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
22654                 "Unexpected shuffle index found!");
22655 BaseShAmt = InVec.getOperand(SplatIdx);
22656 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
22657 if (ConstantSDNode *C =
22658 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
22659 if (C->getZExtValue() == SplatIdx)
22660 BaseShAmt = InVec.getOperand(1);
22661 }
22662 }
22663
22664 if (!BaseShAmt)
22665 // Avoid introducing an extract element from a shuffle.
22666 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
22667 DAG.getIntPtrConstant(SplatIdx, dl));
22668 }
22669 }
22670
22671 if (BaseShAmt.getNode()) {
22672      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
22673 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
22674 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
22675 else if (EltVT.bitsLT(MVT::i32))
22676 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
22677
22678 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
22679 }
22680 }
22681
22682 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
22683 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
22684 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
22685 Amt = Amt.getOperand(0);
22686 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22687 VT.getVectorNumElements();
22688 std::vector<SDValue> Vals(Ratio);
22689 for (unsigned i = 0; i != Ratio; ++i)
22690 Vals[i] = Amt.getOperand(i);
22691 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22692 for (unsigned j = 0; j != Ratio; ++j)
22693 if (Vals[j] != Amt.getOperand(i + j))
22694 return SDValue();
22695 }
22696
22697 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
22698 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
22699 }
22700 return SDValue();
22701}
22702
22703static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
22704 SelectionDAG &DAG) {
22705 MVT VT = Op.getSimpleValueType();
22706 SDLoc dl(Op);
22707 SDValue R = Op.getOperand(0);
22708 SDValue Amt = Op.getOperand(1);
22709 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22710
22711  assert(VT.isVector() && "Custom lowering only for vector shifts!");
22712  assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
22713
22714 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
22715 return V;
22716
22717 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
22718 return V;
22719
22720 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
22721 return Op;
22722
22723 // XOP has 128-bit variable logical/arithmetic shifts.
22724 // +ve/-ve Amt = shift left/right.
22725 if (Subtarget.hasXOP() &&
22726 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
22727 VT == MVT::v8i16 || VT == MVT::v16i8)) {
22728 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
22729 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22730 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
22731 }
22732 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
22733 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
22734 if (Op.getOpcode() == ISD::SRA)
22735 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
22736 }
22737
22738 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
22739 // shifts per-lane and then shuffle the partial results back together.
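  // (R0 below has both lanes shifted by Amt[0] and R1 by Amt[1]; the final
  // {0, 3} shuffle keeps R0's lane 0 and R1's lane 1, so each lane ends up
  // shifted by its own amount.)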
22740 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
22741 // Splat the shift amounts so the scalar shifts above will catch it.
22742 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
22743 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
22744 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
22745 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
22746 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
22747 }
22748
22749 // i64 vector arithmetic shift can be emulated with the transform:
22750 // M = lshr(SIGN_MASK, Amt)
22751 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
22752 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
22753 Op.getOpcode() == ISD::SRA) {
22754 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
22755 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
22756 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22757 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
22758 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
22759 return R;
22760 }
22761
22762 // If possible, lower this packed shift into a vector multiply instead of
22763 // expanding it into a sequence of scalar shifts.
22764 // Do this only if the vector shift count is a constant build_vector.
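  // (shl X, C is X * (1 << C), so the constant build_vector of shift amounts
  // becomes a constant vector of powers of two and the whole shift becomes a
  // single vector multiply.)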
22765 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
22766 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
22767 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
22768 SmallVector<SDValue, 8> Elts;
22769 MVT SVT = VT.getVectorElementType();
22770 unsigned SVTBits = SVT.getSizeInBits();
22771 APInt One(SVTBits, 1);
22772 unsigned NumElems = VT.getVectorNumElements();
22773
22774 for (unsigned i=0; i !=NumElems; ++i) {
22775 SDValue Op = Amt->getOperand(i);
22776 if (Op->isUndef()) {
22777 Elts.push_back(Op);
22778 continue;
22779 }
22780
22781 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
22782 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
22783 uint64_t ShAmt = C.getZExtValue();
22784 if (ShAmt >= SVTBits) {
22785 Elts.push_back(DAG.getUNDEF(SVT));
22786 continue;
22787 }
22788 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
22789 }
22790 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
22791 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
22792 }
22793
22794 // Lower SHL with variable shift amount.
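  // (The sequence below builds 2^Amt per lane via the f32 representation:
  // Amt << 23 lands in the exponent field, adding 0x3f800000 (the bits of
  // 1.0f) biases it so each lane reads as the float 2^Amt, and FP_TO_SINT
  // turns that back into an integer power of two to multiply by.)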
22795 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
22796 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
22797
22798 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
22799 DAG.getConstant(0x3f800000U, dl, VT));
22800 Op = DAG.getBitcast(MVT::v4f32, Op);
22801 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
22802 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
22803 }
22804
22805 // If possible, lower this shift as a sequence of two shifts by
22806 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
22807 // Example:
22808 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
22809 //
22810 // Could be rewritten as:
22811 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
22812 //
22813 // The advantage is that the two shifts from the example would be
22814 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
22815 // the vector shift into four scalar shifts plus four pairs of vector
22816 // insert/extract.
22817 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
22818 bool UseMOVSD = false;
22819 bool CanBeSimplified;
22820 // The splat value for the first packed shift (the 'X' from the example).
22821 SDValue Amt1 = Amt->getOperand(0);
22822 // The splat value for the second packed shift (the 'Y' from the example).
22823 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
22824
22825 // See if it is possible to replace this node with a sequence of
22826 // two shifts followed by a MOVSS/MOVSD/PBLEND.
22827 if (VT == MVT::v4i32) {
22828 // Check if it is legal to use a MOVSS.
22829 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
22830 Amt2 == Amt->getOperand(3);
22831 if (!CanBeSimplified) {
22832 // Otherwise, check if we can still simplify this node using a MOVSD.
22833 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
22834 Amt->getOperand(2) == Amt->getOperand(3);
22835 UseMOVSD = true;
22836 Amt2 = Amt->getOperand(2);
22837 }
22838 } else {
22839 // Do similar checks for the case where the machine value type
22840 // is MVT::v8i16.
22841 CanBeSimplified = Amt1 == Amt->getOperand(1);
22842 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
22843 CanBeSimplified = Amt2 == Amt->getOperand(i);
22844
22845 if (!CanBeSimplified) {
22846 UseMOVSD = true;
22847 CanBeSimplified = true;
22848 Amt2 = Amt->getOperand(4);
22849 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
22850 CanBeSimplified = Amt1 == Amt->getOperand(i);
22851 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
22852 CanBeSimplified = Amt2 == Amt->getOperand(j);
22853 }
22854 }
22855
22856 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
22857 isa<ConstantSDNode>(Amt2)) {
22858 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
22859 SDValue Splat1 =
22860 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
22861 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
22862 SDValue Splat2 =
22863 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
22864 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
22865 SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1);
22866 SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2);
22867 if (UseMOVSD)
22868 return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
22869 BitCast2, {0, 1, 6, 7}));
22870 return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
22871 BitCast2, {0, 5, 6, 7}));
22872 }
22873 }
22874
22875  // v4i32 non-uniform shifts.
22876 // If the shift amount is constant we can shift each lane using the SSE2
22877 // immediate shifts, else we need to zero-extend each lane to the lower i64
22878 // and shift using the SSE2 variable shifts.
22879 // The separate results can then be blended together.
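  // (The reassembly at the end is: R02 keeps lanes 0 and 2 from R0/R2, R13
  // keeps lanes 1 and 3 from R1/R3, and the final shuffle produces
  // <R0[0], R1[1], R2[2], R3[3]>.)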
22880 if (VT == MVT::v4i32) {
22881 unsigned Opc = Op.getOpcode();
22882 SDValue Amt0, Amt1, Amt2, Amt3;
22883 if (ConstantAmt) {
22884 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
22885 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
22886 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
22887 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
22888 } else {
22889 // ISD::SHL is handled above but we include it here for completeness.
22890 switch (Opc) {
22891 default:
22892        llvm_unreachable("Unknown target vector shift node");
22893 case ISD::SHL:
22894 Opc = X86ISD::VSHL;
22895 break;
22896 case ISD::SRL:
22897 Opc = X86ISD::VSRL;
22898 break;
22899 case ISD::SRA:
22900 Opc = X86ISD::VSRA;
22901 break;
22902 }
22903 // The SSE2 shifts use the lower i64 as the same shift amount for
22904 // all lanes and the upper i64 is ignored. These shuffle masks
22905      // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
22906 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22907 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
22908 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
22909 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
22910 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
22911 }
22912
22913 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
22914 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
22915 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
22916 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
22917 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
22918 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
22919 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
22920 }
22921
22922 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
22923 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
22924 // make the existing SSE solution better.
22925 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
22926 (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
22927 (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
22928 (Subtarget.hasBWI() && VT == MVT::v32i8)) {
22929 MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
22930 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
22931 unsigned ExtOpc =
22932 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22933 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
22934 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
22935 return DAG.getNode(ISD::TRUNCATE, dl, VT,
22936 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
22937 }
22938
22939 if (VT == MVT::v16i8 ||
22940 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
22941 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
22942 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
22943 unsigned ShiftOpcode = Op->getOpcode();
22944
22945 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
22946 if (VT.is512BitVector()) {
22947 // On AVX512BW targets we make use of the fact that VSELECT lowers
22948 // to a masked blend which selects bytes based just on the sign bit
22949 // extracted to a mask.
22950 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
22951 V0 = DAG.getBitcast(VT, V0);
22952 V1 = DAG.getBitcast(VT, V1);
22953 Sel = DAG.getBitcast(VT, Sel);
22954 Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
22955 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
22956 } else if (Subtarget.hasSSE41()) {
22957 // On SSE41 targets we make use of the fact that VSELECT lowers
22958 // to PBLENDVB which selects bytes based just on the sign bit.
22959 V0 = DAG.getBitcast(VT, V0);
22960 V1 = DAG.getBitcast(VT, V1);
22961 Sel = DAG.getBitcast(VT, Sel);
22962 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
22963 }
22964 // On pre-SSE41 targets we test for the sign bit by comparing to
22965 // zero - a negative value will set all bits of the lanes to true
22966 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
22967 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
22968 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
22969 return DAG.getSelect(dl, SelVT, C, V0, V1);
22970 };
22971
22972 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
22973 // We can safely do this using i16 shifts as we're only interested in
22974 // the 3 lower bits of each byte.
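    // (After a << 5, bit 2 of each byte's shift amount sits in that byte's sign
    // bit; each SignBitSelect round then applies a shift of 4, 2 and finally 1
    // when the sign bit is set, and 'a += a' moves the next lower amount bit
    // into the sign position for the following round.)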
22975 Amt = DAG.getBitcast(ExtVT, Amt);
22976 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
22977 Amt = DAG.getBitcast(VT, Amt);
22978
22979 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
22980 // r = VSELECT(r, shift(r, 4), a);
22981 SDValue M =
22982 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22983 R = SignBitSelect(VT, Amt, M, R);
22984
22985 // a += a
22986 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22987
22988 // r = VSELECT(r, shift(r, 2), a);
22989 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22990 R = SignBitSelect(VT, Amt, M, R);
22991
22992 // a += a
22993 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22994
22995 // return VSELECT(r, shift(r, 1), a);
22996 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22997 R = SignBitSelect(VT, Amt, M, R);
22998 return R;
22999 }
23000
23001 if (Op->getOpcode() == ISD::SRA) {
23002 // For SRA we need to unpack each byte to the higher byte of a i16 vector
23003 // so we can correctly sign extend. We don't care what happens to the
23004 // lower byte.
23005 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
23006 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
23007 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
23008 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
23009 ALo = DAG.getBitcast(ExtVT, ALo);
23010 AHi = DAG.getBitcast(ExtVT, AHi);
23011 RLo = DAG.getBitcast(ExtVT, RLo);
23012 RHi = DAG.getBitcast(ExtVT, RHi);
23013
23014 // r = VSELECT(r, shift(r, 4), a);
23015 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23016 DAG.getConstant(4, dl, ExtVT));
23017 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23018 DAG.getConstant(4, dl, ExtVT));
23019 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23020 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23021
23022 // a += a
23023 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
23024 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
23025
23026 // r = VSELECT(r, shift(r, 2), a);
23027 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23028 DAG.getConstant(2, dl, ExtVT));
23029 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23030 DAG.getConstant(2, dl, ExtVT));
23031 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23032 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23033
23034 // a += a
23035 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
23036 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
23037
23038 // r = VSELECT(r, shift(r, 1), a);
23039 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23040 DAG.getConstant(1, dl, ExtVT));
23041 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23042 DAG.getConstant(1, dl, ExtVT));
23043 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23044 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23045
23046 // Logical shift the result back to the lower byte, leaving a zero upper
23047      // byte,
23048 // meaning that we can safely pack with PACKUSWB.
23049 RLo =
23050 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
23051 RHi =
23052 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
23053 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
23054 }
23055 }
23056
23057 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
23058 MVT ExtVT = MVT::v8i32;
23059 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23060 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
23061 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
23062 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
23063 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
23064 ALo = DAG.getBitcast(ExtVT, ALo);
23065 AHi = DAG.getBitcast(ExtVT, AHi);
23066 RLo = DAG.getBitcast(ExtVT, RLo);
23067 RHi = DAG.getBitcast(ExtVT, RHi);
23068 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
23069 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
23070 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
23071 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
23072 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
23073 }
23074
23075 if (VT == MVT::v8i16) {
23076 unsigned ShiftOpcode = Op->getOpcode();
23077
23078 // If we have a constant shift amount, the non-SSE41 path is best as
23079 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
23080 bool UseSSE41 = Subtarget.hasSSE41() &&
23081 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
23082
23083 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
23084 // On SSE41 targets we make use of the fact that VSELECT lowers
23085 // to PBLENDVB which selects bytes based just on the sign bit.
23086 if (UseSSE41) {
23087 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
23088 V0 = DAG.getBitcast(ExtVT, V0);
23089 V1 = DAG.getBitcast(ExtVT, V1);
23090 Sel = DAG.getBitcast(ExtVT, Sel);
23091 return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
23092 }
23093 // On pre-SSE41 targets we splat the sign bit - a negative value will
23094 // set all bits of the lanes to true and VSELECT uses that in
23095 // its OR(AND(V0,C),AND(V1,~C)) lowering.
23096 SDValue C =
23097 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
23098 return DAG.getSelect(dl, VT, C, V0, V1);
23099 };
23100
23101 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
23102 if (UseSSE41) {
23103 // On SSE41 targets we need to replicate the shift mask in both
23104 // bytes for PBLENDVB.
23105 Amt = DAG.getNode(
23106 ISD::OR, dl, VT,
23107 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
23108 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
23109 } else {
23110 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
23111 }
23112
23113 // r = VSELECT(r, shift(r, 8), a);
23114 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
23115 R = SignBitSelect(Amt, M, R);
23116
23117 // a += a
23118 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23119
23120 // r = VSELECT(r, shift(r, 4), a);
23121 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
23122 R = SignBitSelect(Amt, M, R);
23123
23124 // a += a
23125 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23126
23127 // r = VSELECT(r, shift(r, 2), a);
23128 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
23129 R = SignBitSelect(Amt, M, R);
23130
23131 // a += a
23132 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23133
23134 // return VSELECT(r, shift(r, 1), a);
23135 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
23136 R = SignBitSelect(Amt, M, R);
23137 return R;
23138 }
23139
23140 // Decompose 256-bit shifts into smaller 128-bit shifts.
23141 if (VT.is256BitVector())
23142 return Lower256IntArith(Op, DAG);
23143
23144 return SDValue();
23145}
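The repeated "select on the sign bit, then double the amount" pattern above can be modelled per lane in scalar code. The sketch below is illustrative only (it is not part of X86ISelLowering.cpp; variableShl16 is a made-up name) and shows the SHL case for a single 16-bit lane.

#include <cstdint>

// Scalar model of the v8i16 blend-based shift: the amount is pre-shifted left
// by 12 so that successive "sign bit" tests see amount bits 3, 2, 1 and 0 in
// turn, and each test conditionally applies a power-of-two shift.
static uint16_t variableShl16(uint16_t R, uint16_t Amt) {
  uint16_t Mask = (uint16_t)(Amt << 12);
  for (unsigned Step : {8u, 4u, 2u, 1u}) {
    if (Mask & 0x8000)              // plays the role of SignBitSelect
      R = (uint16_t)(R << Step);
    Mask = (uint16_t)(Mask << 1);   // the "a += a" step doubles the mask
  }
  return R;
}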
23146
23147static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
23148 SelectionDAG &DAG) {
23149 MVT VT = Op.getSimpleValueType();
23150 SDLoc DL(Op);
23151 SDValue R = Op.getOperand(0);
23152 SDValue Amt = Op.getOperand(1);
23153 unsigned Opcode = Op.getOpcode();
23154 unsigned EltSizeInBits = VT.getScalarSizeInBits();
23155
23156 if (Subtarget.hasAVX512()) {
23157 // Attempt to rotate by immediate.
23158 APInt UndefElts;
23159 SmallVector<APInt, 16> EltBits;
23160 if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
23161 if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
23162 return EltBits[0] == V;
23163 })) {
23164 unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
23165 uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
23166 return DAG.getNode(Op, DL, VT, R,
23167 DAG.getConstant(RotateAmt, DL, MVT::i8));
23168 }
23169 }
23170
23171 // Else, fall-back on VPROLV/VPRORV.
23172 return Op;
23173 }
23174
23175 assert(VT.isVector() && "Custom lowering only for vector rotates!");
23176 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
23177 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
23178
23179 // XOP has 128-bit vector variable + immediate rotates.
23180 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
23181
23182 // Split 256-bit integers.
23183 if (VT.is256BitVector())
23184 return Lower256IntArith(Op, DAG);
23185
23186 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
23187
23188 // Attempt to rotate by immediate.
23189 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
23190 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
23191 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
23192 assert(RotateAmt < EltSizeInBits && "Rotation out of range");
23193 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
23194 DAG.getConstant(RotateAmt, DL, MVT::i8));
23195 }
23196 }
23197
23198 // Use general rotate by variable (per-element).
23199 return Op;
23200}
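For reference, the rotate-by-immediate path above reduces the splatted amount with urem(EltSizeInBits) before emitting VROTLI/VROTRI. A scalar left rotate with the same reduction looks roughly like this (illustrative sketch only; rotl32 is not a function from this file).

#include <cstdint>

// Rotate left by an amount reduced modulo the element width, guarding the
// Amt == 0 case so we never shift right by the full width.
static uint32_t rotl32(uint32_t X, unsigned Amt) {
  Amt %= 32;                         // same role as EltBits[0].urem(EltSizeInBits)
  if (Amt == 0)
    return X;
  return (X << Amt) | (X >> (32 - Amt));
}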
23201
23202static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23203 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
23204 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
23205 // looks for this combo and may remove the "setcc" instruction if the "setcc"
23206 // has only one use.
23207 SDNode *N = Op.getNode();
23208 SDValue LHS = N->getOperand(0);
23209 SDValue RHS = N->getOperand(1);
23210 unsigned BaseOp = 0;
23211 X86::CondCode Cond;
23212 SDLoc DL(Op);
23213 switch (Op.getOpcode()) {
23214 default: llvm_unreachable("Unknown ovf instruction!");
23215 case ISD::SADDO:
23216 // An add of one will be selected as an INC. Note that INC doesn't
23217 // set CF, so we can't do this for UADDO.
23218 if (isOneConstant(RHS)) {
23219 BaseOp = X86ISD::INC;
23220 Cond = X86::COND_O;
23221 break;
23222 }
23223 BaseOp = X86ISD::ADD;
23224 Cond = X86::COND_O;
23225 break;
23226 case ISD::UADDO:
23227 BaseOp = X86ISD::ADD;
23228 Cond = X86::COND_B;
23229 break;
23230 case ISD::SSUBO:
23231 // A subtract of one will be selected as a DEC. Note that DEC doesn't
23232 // set CF, so we can't do this for USUBO.
23233 if (isOneConstant(RHS)) {
23234 BaseOp = X86ISD::DEC;
23235 Cond = X86::COND_O;
23236 break;
23237 }
23238 BaseOp = X86ISD::SUB;
23239 Cond = X86::COND_O;
23240 break;
23241 case ISD::USUBO:
23242 BaseOp = X86ISD::SUB;
23243 Cond = X86::COND_B;
23244 break;
23245 case ISD::SMULO:
23246 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
23247 Cond = X86::COND_O;
23248 break;
23249 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
23250 if (N->getValueType(0) == MVT::i8) {
23251 BaseOp = X86ISD::UMUL8;
23252 Cond = X86::COND_O;
23253 break;
23254 }
23255 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
23256 MVT::i32);
23257 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
23258
23259 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
23260
23261 if (N->getValueType(1) == MVT::i1)
23262 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
23263
23264 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23265 }
23266 }
23267
23268 // Also sets EFLAGS.
23269 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
23270 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23271
23272 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
23273
23274 if (N->getValueType(1) == MVT::i1)
23275 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
23276
23277 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23278}
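The flag choices above (COND_B for the unsigned ops, COND_O for the signed ones) correspond to the usual scalar overflow checks. A minimal sketch, not taken from this file, with made-up helper names:

#include <cstdint>

// Unsigned add overflow is the carry flag (COND_B): the sum wraps below an operand.
static bool uaddOverflows(uint32_t A, uint32_t B, uint32_t &Sum) {
  Sum = A + B;
  return Sum < A;
}

// Signed add overflow is the overflow flag (COND_O): both operands share a sign
// that the result does not. The addition is done in unsigned arithmetic to
// avoid undefined behaviour on wrap.
static bool saddOverflows(int32_t A, int32_t B, int32_t &Sum) {
  Sum = (int32_t)((uint32_t)A + (uint32_t)B);
  return ((A >= 0) == (B >= 0)) && ((Sum >= 0) != (A >= 0));
}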
23279
23280/// Returns true if the operand type is exactly twice the native width, and
23281/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
23282/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
23283/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
23284bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
23285 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
23286
23287 if (OpWidth == 64)
23288 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
23289 else if (OpWidth == 128)
23290 return Subtarget.hasCmpxchg16b();
23291 else
23292 return false;
23293}
23294
23295bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
23296 return needsCmpXchgNb(SI->getValueOperand()->getType());
23297}
23298
23299// Note: this turns large loads into lock cmpxchg8b/16b.
23300 // FIXME: On 32-bit x86, fild/movq might be faster than lock cmpxchg8b.
23301TargetLowering::AtomicExpansionKind
23302X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
23303 auto PTy = cast<PointerType>(LI->getPointerOperandType());
23304 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
23305 : AtomicExpansionKind::None;
23306}
23307
23308TargetLowering::AtomicExpansionKind
23309X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
23310 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
23311 Type *MemType = AI->getType();
23312
23313 // If the operand is too big, we must see if cmpxchg8/16b is available
23314 // and default to library calls otherwise.
23315 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
23316 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
23317 : AtomicExpansionKind::None;
23318 }
23319
23320 AtomicRMWInst::BinOp Op = AI->getOperation();
23321 switch (Op) {
23322 default:
23323 llvm_unreachable("Unknown atomic operation");
23324 case AtomicRMWInst::Xchg:
23325 case AtomicRMWInst::Add:
23326 case AtomicRMWInst::Sub:
23327 // It's better to use xadd, xsub or xchg for these in all cases.
23328 return AtomicExpansionKind::None;
23329 case AtomicRMWInst::Or:
23330 case AtomicRMWInst::And:
23331 case AtomicRMWInst::Xor:
23332 // If the atomicrmw's result isn't actually used, we can just add a "lock"
23333 // prefix to a normal instruction for these operations.
23334 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
23335 : AtomicExpansionKind::None;
23336 case AtomicRMWInst::Nand:
23337 case AtomicRMWInst::Max:
23338 case AtomicRMWInst::Min:
23339 case AtomicRMWInst::UMax:
23340 case AtomicRMWInst::UMin:
23341 // These always require a non-trivial set of data operations on x86. We must
23342 // use a cmpxchg loop.
23343 return AtomicExpansionKind::CmpXChg;
23344 }
23345}
23346
23347LoadInst *
23348X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
23349 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
23350 Type *MemType = AI->getType();
23351 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
23352 // there is no benefit in turning such RMWs into loads, and it is actually
23353 // harmful as it introduces a mfence.
23354 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
23355 return nullptr;
23356
23357 auto Builder = IRBuilder<>(AI);
23358 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
23359 auto SSID = AI->getSyncScopeID();
23360 // We must restrict the ordering to avoid generating loads with Release or
23361 // ReleaseAcquire orderings.
23362 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
23363 auto Ptr = AI->getPointerOperand();
23364
23365 // Before the load we need a fence. Here is an example lifted from
23366 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
23367 // is required:
23368 // Thread 0:
23369 // x.store(1, relaxed);
23370 // r1 = y.fetch_add(0, release);
23371 // Thread 1:
23372 // y.fetch_add(42, acquire);
23373 // r2 = x.load(relaxed);
23374 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
23375 // lowered to just a load without a fence. A mfence flushes the store buffer,
23376 // making the optimization clearly correct.
23377 // FIXME: the fence is required if isReleaseOrStronger(Order), but it is not
23378 // clear otherwise; we might be able to be more aggressive on relaxed
23379 // idempotent rmws. In practice, they do not look useful, so we don't try to
23380 // be especially clever.
23381 if (SSID == SyncScope::SingleThread)
23382 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
23383 // the IR level, so we must wrap it in an intrinsic.
23384 return nullptr;
23385
23386 if (!Subtarget.hasMFence())
23387 // FIXME: it might make sense to use a locked operation here but on a
23388 // different cache-line to prevent cache-line bouncing. In practice it
23389 // is probably a small win, and x86 processors without mfence are rare
23390 // enough that we do not bother.
23391 return nullptr;
23392
23393 Function *MFence =
23394 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
23395 Builder.CreateCall(MFence, {});
23396
23397 // Finally we can emit the atomic load.
23398 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
23399 AI->getType()->getPrimitiveSizeInBits());
23400 Loaded->setAtomic(Order, SSID);
23401 AI->replaceAllUsesWith(Loaded);
23402 AI->eraseFromParent();
23403 return Loaded;
23404}
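At the C++ level the transformation above amounts to replacing an idempotent RMW with a full fence followed by an ordinary atomic load. A rough sketch using std::atomic (illustrative only; the function names are made up):

#include <atomic>
#include <cstdint>

// Before: an idempotent RMW such as fetch_add(0) still orders like a full RMW.
static uint32_t rmwForm(std::atomic<uint32_t> &X) {
  return X.fetch_add(0, std::memory_order_acq_rel);
}

// After: the lowering emits MFENCE and then a plain atomic load. The fence is
// what rules out r1 == r2 == 0 in the two-thread example from the comment.
static uint32_t fencedLoadForm(std::atomic<uint32_t> &X) {
  std::atomic_thread_fence(std::memory_order_seq_cst); // stands in for MFENCE
  return X.load(std::memory_order_acquire);
}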
23405
23406static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
23407 SelectionDAG &DAG) {
23408 SDLoc dl(Op);
23409 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
23410 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
23411 SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
23412 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
23413
23414 // The only fence that needs an instruction is a sequentially-consistent
23415 // cross-thread fence.
23416 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
23417 FenceSSID == SyncScope::System) {
23418 if (Subtarget.hasMFence())
23419 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
23420
23421 SDValue Chain = Op.getOperand(0);
23422 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
23423 SDValue Ops[] = {
23424 DAG.getRegister(X86::ESP, MVT::i32), // Base
23425 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
23426 DAG.getRegister(0, MVT::i32), // Index
23427 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
23428 DAG.getRegister(0, MVT::i32), // Segment.
23429 Zero,
23430 Chain
23431 };
23432 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
23433 return SDValue(Res, 0);
23434 }
23435
23436 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
23437 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
23438}
23439
23440static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
23441 SelectionDAG &DAG) {
23442 MVT T = Op.getSimpleValueType();
23443 SDLoc DL(Op);
23444 unsigned Reg = 0;
23445 unsigned size = 0;
23446 switch(T.SimpleTy) {
23447 default: llvm_unreachable("Invalid value type!");
23448 case MVT::i8: Reg = X86::AL; size = 1; break;
23449 case MVT::i16: Reg = X86::AX; size = 2; break;
23450 case MVT::i32: Reg = X86::EAX; size = 4; break;
23451 case MVT::i64:
23452 assert(Subtarget.is64Bit() && "Node not type legal!");
23453 Reg = X86::RAX; size = 8;
23454 break;
23455 }
23456 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
23457 Op.getOperand(2), SDValue());
23458 SDValue Ops[] = { cpIn.getValue(0),
23459 Op.getOperand(1),
23460 Op.getOperand(3),
23461 DAG.getTargetConstant(size, DL, MVT::i8),
23462 cpIn.getValue(1) };
23463 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
23464 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
23465 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
23466 Ops, T, MMO);
23467
23468 SDValue cpOut =
23469 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
23470 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
23471 MVT::i32, cpOut.getValue(2));
23472 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
23473
23474 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
23475 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
23476 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
23477 return SDValue();
23478}
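The lowering above materialises the classic CMPXCHG contract: the observed value comes back through the accumulator register and ZF reports success. In portable C++ the same contract is expressed by compare_exchange_strong (sketch only, not code from this file; cmpSwap32 is a made-up name):

#include <atomic>
#include <cstdint>

// On failure the observed value is written back into Expected, mirroring the
// copy-from-EAX above; the boolean return mirrors the SETE on EFLAGS.
static bool cmpSwap32(std::atomic<uint32_t> &Mem, uint32_t &Expected,
                      uint32_t Desired) {
  return Mem.compare_exchange_strong(Expected, Desired);
}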
23479
23480static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
23481 SelectionDAG &DAG) {
23482 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
23483 MVT DstVT = Op.getSimpleValueType();
23484
23485 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
23486 SrcVT == MVT::i64) {
23487 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23488 if (DstVT != MVT::f64)
23489 // This conversion needs to be expanded.
23490 return SDValue();
23491
23492 SDValue Op0 = Op->getOperand(0);
23493 SmallVector<SDValue, 16> Elts;
23494 SDLoc dl(Op);
23495 unsigned NumElts;
23496 MVT SVT;
23497 if (SrcVT.isVector()) {
23498 NumElts = SrcVT.getVectorNumElements();
23499 SVT = SrcVT.getVectorElementType();
23500
23501 // Widen the input vector in the case of MVT::v2i32.
23502 // Example: from MVT::v2i32 to MVT::v4i32.
23503 for (unsigned i = 0, e = NumElts; i != e; ++i)
23504 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
23505 DAG.getIntPtrConstant(i, dl)));
23506 } else {
23507 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
23508 "Unexpected source type in LowerBITCAST");
23509 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23510 DAG.getIntPtrConstant(0, dl)));
23511 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23512 DAG.getIntPtrConstant(1, dl)));
23513 NumElts = 2;
23514 SVT = MVT::i32;
23515 }
23516 // Explicitly mark the extra elements as Undef.
23517 Elts.append(NumElts, DAG.getUNDEF(SVT));
23518
23519 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
23520 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
23521 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
23522 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
23523 DAG.getIntPtrConstant(0, dl));
23524 }
23525
23526 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
23527 Subtarget.hasMMX() && "Unexpected custom BITCAST");
23528 assert((DstVT == MVT::i64 ||
23529 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
23530 "Unexpected custom BITCAST");
23531 // i64 <=> MMX conversions are Legal.
23532 if (SrcVT==MVT::i64 && DstVT.isVector())
23533 return Op;
23534 if (DstVT==MVT::i64 && SrcVT.isVector())
23535 return Op;
23536 // MMX <=> MMX conversions are Legal.
23537 if (SrcVT.isVector() && DstVT.isVector())
23538 return Op;
23539 // All other conversions need to be expanded.
23540 return SDValue();
23541}
23542
23543/// Compute the horizontal sum of bytes in V for the elements of VT.
23544///
23545/// Requires V to be a byte vector and VT to be an integer vector type with
23546/// wider elements than V's type. The width of the elements of VT determines
23547/// how many bytes of V are summed horizontally to produce each element of the
23548/// result.
23549static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
23550 const X86Subtarget &Subtarget,
23551 SelectionDAG &DAG) {
23552 SDLoc DL(V);
23553 MVT ByteVecVT = V.getSimpleValueType();
23554 MVT EltVT = VT.getVectorElementType();
23555 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
23556 "Expected value to have byte element type.");
23557 assert(EltVT != MVT::i8 &&
23558 "Horizontal byte sum only makes sense for wider elements!");
23559 unsigned VecSize = VT.getSizeInBits();
23560 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
23561
23562 // The PSADBW instruction horizontally adds all bytes and leaves the result in
23563 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
23564 if (EltVT == MVT::i64) {
23565 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
23566 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
23567 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
23568 return DAG.getBitcast(VT, V);
23569 }
23570
23571 if (EltVT == MVT::i32) {
23572 // We unpack the low half and high half into i32s interleaved with zeros so
23573 // that we can use PSADBW to horizontally sum them. The most useful part of
23574 // this is that it lines up the results of two PSADBW instructions to be
23575 // two v2i64 vectors which concatenated are the 4 population counts. We can
23576 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
23577 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
23578 SDValue V32 = DAG.getBitcast(VT, V);
23579 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
23580 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
23581
23582 // Do the horizontal sums into two v2i64s.
23583 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
23584 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
23585 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
23586 DAG.getBitcast(ByteVecVT, Low), Zeros);
23587 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
23588 DAG.getBitcast(ByteVecVT, High), Zeros);
23589
23590 // Merge them together.
23591 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
23592 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
23593 DAG.getBitcast(ShortVecVT, Low),
23594 DAG.getBitcast(ShortVecVT, High));
23595
23596 return DAG.getBitcast(VT, V);
23597 }
23598
23599 // The only element type left is i16.
23600 assert(EltVT == MVT::i16 && "Unknown how to handle type");
23601
23602 // To obtain pop count for each i16 element starting from the pop count for
23603 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
23604 // right by 8. It is important to shift as i16s as i8 vector shift isn't
23605 // directly supported.
23606 SDValue ShifterV = DAG.getConstant(8, DL, VT);
23607 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
23608 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
23609 DAG.getBitcast(ByteVecVT, V));
23610 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
23611}
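For the i16 case, the shift-add-shift dance above can be checked per lane in scalar code. This is an illustrative model only (horizontalByteSum16 is not a function from this file):

#include <cstdint>

// The lane already holds two per-byte pop counts. Shifting left by 8 moves the
// low count into the high byte, a byte-wise add forms (low + high) there, and
// the final logical shift right by 8 leaves the sum in the low byte.
static uint16_t horizontalByteSum16(uint16_t Counts) {
  uint16_t Shl = (uint16_t)(Counts << 8);
  uint8_t Sum = (uint8_t)((Shl >> 8) + (Counts >> 8)); // byte-wise add of the high bytes
  return Sum;                                          // same result as the SRL by 8
}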
23612
23613static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
23614 const X86Subtarget &Subtarget,
23615 SelectionDAG &DAG) {
23616 MVT VT = Op.getSimpleValueType();
23617 MVT EltVT = VT.getVectorElementType();
23618 unsigned VecSize = VT.getSizeInBits();
23619
23620 // Implement a lookup table in register by using an algorithm based on:
23621 // http://wm.ite.pl/articles/sse-popcount.html
23622 //
23623 // The general idea is that every lower byte nibble in the input vector is an
23624 // index into an in-register pre-computed pop count table. We then split up the
23625 // input vector into two new ones: (1) a vector with only the shifted-right
23626 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
23627 // masked out higher ones) for each byte. PSHUFB is used separately with both
23628 // to index the in-register table. Next, both are added and the result is an
23629 // i8 vector where each element contains the pop count for its input byte.
23630 //
23631 // To obtain the pop count for elements != i8, we follow up with the same
23632 // approach and use additional tricks as described below.
23633 //
23634 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
23635 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
23636 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
23637 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
23638
23639 int NumByteElts = VecSize / 8;
23640 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
23641 SDValue In = DAG.getBitcast(ByteVecVT, Op);
23642 SmallVector<SDValue, 64> LUTVec;
23643 for (int i = 0; i < NumByteElts; ++i)
23644 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
23645 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
23646 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
23647
23648 // High nibbles
23649 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
23650 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
23651
23652 // Low nibbles
23653 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
23654
23655 // The input vector is used as the shuffle mask that indexes elements into the
23656 // LUT. After counting low and high nibbles, add the vectors to obtain the
23657 // final pop count per i8 element.
23658 SDValue HighPopCnt =
23659 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
23660 SDValue LowPopCnt =
23661 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
23662 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
23663
23664 if (EltVT == MVT::i8)
23665 return PopCnt;
23666
23667 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
23668}
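The per-byte part of the algorithm is easy to model in scalar code: mask off each nibble and look both up in the 16-entry table. Illustrative sketch only (popcount8 is a made-up helper, not part of this file):

#include <cstdint>

// Same table as the LUT above, indexed by a 4-bit nibble.
static const uint8_t PopLUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                   1, 2, 2, 3, 2, 3, 3, 4};

// Two lookups per byte, mirroring the two PSHUFBs on the low and high nibbles,
// followed by the add that forms the per-byte pop count.
static uint8_t popcount8(uint8_t B) {
  return PopLUT[B & 0x0F] + PopLUT[B >> 4];
}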
23669
23670static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
23671 const X86Subtarget &Subtarget,
23672 SelectionDAG &DAG) {
23673 MVT VT = Op.getSimpleValueType();
23674 assert(VT.is128BitVector() &&
23675 "Only 128-bit vector bitmath lowering supported.");
23676
23677 int VecSize = VT.getSizeInBits();
23678 MVT EltVT = VT.getVectorElementType();
23679 int Len = EltVT.getSizeInBits();
23680
23681 // This is the vectorized version of the "best" algorithm from
23682 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
23683 // with a minor tweak to use a series of adds + shifts instead of vector
23684 // multiplications. Implemented for all integer vector types. We only use
23685 // this when we don't have SSSE3 which allows a LUT-based lowering that is
23686 // much faster, even faster than using native popcnt instructions.
23687
23688 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
23689 MVT VT = V.getSimpleValueType();
23690 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
23691 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
23692 };
23693 auto GetMask = [&](SDValue V, APInt Mask) {
23694 MVT VT = V.getSimpleValueType();
23695 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
23696 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
23697 };
23698
23699 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
23700 // x86, so set the SRL type to have elements at least i16 wide. This is
23701 // correct because all of our SRLs are followed immediately by a mask anyway
23702 // that handles any bits that sneak into the high bits of the byte elements.
23703 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
23704
23705 SDValue V = Op;
23706
23707 // v = v - ((v >> 1) & 0x55555555...)
23708 SDValue Srl =
23709 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
23710 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
23711 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
23712
23713 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
23714 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
23715 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
23716 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
23717 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
23718
23719 // v = (v + (v >> 4)) & 0x0F0F0F0F...
23720 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
23721 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
23722 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
23723
23724 // At this point, V contains the byte-wise population count, and we are
23725 // merely doing a horizontal sum if necessary to get the wider element
23726 // counts.
23727 if (EltVT == MVT::i8)
23728 return V;
23729
23730 return LowerHorizontalByteSum(
23731 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
23732 DAG);
23733}
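The vector code above follows the classic bit-twiddling pop count; its scalar form for one 32-bit element is shown below as an illustrative reference only. Note that this scalar version finishes with a multiply, whereas the vector code deliberately uses LowerHorizontalByteSum (PSADBW/shifts) instead.

#include <cstdint>

// Classic "parallel bits" pop count for a single 32-bit value.
static uint32_t popcount32(uint32_t V) {
  V = V - ((V >> 1) & 0x55555555u);                 // count bits in pairs
  V = (V & 0x33333333u) + ((V >> 2) & 0x33333333u); // then in 4-bit groups
  V = (V + (V >> 4)) & 0x0F0F0F0Fu;                 // per-byte counts
  return (V * 0x01010101u) >> 24;                   // horizontal sum of the bytes
}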
23734
23735// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
23736// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
23737static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23738 SelectionDAG &DAG) {
23739 MVT VT = Op.getSimpleValueType();
23740 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
23741 "Unknown CTPOP type to handle");
23742 SDLoc DL(Op.getNode());
23743 SDValue Op0 = Op.getOperand(0);
23744
23745 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
23746 if (Subtarget.hasVPOPCNTDQ()) {
23747 if (VT == MVT::v8i16) {
23748 Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0);
23749 Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op);
23750 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
23751 }
23752 if (VT == MVT::v16i8 || VT == MVT::v16i16) {
23753 Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0);
23754 Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op);
23755 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
23756 }
23757 }
23758
23759 if (!Subtarget.hasSSSE3()) {
23760 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
23761 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
23762 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
23763 }
23764
23765 // Decompose 256-bit ops into smaller 128-bit ops.
23766 if (VT.is256BitVector() && !Subtarget.hasInt256())
23767 return Lower256IntUnary(Op, DAG);
23768
23769 // Decompose 512-bit ops into smaller 256-bit ops.
23770 if (VT.is512BitVector() && !Subtarget.hasBWI())
23771 return Lower512IntUnary(Op, DAG);
23772
23773 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
23774}
23775
23776static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23777 SelectionDAG &DAG) {
23778 assert(Op.getSimpleValueType().isVector() &&
23779 "We only do custom lowering for vector population count.");
23780 return LowerVectorCTPOP(Op, Subtarget, DAG);
23781}
23782
23783static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
23784 MVT VT = Op.getSimpleValueType();
23785 SDValue In = Op.getOperand(0);
23786 SDLoc DL(Op);
23787
23788 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
23789 // perform the BITREVERSE.
23790 if (!VT.isVector()) {
23791 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
23792 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
23793 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
23794 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
23795 DAG.getIntPtrConstant(0, DL));
23796 }
23797
23798 int NumElts = VT.getVectorNumElements();
23799 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
23800
23801 // Decompose 256-bit ops into smaller 128-bit ops.
23802 if (VT.is256BitVector())
23803 return Lower256IntUnary(Op, DAG);
23804
23805 assert(VT.is128BitVector() &&
23806 "Only 128-bit vector bitreverse lowering supported.");
23807
23808 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
23809 // perform the BSWAP in the shuffle.
23810 // It's best to shuffle using the second operand as this will implicitly allow
23811 // memory folding for multiple vectors.
23812 SmallVector<SDValue, 16> MaskElts;
23813 for (int i = 0; i != NumElts; ++i) {
23814 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
23815 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
23816 int PermuteByte = SourceByte | (2 << 5);
23817 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
23818 }
23819 }
23820
23821 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
23822 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
23823 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
23824 Res, Mask);
23825 return DAG.getBitcast(VT, Res);
23826}
23827
23828static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
23829 SelectionDAG &DAG) {
23830 MVT VT = Op.getSimpleValueType();
23831
23832 if (Subtarget.hasXOP() && !VT.is512BitVector())
23833 return LowerBITREVERSE_XOP(Op, DAG);
23834
23835 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
23836
23837 SDValue In = Op.getOperand(0);
23838 SDLoc DL(Op);
23839
23840 unsigned NumElts = VT.getVectorNumElements();
23841 assert(VT.getScalarType() == MVT::i8 &&
23842 "Only byte vector BITREVERSE supported");
23843
23844 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
23845 if (VT.is256BitVector() && !Subtarget.hasInt256())
23846 return Lower256IntUnary(Op, DAG);
23847
23848 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
23849 // two nibbles, and a PSHUFB lookup finds the bitreverse of each
23850 // 0-15 value (moved to the other nibble).
23851 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
23852 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
23853 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
23854
23855 const int LoLUT[16] = {
23856 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
23857 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
23858 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
23859 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
23860 const int HiLUT[16] = {
23861 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
23862 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
23863 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
23864 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
23865
23866 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
23867 for (unsigned i = 0; i < NumElts; ++i) {
23868 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
23869 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
23870 }
23871
23872 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
23873 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
23874 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
23875 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
23876 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
23877}
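The nibble-LUT idea above has a direct scalar analogue: reverse each 4-bit half and swap the halves. LoLUT and HiLUT are exactly these reversed nibbles pre-shifted into their final positions. Illustrative sketch only (bitreverse8 is a made-up helper):

#include <cstdint>

// Bit-reversal of a 4-bit value, e.g. 0b0001 -> 0b1000.
static const uint8_t RevNibble[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
                                      0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};

// Reverse one byte: the reversed low nibble moves to the high half and the
// reversed high nibble to the low half, matching the two PSHUFBs plus the OR.
static uint8_t bitreverse8(uint8_t B) {
  return (uint8_t)((RevNibble[B & 0x0F] << 4) | RevNibble[B >> 4]);
}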
23878
23879static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
23880 const X86Subtarget &Subtarget,
23881 bool AllowIncDec = true) {
23882 unsigned NewOpc = 0;
23883 switch (N->getOpcode()) {
23884 case ISD::ATOMIC_LOAD_ADD:
23885 NewOpc = X86ISD::LADD;
23886 break;
23887 case ISD::ATOMIC_LOAD_SUB:
23888 NewOpc = X86ISD::LSUB;
23889 break;
23890 case ISD::ATOMIC_LOAD_OR:
23891 NewOpc = X86ISD::LOR;
23892 break;
23893 case ISD::ATOMIC_LOAD_XOR:
23894 NewOpc = X86ISD::LXOR;
23895 break;
23896 case ISD::ATOMIC_LOAD_AND:
23897 NewOpc = X86ISD::LAND;
23898 break;
23899 default:
23900 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
23901 }
23902
23903 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
23904
23905 if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
23906 // Convert to inc/dec if they aren't slow or we are optimizing for size.
23907 if (AllowIncDec && (!Subtarget.slowIncDec() ||
23908 DAG.getMachineFunction().getFunction()->optForSize())) {
23909 if ((NewOpc == X86ISD::LADD && C->isOne()) ||
23910 (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
23911 return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
23912 DAG.getVTList(MVT::i32, MVT::Other),
23913 {N->getOperand(0), N->getOperand(1)},
23914 /*MemVT=*/N->getSimpleValueType(0), MMO);
23915 if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
23916 (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
23917 return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
23918 DAG.getVTList(MVT::i32, MVT::Other),
23919 {N->getOperand(0), N->getOperand(1)},
23920 /*MemVT=*/N->getSimpleValueType(0), MMO);
23921 }
23922 }
23923
23924 return DAG.getMemIntrinsicNode(
23925 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
23926 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
23927 /*MemVT=*/N->getSimpleValueType(0), MMO);
23928}
23929
23930/// Lower atomic_load_ops into LOCK-prefixed operations.
23931static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
23932 const X86Subtarget &Subtarget) {
23933 SDValue Chain = N->getOperand(0);
23934 SDValue LHS = N->getOperand(1);
23935 SDValue RHS = N->getOperand(2);
23936 unsigned Opc = N->getOpcode();
23937 MVT VT = N->getSimpleValueType(0);
23938 SDLoc DL(N);
23939
23940 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
23941 // can only be lowered when the result is unused. They should have already
23942 // been transformed into a cmpxchg loop in AtomicExpand.
23943 if (N->hasAnyUseOfValue(0)) {
23944 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
23945 // select LXADD if LOCK_SUB can't be selected.
23946 if (Opc == ISD::ATOMIC_LOAD_SUB) {
23947 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
23948 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
23949 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
23950 RHS, AN->getMemOperand());
23951 }
23952 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
23953 "Used AtomicRMW ops other than Add should have been expanded!");
23954 return N;
23955 }
23956
23957 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
23958 // RAUW the chain, but don't worry about the result, as it's unused.
23959 assert(!N->hasAnyUseOfValue(0));
23960 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
23961 return SDValue();
23962}
23963
23964static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
23965 SDNode *Node = Op.getNode();
23966 SDLoc dl(Node);
23967 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
23968
23969 // Convert seq_cst store -> xchg
23970 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
23971 // FIXME: On 32-bit, store -> fist or movq would be more efficient
23972 // (The only way to get a 16-byte store is cmpxchg16b)
23973 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
23974 if (cast<AtomicSDNode>(Node)->getOrdering() ==
23975 AtomicOrdering::SequentiallyConsistent ||
23976 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
23977 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
23978 cast<AtomicSDNode>(Node)->getMemoryVT(),
23979 Node->getOperand(0),
23980 Node->getOperand(1), Node->getOperand(2),
23981 cast<AtomicSDNode>(Node)->getMemOperand());
23982 return Swap.getValue(1);
23983 }
23984 // Other atomic stores have a simple pattern.
23985 return Op;
23986}
23987
23988static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
23989 SDNode *N = Op.getNode();
23990 MVT VT = N->getSimpleValueType(0);
23991
23992 // Let legalize expand this if it isn't a legal type yet.
23993 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
23994 return SDValue();
23995
23996 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
23997 SDLoc DL(N);
23998
23999 // Set the carry flag.
24000 SDValue Carry = Op.getOperand(2);
24001 EVT CarryVT = Carry.getValueType();
24002 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
24003 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24004 Carry, DAG.getConstant(NegOne, DL, CarryVT));
24005
24006 unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
24007 SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
24008 Op.getOperand(1), Carry.getValue(1));
24009
24010 SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
24011 if (N->getValueType(1) == MVT::i1)
24012 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
24013
24014 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
24015}
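A scalar model of the ADDCARRY node being lowered here (illustrative only; addCarry32 is a made-up helper): add both operands plus the incoming carry, and report the outgoing carry that the code above reads back with SETB.

#include <cstdint>

// 32-bit add with carry-in and carry-out, done in 64-bit arithmetic so the
// carry is simply the bit that falls out of the low 32 bits.
static uint32_t addCarry32(uint32_t A, uint32_t B, uint8_t CarryIn,
                           uint8_t &CarryOut) {
  uint64_t Wide = (uint64_t)A + (uint64_t)B + (CarryIn ? 1u : 0u);
  CarryOut = (uint8_t)(Wide >> 32);
  return (uint32_t)Wide;
}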
24016
24017static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
24018 SelectionDAG &DAG) {
24019 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
24020
24021 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
24022 // which returns the values as { float, float } (in XMM0) or
24023 // { double, double } (which is returned in XMM0, XMM1).
24024 SDLoc dl(Op);
24025 SDValue Arg = Op.getOperand(0);
24026 EVT ArgVT = Arg.getValueType();
24027 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24028
24029 TargetLowering::ArgListTy Args;
24030 TargetLowering::ArgListEntry Entry;
24031
24032 Entry.Node = Arg;
24033 Entry.Ty = ArgTy;
24034 Entry.IsSExt = false;
24035 Entry.IsZExt = false;
24036 Args.push_back(Entry);
24037
24038 bool isF64 = ArgVT == MVT::f64;
24039 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
24040 // the small struct {f32, f32} is returned in (eax, edx). For f64,
24041 // the results are returned via SRet in memory.
24042 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
24043 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24044 SDValue Callee =
24045 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
24046
24047 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
24048 : (Type *)VectorType::get(ArgTy, 4);
24049
24050 TargetLowering::CallLoweringInfo CLI(DAG);
24051 CLI.setDebugLoc(dl)
24052 .setChain(DAG.getEntryNode())
24053 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
24054
24055 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
24056
24057 if (isF64)
24058 // Returned in xmm0 and xmm1.
24059 return CallResult.first;
24060
24061 // Returned in bits 0:31 and 32:64 xmm0.
24062 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
24063 CallResult.first, DAG.getIntPtrConstant(0, dl));
24064 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
24065 CallResult.first, DAG.getIntPtrConstant(1, dl));
24066 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
24067 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
24068}
24069
24070/// Widen a vector input to a vector of NVT. The
24071/// input vector must have the same element type as NVT.
24072static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
24073 bool FillWithZeroes = false) {
24074 // Check if InOp already has the right width.
24075 MVT InVT = InOp.getSimpleValueType();
24076 if (InVT == NVT)
24077 return InOp;
24078
24079 if (InOp.isUndef())
24080 return DAG.getUNDEF(NVT);
24081
24082 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
24083 "input and widen element type must match");
24084
24085 unsigned InNumElts = InVT.getVectorNumElements();
24086 unsigned WidenNumElts = NVT.getVectorNumElements();
24087 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
24088 "Unexpected request for vector widening");
24089
24090 SDLoc dl(InOp);
24091 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
24092 InOp.getNumOperands() == 2) {
24093 SDValue N1 = InOp.getOperand(1);
24094 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
24095 N1.isUndef()) {
24096 InOp = InOp.getOperand(0);
24097 InVT = InOp.getSimpleValueType();
24098 InNumElts = InVT.getVectorNumElements();
24099 }
24100 }
24101 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
24102 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
24103 SmallVector<SDValue, 16> Ops;
24104 for (unsigned i = 0; i < InNumElts; ++i)
24105 Ops.push_back(InOp.getOperand(i));
24106
24107 EVT EltVT = InOp.getOperand(0).getValueType();
24108
24109 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
24110 DAG.getUNDEF(EltVT);
24111 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
24112 Ops.push_back(FillVal);
24113 return DAG.getBuildVector(NVT, dl, Ops);
24114 }
24115 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
24116 DAG.getUNDEF(NVT);
24117 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
24118 InOp, DAG.getIntPtrConstant(0, dl));
24119}
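
A note on the helper just above: ExtendToType is the common widening utility used by the masked gather/scatter/load/store lowerings that follow. It pads a narrow vector out to NVT, either with undef lanes or, when FillWithZeroes is set, with zeros so that newly created mask lanes stay inactive. A minimal usage sketch (the wrapper name and the chosen types are illustrative, not taken from the source):

// Illustrative sketch: widen a v2i32 value to v8i32, zero-filling the new
// lanes. 'Narrow' and 'DAG' are assumed to come from surrounding lowering code.
static SDValue widenToV8I32Example(SDValue Narrow, SelectionDAG &DAG) {
  MVT WideVT = MVT::getVectorVT(MVT::i32, 8);
  return ExtendToType(Narrow, WideVT, DAG, /*FillWithZeroes=*/true);
}
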
24120
24121static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
24122 SelectionDAG &DAG) {
24123   assert(Subtarget.hasAVX512() &&
24124          "MGATHER/MSCATTER are supported on AVX-512 arch only");
24125
24126 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
24127 SDValue Src = N->getValue();
24128 MVT VT = Src.getSimpleValueType();
24129   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
24130 SDLoc dl(Op);
24131
24132 SDValue Index = N->getIndex();
24133 SDValue Mask = N->getMask();
24134 SDValue Chain = N->getChain();
24135 SDValue BasePtr = N->getBasePtr();
24136 MVT MemVT = N->getMemoryVT().getSimpleVT();
24137 MVT IndexVT = Index.getSimpleValueType();
24138 MVT MaskVT = Mask.getSimpleValueType();
24139
24140 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
24141 // The v2i32 value was promoted to v2i64.
24142 // Now we "redo" the type legalizer's work and widen the original
24143 // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
24144 // with a shuffle.
24145     assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
24146            "Unexpected memory type");
24147 int ShuffleMask[] = {0, 2, -1, -1};
24148 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
24149 DAG.getUNDEF(MVT::v4i32), ShuffleMask);
24150 // Now we have 4 elements instead of 2.
24151 // Expand the index.
24152 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
24153 Index = ExtendToType(Index, NewIndexVT, DAG);
24154
24155     // Expand the mask with zeroes.
24156     // The mask may be <2 x i64> or <2 x i1> at this point.
24157     assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
24158            "Unexpected mask type");
24159 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
24160 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
24161 VT = MVT::v4i32;
24162 }
24163
24164 unsigned NumElts = VT.getVectorNumElements();
24165 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
24166 !Index.getSimpleValueType().is512BitVector()) {
24167     // AVX512F supports only 512-bit vectors. Either the data or the index
24168     // must be 512 bits wide. If both the index and the data are 256-bit
24169     // here but the vector has 8 elements, just sign-extend the index.
24170 if (IndexVT == MVT::v8i32)
24171 // Just extend index
24172 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
24173 else {
24174 // The minimal number of elts in scatter is 8
24175 NumElts = 8;
24176 // Index
24177 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
24178 // Use original index here, do not modify the index twice
24179 Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
24180 if (IndexVT.getScalarType() == MVT::i32)
24181 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
24182
24183 // Mask
24184 // At this point we have promoted mask operand
24185       assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
24186 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
24187 // Use the original mask here, do not modify the mask twice
24188 Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
24189
24190 // The value that should be stored
24191 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
24192 Src = ExtendToType(Src, NewVT, DAG);
24193 }
24194 }
24195   // If the mask is "wide" at this point, truncate it to an i1 vector.
24196 MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
24197 Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
24198
24199 // The mask is killed by scatter, add it to the values
24200 SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
24201 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
24202 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
24203 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
24204 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
24205 return SDValue(NewScatter.getNode(), 1);
24206}
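
The {0, 2, -1, -1} shuffle in LowerMSCATTER above is worth spelling out: after type legalization promoted the stored value from v2i32 to v2i64, the two i32 payloads sit in the low halves of the two i64 lanes, i.e. in lanes 0 and 2 of the same bits viewed as v4i32 on little-endian x86. A sketch of just that step (the helper name is mine; the lane reasoning is an editorial annotation):

// Illustrative sketch of the v2i64 -> v4i32 recovery shuffle used above.
static SDValue recoverV2I32Example(SDValue PromotedV2I64, SelectionDAG &DAG,
                                   const SDLoc &dl) {
  int ShuffleMask[] = {0, 2, -1, -1};
  return DAG.getVectorShuffle(MVT::v4i32, dl,
                              DAG.getBitcast(MVT::v4i32, PromotedV2I64),
                              DAG.getUNDEF(MVT::v4i32), ShuffleMask);
}
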
24207
24208static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
24209 SelectionDAG &DAG) {
24210
24211 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
24212 MVT VT = Op.getSimpleValueType();
24213 MVT ScalarVT = VT.getScalarType();
24214 SDValue Mask = N->getMask();
24215 SDLoc dl(Op);
24216
24217   assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
24218          "Expanding masked load is supported on AVX-512 target only!");
24219
24220   assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
24221          "Expanding masked load is supported for 32 and 64-bit types only!");
24222
24223   // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless
24224   // of VLX. Expanding loads of these types are handled below.
24225 if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
24226 return Op;
24227
24228   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
24229          "Cannot lower masked load op.");
24230
24231   assert((ScalarVT.getSizeInBits() >= 32 ||
24232           (Subtarget.hasBWI() &&
24233            (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
24234          "Unsupported masked load op.");
24235
24236   // This operation is legal for targets with VLX, but without
24237   // VLX the vector should be widened to 512 bits.
24238 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
24239 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
24240 SDValue Src0 = N->getSrc0();
24241 Src0 = ExtendToType(Src0, WideDataVT, DAG);
24242
24243 // Mask element has to be i1.
24244 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
24245   assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
24246          "We handle 4x32, 4x64 and 2x64 vectors only in this case");
24247
24248 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
24249
24250 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
24251 if (MaskEltTy != MVT::i1)
24252 Mask = DAG.getNode(ISD::TRUNCATE, dl,
24253 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
24254 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
24255 N->getBasePtr(), Mask, Src0,
24256 N->getMemoryVT(), N->getMemOperand(),
24257 N->getExtensionType(),
24258 N->isExpandingLoad());
24259
24260   SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
24261                                 NewLoad.getValue(0),
24262                                 DAG.getIntPtrConstant(0, dl));
24263   SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
24264 return DAG.getMergeValues(RetOps, dl);
24265}
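
The widening pattern in LowerMLOAD is: pick the 512-bit vector type with the same element type, widen the pass-through value and the mask, emit the wide masked load, then extract the original-width subvector. A sketch of the type arithmetic for a hypothetical v8i32 masked load on AVX-512 without VLX (the example type is an assumption, not taken from the source):

// Illustrative sketch: widened types LowerMLOAD would compute for v8i32.
static void widenedTypesForV8I32Example() {
  unsigned NumEltsInWideVec = 512 / 32;                           // 16
  MVT WideDataVT = MVT::getVectorVT(MVT::i32, NumEltsInWideVec);  // v16i32
  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);   // v16i1
  (void)WideDataVT; (void)WideMaskVT;
}
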
24266
24267static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
24268 SelectionDAG &DAG) {
24269 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
24270 SDValue DataToStore = N->getValue();
24271 MVT VT = DataToStore.getSimpleValueType();
24272 MVT ScalarVT = VT.getScalarType();
24273 SDValue Mask = N->getMask();
24274 SDLoc dl(Op);
24275
24276   assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
24277          "Expanding masked load is supported on AVX-512 target only!");
24278
24279   assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
24280          "Expanding masked load is supported for 32 and 64-bit types only!");
24281
24282   // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
24283 if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
24284 return Op;
24285
24286   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
24287          "Cannot lower masked store op.");
24288
24289   assert((ScalarVT.getSizeInBits() >= 32 ||
24290           (Subtarget.hasBWI() &&
24291            (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
24292          "Unsupported masked store op.");
24293
24294   // This operation is legal for targets with VLX, but without
24295   // VLX the vector should be widened to 512 bits.
24296 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
24297 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
24298
24299 // Mask element has to be i1.
24300 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
24301   assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
24302          "We handle 4x32, 4x64 and 2x64 vectors only in this case");
24303
24304 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
24305
24306 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
24307 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
24308 if (MaskEltTy != MVT::i1)
24309 Mask = DAG.getNode(ISD::TRUNCATE, dl,
24310 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
24311 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
24312 Mask, N->getMemoryVT(), N->getMemOperand(),
24313 N->isTruncatingStore(), N->isCompressingStore());
24314}
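
LowerMLOAD and LowerMSTORE share the same mask normalization: widen with zeros so the new lanes are inactive, then truncate to an i1 vector if the incoming mask was a promoted integer mask. A condensed sketch of that shared step (the helper name is illustrative):

// Illustrative sketch of the shared mask-widening step used above.
static SDValue widenAndNormalizeMask(SDValue Mask, unsigned WideNumElts,
                                     SelectionDAG &DAG, const SDLoc &dl) {
  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, WideNumElts);
  Mask = ExtendToType(Mask, WideMaskVT, DAG, /*FillWithZeroes=*/true);
  if (MaskEltTy != MVT::i1)
    Mask = DAG.getNode(ISD::TRUNCATE, dl,
                       MVT::getVectorVT(MVT::i1, WideNumElts), Mask);
  return Mask;
}
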
24315
24316static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
24317 SelectionDAG &DAG) {
24318   assert(Subtarget.hasAVX2() &&
24319          "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
24320
24321 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
24322 SDLoc dl(Op);
24323 MVT VT = Op.getSimpleValueType();
24324 SDValue Index = N->getIndex();
24325 SDValue Mask = N->getMask();
24326 SDValue Src0 = N->getValue();
24327 MVT IndexVT = Index.getSimpleValueType();
24328 MVT MaskVT = Mask.getSimpleValueType();
24329
24330 unsigned NumElts = VT.getVectorNumElements();
24331   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
24332
24333 // If the index is v2i32, we're being called by type legalization.
24334 if (IndexVT == MVT::v2i32)
24335 return SDValue();
24336
24337 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
24338 !Index.getSimpleValueType().is512BitVector()) {
24339     // AVX512F supports only 512-bit vectors. Either the data or the index
24340     // must be 512 bits wide. If both the index and the data are 256-bit
24341     // here but the vector has 8 elements, just sign-extend the index.
24342 if (NumElts == 8) {
24343 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
24344 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
24345 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24346 DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
24347 N->getMemOperand());
24348 return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
24349 }
24350
24351 // Minimal number of elements in Gather
24352 NumElts = 8;
24353 // Index
24354 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
24355 Index = ExtendToType(Index, NewIndexVT, DAG);
24356 if (IndexVT.getScalarType() == MVT::i32)
24357 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
24358
24359 // Mask
24360 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
24361 // At this point we have promoted mask operand
24362     assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
24363 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
24364 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
24365 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
24366
24367 // The pass-through value
24368 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
24369 Src0 = ExtendToType(Src0, NewVT, DAG);
24370
24371 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
24372 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24373 DAG.getVTList(NewVT, MaskBitVT, MVT::Other), Ops, dl, N->getMemoryVT(),
24374 N->getMemOperand());
24375 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
24376 NewGather.getValue(0),
24377 DAG.getIntPtrConstant(0, dl));
24378 SDValue RetOps[] = {Extract, NewGather.getValue(2)};
24379 return DAG.getMergeValues(RetOps, dl);
24380 }
24381
24382 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
24383 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24384 DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
24385 N->getMemOperand());
24386 return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
24387}
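
For reference, the X86MaskedGatherSDNode built in LowerMGATHER uses the operand order {Chain, PassThru (Src0), Mask, BasePtr, Index} and produces {gathered data, mask, chain}; only the data (value 0) and the chain (value 2) are handed back to the caller. A small sketch of that final merge (the helper name is illustrative):

// Illustrative sketch: hand back only the gathered data and the chain.
static SDValue mergeGatherResultsExample(SDValue NewGather, SelectionDAG &DAG,
                                         const SDLoc &dl) {
  return DAG.getMergeValues({NewGather.getValue(0), NewGather.getValue(2)}, dl);
}
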
24388
24389SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
24390 SelectionDAG &DAG) const {
24391 // TODO: Eventually, the lowering of these nodes should be informed by or
24392 // deferred to the GC strategy for the function in which they appear. For
24393 // now, however, they must be lowered to something. Since they are logically
24394 // no-ops in the case of a null GC strategy (or a GC strategy which does not
24395 // require special handling for these nodes), lower them as literal NOOPs for
24396 // the time being.
24397 SmallVector<SDValue, 2> Ops;
24398
24399 Ops.push_back(Op.getOperand(0));
24400 if (Op->getGluedNode())
24401 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24402
24403 SDLoc OpDL(Op);
24404 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24405 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
24406
24407 return NOOP;
24408}
24409
24410SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
24411 SelectionDAG &DAG) const {
24412 // TODO: Eventually, the lowering of these nodes should be informed by or
24413 // deferred to the GC strategy for the function in which they appear. For
24414 // now, however, they must be lowered to something. Since they are logically
24415 // no-ops in the case of a null GC strategy (or a GC strategy which does not
24416 // require special handling for these nodes), lower them as literal NOOPs for
24417 // the time being.
24418 SmallVector<SDValue, 2> Ops;
24419
24420 Ops.push_back(Op.getOperand(0));
24421 if (Op->getGluedNode())
24422 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24423
24424 SDLoc OpDL(Op);
24425 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24426 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
24427
24428 return NOOP;
24429}
24430
24431/// Provide custom lowering hooks for some operations.
24432SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
24433 switch (Op.getOpcode()) {
24434   default: llvm_unreachable("Should not custom lower this!");
24435 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
24436 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
24437 return LowerCMP_SWAP(Op, Subtarget, DAG);
24438 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
24439 case ISD::ATOMIC_LOAD_ADD:
24440 case ISD::ATOMIC_LOAD_SUB:
24441 case ISD::ATOMIC_LOAD_OR:
24442 case ISD::ATOMIC_LOAD_XOR:
24443 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
24444 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
24445 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
24446 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
24447 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
24448 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
24449 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
24450 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
24451 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
24452 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
24453 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
24454 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
24455 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
24456 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
24457 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
24458 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
24459 case ISD::SHL_PARTS:
24460 case ISD::SRA_PARTS:
24461 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
24462 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
24463 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
24464 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
24465 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
24466 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
24467 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
24468 case ISD::ZERO_EXTEND_VECTOR_INREG:
24469 case ISD::SIGN_EXTEND_VECTOR_INREG:
24470 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
24471 case ISD::FP_TO_SINT:
24472 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
24473 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
24474 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
24475 case ISD::FABS:
24476 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
24477 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
24478 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
24479 case ISD::SETCC: return LowerSETCC(Op, DAG);
24480 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
24481 case ISD::SELECT: return LowerSELECT(Op, DAG);
24482 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
24483 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
24484 case ISD::VASTART: return LowerVASTART(Op, DAG);
24485 case ISD::VAARG: return LowerVAARG(Op, DAG);
24486 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
24487 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
24488 case ISD::INTRINSIC_VOID:
24489 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
24490 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
24491 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
24492 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
24493 case ISD::FRAME_TO_ARGS_OFFSET:
24494 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
24495 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
24496 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
24497 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
24498 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
24499 case ISD::EH_SJLJ_SETUP_DISPATCH:
24500 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
24501 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
24502 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
24503 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
24504 case ISD::CTLZ:
24505 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
24506 case ISD::CTTZ:
24507 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
24508 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
24509 case ISD::MULHS:
24510 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
24511 case ISD::UMUL_LOHI:
24512 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
24513 case ISD::ROTL:
24514 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
24515 case ISD::SRA:
24516 case ISD::SRL:
24517 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
24518 case ISD::SADDO:
24519 case ISD::UADDO:
24520 case ISD::SSUBO:
24521 case ISD::USUBO:
24522 case ISD::SMULO:
24523 case ISD::UMULO: return LowerXALUO(Op, DAG);
24524 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
24525 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
24526 case ISD::ADDCARRY:
24527 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
24528 case ISD::ADD:
24529 case ISD::SUB: return LowerADD_SUB(Op, DAG);
24530 case ISD::SMAX:
24531 case ISD::SMIN:
24532 case ISD::UMAX:
24533 case ISD::UMIN: return LowerMINMAX(Op, DAG);
24534 case ISD::ABS: return LowerABS(Op, DAG);
24535 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
24536 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
24537 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
24538 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
24539 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
24540 case ISD::GC_TRANSITION_START:
24541 return LowerGC_TRANSITION_START(Op, DAG);
24542 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
24543 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
24544 }
24545}
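
LowerOperation is only reached for (opcode, type) pairs that the target marked Custom; the switch above then routes each opcode to its dedicated lowering routine. A fragment sketch of how a pair gets marked, as it would appear inside the X86TargetLowering constructor (the specific pairs are illustrative, not quoted from the real constructor):

// Constructor-body fragment (illustrative pairs only):
setOperationAction(ISD::MGATHER,  MVT::v8f32, Custom);
setOperationAction(ISD::MSCATTER, MVT::v8f32, Custom);
// Legalization then hands any matching node to LowerOperation above.
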
24546
24547/// Places new result values for the node in Results (their number
24548/// and types must exactly match those of the original return values of
24549/// the node), or leaves Results empty, which indicates that the node is not
24550/// to be custom lowered after all.
24551void X86TargetLowering::LowerOperationWrapper(SDNode *N,
24552 SmallVectorImpl<SDValue> &Results,
24553 SelectionDAG &DAG) const {
24554 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
24555
24556 if (!Res.getNode())
24557 return;
24558
24559   assert((N->getNumValues() <= Res->getNumValues()) &&
24560          "Lowering returned the wrong number of results!");
24561
24562   // Place new result values based on the result number of N.
24563   // In some cases (LowerSINT_TO_FP for example) Res has more result values
24564   // than the original node; the chain (the last value) should be dropped.
24565 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
24566 Results.push_back(Res.getValue(I));
24567}
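
The loop above deliberately copies only N->getNumValues() results: if the custom lowering produced extra values (typically a trailing chain, as the comment notes for LowerSINT_TO_FP), they are dropped. A compact restatement as a standalone helper (a sketch, not part of the source):

// Illustrative sketch of the copy-and-drop behavior described above.
static void copyLoweredResults(SDNode *N, SDValue Res,
                               SmallVectorImpl<SDValue> &Results) {
  for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
    Results.push_back(Res.getValue(I));
}
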
24568
24569/// Replace a node with an illegal result type with a new node built out of
24570/// custom code.
24571void X86TargetLowering::ReplaceNodeResults(SDNode *N,
24572 SmallVectorImpl<SDValue>&Results,
24573 SelectionDAG &DAG) const {
24574 SDLoc dl(N);
24575 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24576 switch (N->getOpcode()) {
24577 default:
24578     llvm_unreachable("Do not know how to custom type legalize this operation!");
24579 case X86ISD::AVG: {
24580 // Legalize types for X86ISD::AVG by expanding vectors.
24581     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24582
24583 auto InVT = N->getValueType(0);
24584 auto InVTSize = InVT.getSizeInBits();
24585 const unsigned RegSize =
24586 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
24587     assert((Subtarget.hasBWI() || RegSize < 512) &&
24588            "512-bit vector requires AVX512BW");
24589     assert((Subtarget.hasAVX2() || RegSize < 256) &&
24590            "256-bit vector requires AVX2");
24591
24592 auto ElemVT = InVT.getVectorElementType();
24593 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
24594 RegSize / ElemVT.getSizeInBits());
24595     assert(RegSize % InVT.getSizeInBits() == 0);
24596 unsigned NumConcat = RegSize / InVT.getSizeInBits();
24597
24598 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
24599 Ops[0] = N->getOperand(0);
24600 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24601 Ops[0] = N->getOperand(1);
24602 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24603
24604 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
24605 if (!ExperimentalVectorWideningLegalization)
24606 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
24607 DAG.getIntPtrConstant(0, dl));
24608 Results.push_back(Res);
24609 return;
24610 }
24611 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
24612 case X86ISD::FMINC:
24613 case X86ISD::FMIN:
24614 case X86ISD::FMAXC:
24615 case X86ISD::FMAX: {
24616 EVT VT = N->getValueType(0);
24617     assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
24618 SDValue UNDEF = DAG.getUNDEF(VT);
24619 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24620 N->getOperand(0), UNDEF);
24621 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24622 N->getOperand(1), UNDEF);
24623 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
24624 return;
24625 }
24626 case ISD::SDIV:
24627 case ISD::UDIV:
24628 case ISD::SREM:
24629 case ISD::UREM:
24630 case ISD::SDIVREM:
24631 case ISD::UDIVREM: {
24632 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
24633 Results.push_back(V);
24634 return;
24635 }
24636 case ISD::FP_TO_SINT:
24637 case ISD::FP_TO_UINT: {
24638 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
24639
24640 if (N->getValueType(0) == MVT::v2i32) {
24641       assert((IsSigned || Subtarget.hasAVX512()) &&
24642              "Can only handle signed conversion without AVX512");
24643       assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24644 SDValue Src = N->getOperand(0);
24645 if (Src.getValueType() == MVT::v2f64) {
24646 SDValue Idx = DAG.getIntPtrConstant(0, dl);
24647 SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
24648 : X86ISD::CVTTP2UI,
24649 dl, MVT::v4i32, Src);
24650 if (!ExperimentalVectorWideningLegalization)
24651 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
24652 Results.push_back(Res);
24653 return;
24654 }
24655 if (Src.getValueType() == MVT::v2f32) {
24656 SDValue Idx = DAG.getIntPtrConstant(0, dl);
24657 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
24658 DAG.getUNDEF(MVT::v2f32));
24659 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
24660 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
24661 if (!ExperimentalVectorWideningLegalization)
24662 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
24663 Results.push_back(Res);
24664 return;
24665 }
24666
24667 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
24668 // so early out here.
24669 return;
24670 }
24671
24672 std::pair<SDValue,SDValue> Vals =
24673 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
24674 SDValue FIST = Vals.first, StackSlot = Vals.second;
24675 if (FIST.getNode()) {
24676 EVT VT = N->getValueType(0);
24677 // Return a load from the stack slot.
24678 if (StackSlot.getNode())
24679 Results.push_back(
24680 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
24681 else
24682 Results.push_back(FIST);
24683 }
24684 return;
24685 }
24686 case ISD::SINT_TO_FP: {
24687     assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
24688 SDValue Src = N->getOperand(0);
24689 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
24690 return;
24691 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
24692 return;
24693 }
24694 case ISD::UINT_TO_FP: {
24695     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24696 EVT VT = N->getValueType(0);
24697 if (VT != MVT::v2f32)
24698 return;
24699 SDValue Src = N->getOperand(0);
24700 EVT SrcVT = Src.getValueType();
24701 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
24702 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
24703 return;
24704 }
24705 if (SrcVT != MVT::v2i32)
24706 return;
24707 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
24708 SDValue VBias =
24709 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
24710 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
24711 DAG.getBitcast(MVT::v2i64, VBias));
24712 Or = DAG.getBitcast(MVT::v2f64, Or);
24713 // TODO: Are there any fast-math-flags to propagate here?
24714 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
24715 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
24716 return;
24717 }
24718 case ISD::FP_ROUND: {
24719 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
24720 return;
24721 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
24722 Results.push_back(V);
24723 return;
24724 }
24725 case ISD::FP_EXTEND: {
24726 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
24727 // No other ValueType for FP_EXTEND should reach this point.
24728     assert(N->getValueType(0) == MVT::v2f32 &&
24729            "Do not know how to legalize this Node");
24730 return;
24731 }
24732 case ISD::INTRINSIC_W_CHAIN: {
24733 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
24734 switch (IntNo) {
24735     default : llvm_unreachable("Do not know how to custom type "
24736                                "legalize this intrinsic operation!");
24737 case Intrinsic::x86_rdtsc:
24738 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24739 Results);
24740 case Intrinsic::x86_rdtscp:
24741 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
24742 Results);
24743 case Intrinsic::x86_rdpmc:
24744 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
24745
24746 case Intrinsic::x86_xgetbv:
24747 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
24748 }
24749 }
24750 case ISD::INTRINSIC_WO_CHAIN: {
24751 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
24752 Results.push_back(V);
24753 return;
24754 }
24755 case ISD::READCYCLECOUNTER: {
24756 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24757 Results);
24758 }
24759 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
24760 EVT T = N->getValueType(0);
24761     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
24762 bool Regs64bit = T == MVT::i128;
24763 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
24764 SDValue cpInL, cpInH;
24765 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24766 DAG.getConstant(0, dl, HalfT));
24767 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24768 DAG.getConstant(1, dl, HalfT));
24769 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
24770 Regs64bit ? X86::RAX : X86::EAX,
24771 cpInL, SDValue());
24772 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
24773 Regs64bit ? X86::RDX : X86::EDX,
24774 cpInH, cpInL.getValue(1));
24775 SDValue swapInL, swapInH;
24776 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24777 DAG.getConstant(0, dl, HalfT));
24778 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24779 DAG.getConstant(1, dl, HalfT));
24780 swapInH =
24781 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
24782 swapInH, cpInH.getValue(1));
24783     // If the current function needs the base pointer, RBX,
24784     // we shouldn't use cmpxchg directly.
24785     // The lowering of that instruction will clobber that register,
24786     // and since RBX will then be a reserved register,
24787     // the register allocator will not make sure its value is
24788     // properly saved and restored around this live-range.
24789 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
24790 SDValue Result;
24791 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24792 unsigned BasePtr = TRI->getBaseRegister();
24793 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
24794 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
24795 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
24796       // ISel prefers the LCMPXCHG64 variant.
24797       // If that assert breaks, it means this is no longer the case,
24798       // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
24799       // not just EBX. This is a matter of accepting i64 input for that
24800       // pseudo, and restoring into the register of the right width
24801       // when expanding the pseudo. Everything else should just work.
24802       assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
24803              "Saving only half of the RBX");
24804 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
24805 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
24806 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
24807 Regs64bit ? X86::RBX : X86::EBX,
24808 HalfT, swapInH.getValue(1));
24809 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
24810 RBXSave,
24811 /*Glue*/ RBXSave.getValue(2)};
24812 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24813 } else {
24814 unsigned Opcode =
24815 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
24816 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
24817 Regs64bit ? X86::RBX : X86::EBX, swapInL,
24818 swapInH.getValue(1));
24819 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
24820 swapInL.getValue(1)};
24821 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24822 }
24823 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
24824 Regs64bit ? X86::RAX : X86::EAX,
24825 HalfT, Result.getValue(1));
24826 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
24827 Regs64bit ? X86::RDX : X86::EDX,
24828 HalfT, cpOutL.getValue(2));
24829 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
24830
24831 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
24832 MVT::i32, cpOutH.getValue(2));
24833 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
24834 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
24835
24836 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
24837 Results.push_back(Success);
24838 Results.push_back(EFLAGS.getValue(1));
24839 return;
24840 }
24841 case ISD::ATOMIC_SWAP:
24842 case ISD::ATOMIC_LOAD_ADD:
24843 case ISD::ATOMIC_LOAD_SUB:
24844 case ISD::ATOMIC_LOAD_AND:
24845 case ISD::ATOMIC_LOAD_OR:
24846 case ISD::ATOMIC_LOAD_XOR:
24847 case ISD::ATOMIC_LOAD_NAND:
24848 case ISD::ATOMIC_LOAD_MIN:
24849 case ISD::ATOMIC_LOAD_MAX:
24850 case ISD::ATOMIC_LOAD_UMIN:
24851 case ISD::ATOMIC_LOAD_UMAX:
24852 case ISD::ATOMIC_LOAD: {
24853 // Delegate to generic TypeLegalization. Situations we can really handle
24854 // should have already been dealt with by AtomicExpandPass.cpp.
24855 break;
24856 }
24857 case ISD::BITCAST: {
24858     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24859 EVT DstVT = N->getValueType(0);
24860 EVT SrcVT = N->getOperand(0)->getValueType(0);
24861
24862 if (SrcVT != MVT::f64 ||
24863 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
24864 return;
24865
24866 unsigned NumElts = DstVT.getVectorNumElements();
24867 EVT SVT = DstVT.getVectorElementType();
24868 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
24869 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
24870 MVT::v2f64, N->getOperand(0));
24871 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
24872
24873 if (ExperimentalVectorWideningLegalization) {
24874 // If we are legalizing vectors by widening, we already have the desired
24875 // legal vector type, just return it.
24876 Results.push_back(ToVecInt);
24877 return;
24878 }
24879
24880 SmallVector<SDValue, 8> Elts;
24881 for (unsigned i = 0, e = NumElts; i != e; ++i)
24882 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
24883 ToVecInt, DAG.getIntPtrConstant(i, dl)));
24884
24885 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
24886 return;
24887 }
24888 case ISD::MGATHER: {
24889 EVT VT = N->getValueType(0);
24890 if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
24891 auto *Gather = cast<MaskedGatherSDNode>(N);
24892 SDValue Index = Gather->getIndex();
24893 if (Index.getValueType() != MVT::v2i64)
24894 return;
24895 SDValue Mask = Gather->getMask();
24896       assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
24897 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24898 Gather->getValue(),
24899 DAG.getUNDEF(MVT::v2f32));
24900 if (!Subtarget.hasVLX()) {
24901 // We need to widen the mask, but the instruction will only use 2
24902 // of its elements. So we can use undef.
24903 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
24904 DAG.getUNDEF(MVT::v2i1));
24905 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
24906 }
24907 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
24908 Index };
24909 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24910 DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
24911 Gather->getMemoryVT(), Gather->getMemOperand());
24912 Results.push_back(Res);
24913 Results.push_back(Res.getValue(2));
24914 return;
24915 }
24916 if (VT == MVT::v2i32) {
24917 auto *Gather = cast<MaskedGatherSDNode>(N);
24918 SDValue Index = Gather->getIndex();
24919 SDValue Mask = Gather->getMask();
24920 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
24921 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
24922 Gather->getValue(),
24923 DAG.getUNDEF(MVT::v2i32));
24924 // If the index is v2i64 we can use it directly.
24925 if (Index.getValueType() == MVT::v2i64 &&
24926 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
24927 if (!Subtarget.hasVLX()) {
24928 // We need to widen the mask, but the instruction will only use 2
24929 // of its elements. So we can use undef.
24930 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
24931 DAG.getUNDEF(MVT::v2i1));
24932 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
24933 }
24934 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
24935 Index };
24936 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24937 DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
24938 Gather->getMemoryVT(), Gather->getMemOperand());
24939 SDValue Chain = Res.getValue(2);
24940 if (!ExperimentalVectorWideningLegalization)
24941 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
24942 DAG.getIntPtrConstant(0, dl));
24943 Results.push_back(Res);
24944 Results.push_back(Chain);
24945 return;
24946 }
24947 EVT IndexVT = Index.getValueType();
24948 EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
24949 IndexVT.getScalarType(), 4);
24950 // Otherwise we need to custom widen everything to avoid promotion.
24951 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
24952 DAG.getUNDEF(IndexVT));
24953 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
24954 DAG.getConstant(0, dl, MVT::v2i1));
24955 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
24956 Index };
24957 SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
24958 Gather->getMemoryVT(), dl, Ops,
24959 Gather->getMemOperand());
24960 SDValue Chain = Res.getValue(1);
24961 if (!ExperimentalVectorWideningLegalization)
24962 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
24963 DAG.getIntPtrConstant(0, dl));
24964 Results.push_back(Res);
24965 Results.push_back(Chain);
24966 return;
24967 }
24968 break;
24969 }
24970 }
24971}
24972
24973const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
24974 switch ((X86ISD::NodeType)Opcode) {
24975 case X86ISD::FIRST_NUMBER: break;
24976 case X86ISD::BSF: return "X86ISD::BSF";
24977 case X86ISD::BSR: return "X86ISD::BSR";
24978 case X86ISD::SHLD: return "X86ISD::SHLD";
24979 case X86ISD::SHRD: return "X86ISD::SHRD";
24980 case X86ISD::FAND: return "X86ISD::FAND";
24981 case X86ISD::FANDN: return "X86ISD::FANDN";
24982 case X86ISD::FOR: return "X86ISD::FOR";
24983 case X86ISD::FXOR: return "X86ISD::FXOR";
24984 case X86ISD::FILD: return "X86ISD::FILD";
24985 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
24986 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
24987 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
24988 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
24989 case X86ISD::FLD: return "X86ISD::FLD";
24990 case X86ISD::FST: return "X86ISD::FST";
24991 case X86ISD::CALL: return "X86ISD::CALL";
24992 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
24993 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
24994 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
24995 case X86ISD::BT: return "X86ISD::BT";
24996 case X86ISD::CMP: return "X86ISD::CMP";
24997 case X86ISD::COMI: return "X86ISD::COMI";
24998 case X86ISD::UCOMI: return "X86ISD::UCOMI";
24999 case X86ISD::CMPM: return "X86ISD::CMPM";
25000 case X86ISD::CMPMU: return "X86ISD::CMPMU";
25001 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
25002 case X86ISD::SETCC: return "X86ISD::SETCC";
25003 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
25004 case X86ISD::FSETCC: return "X86ISD::FSETCC";
25005 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
25006 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
25007 case X86ISD::CMOV: return "X86ISD::CMOV";
25008 case X86ISD::BRCOND: return "X86ISD::BRCOND";
25009 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
25010 case X86ISD::IRET: return "X86ISD::IRET";
25011 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
25012 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
25013 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
25014 case X86ISD::Wrapper: return "X86ISD::Wrapper";
25015 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
25016 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
25017 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
25018 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
25019 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
25020 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
25021 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
25022 case X86ISD::PINSRB: return "X86ISD::PINSRB";
25023 case X86ISD::PINSRW: return "X86ISD::PINSRW";
25024 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
25025 case X86ISD::ANDNP: return "X86ISD::ANDNP";
25026 case X86ISD::BLENDI: return "X86ISD::BLENDI";
25027 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
25028 case X86ISD::ADDUS: return "X86ISD::ADDUS";
25029 case X86ISD::SUBUS: return "X86ISD::SUBUS";
25030 case X86ISD::HADD: return "X86ISD::HADD";
25031 case X86ISD::HSUB: return "X86ISD::HSUB";
25032 case X86ISD::FHADD: return "X86ISD::FHADD";
25033 case X86ISD::FHSUB: return "X86ISD::FHSUB";
25034 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
25035 case X86ISD::FMAX: return "X86ISD::FMAX";
25036 case X86ISD::FMAXS: return "X86ISD::FMAXS";
25037 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
25038 case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
25039 case X86ISD::FMIN: return "X86ISD::FMIN";
25040 case X86ISD::FMINS: return "X86ISD::FMINS";
25041 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
25042 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
25043 case X86ISD::FMAXC: return "X86ISD::FMAXC";
25044 case X86ISD::FMINC: return "X86ISD::FMINC";
25045 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
25046 case X86ISD::FRCP: return "X86ISD::FRCP";
25047 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
25048 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
25049 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
25050 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
25051 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
25052 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
25053 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
25054 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
25055 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
25056 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
25057 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
25058 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
25059 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
25060 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
25061 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
25062 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
25063 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
25064 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
25065 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
25066 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
25067 case X86ISD::LADD: return "X86ISD::LADD";
25068 case X86ISD::LSUB: return "X86ISD::LSUB";
25069 case X86ISD::LOR: return "X86ISD::LOR";
25070 case X86ISD::LXOR: return "X86ISD::LXOR";
25071 case X86ISD::LAND: return "X86ISD::LAND";
25072 case X86ISD::LINC: return "X86ISD::LINC";
25073 case X86ISD::LDEC: return "X86ISD::LDEC";
25074 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
25075 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
25076 case X86ISD::VZEXT: return "X86ISD::VZEXT";
25077 case X86ISD::VSEXT: return "X86ISD::VSEXT";
25078 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
25079 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
25080 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
25081 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
25082 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
25083 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
25084 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
25085 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
25086 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
25087 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
25088 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
25089 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
25090 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
25091 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
25092 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
25093 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
25094 case X86ISD::VSHL: return "X86ISD::VSHL";
25095 case X86ISD::VSRL: return "X86ISD::VSRL";
25096 case X86ISD::VSRA: return "X86ISD::VSRA";
25097 case X86ISD::VSHLI: return "X86ISD::VSHLI";
25098 case X86ISD::VSRLI: return "X86ISD::VSRLI";
25099 case X86ISD::VSRAI: return "X86ISD::VSRAI";
25100 case X86ISD::VSRAV: return "X86ISD::VSRAV";
25101 case X86ISD::VROTLI: return "X86ISD::VROTLI";
25102 case X86ISD::VROTRI: return "X86ISD::VROTRI";
25103 case X86ISD::VPPERM: return "X86ISD::VPPERM";
25104 case X86ISD::CMPP: return "X86ISD::CMPP";
25105 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
25106 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
25107 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
25108 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
25109 case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
25110 case X86ISD::ADD: return "X86ISD::ADD";
25111 case X86ISD::SUB: return "X86ISD::SUB";
25112 case X86ISD::ADC: return "X86ISD::ADC";
25113 case X86ISD::SBB: return "X86ISD::SBB";
25114 case X86ISD::SMUL: return "X86ISD::SMUL";
25115 case X86ISD::UMUL: return "X86ISD::UMUL";
25116 case X86ISD::SMUL8: return "X86ISD::SMUL8";
25117 case X86ISD::UMUL8: return "X86ISD::UMUL8";
25118 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
25119 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
25120 case X86ISD::INC: return "X86ISD::INC";
25121 case X86ISD::DEC: return "X86ISD::DEC";
25122 case X86ISD::OR: return "X86ISD::OR";
25123 case X86ISD::XOR: return "X86ISD::XOR";
25124 case X86ISD::AND: return "X86ISD::AND";
25125 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
25126 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
25127 case X86ISD::PTEST: return "X86ISD::PTEST";
25128 case X86ISD::TESTP: return "X86ISD::TESTP";
25129 case X86ISD::TESTM: return "X86ISD::TESTM";
25130 case X86ISD::TESTNM: return "X86ISD::TESTNM";
25131 case X86ISD::KORTEST: return "X86ISD::KORTEST";
25132 case X86ISD::KTEST: return "X86ISD::KTEST";
25133 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
25134 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
25135 case X86ISD::PACKSS: return "X86ISD::PACKSS";
25136 case X86ISD::PACKUS: return "X86ISD::PACKUS";
25137 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
25138 case X86ISD::VALIGN: return "X86ISD::VALIGN";
25139 case X86ISD::VSHLD: return "X86ISD::VSHLD";
25140 case X86ISD::VSHRD: return "X86ISD::VSHRD";
25141 case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
25142 case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
25143 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
25144 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
25145 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
25146 case X86ISD::SHUFP: return "X86ISD::SHUFP";
25147 case X86ISD::SHUF128: return "X86ISD::SHUF128";
25148 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
25149 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
25150 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
25151 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
25152 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
25153 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
25154 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
25155 case X86ISD::MOVSD: return "X86ISD::MOVSD";
25156 case X86ISD::MOVSS: return "X86ISD::MOVSS";
25157 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
25158 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
25159 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
25160 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
25161 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
25162 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
25163 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
25164 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
25165 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
25166 case X86ISD::VPERMV: return "X86ISD::VPERMV";
25167 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
25168 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
25169 case X86ISD::VPERMI: return "X86ISD::VPERMI";
25170 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
25171 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
25172 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
25173 case X86ISD::VRANGE: return "X86ISD::VRANGE";
25174 case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
25175 case X86ISD::VRANGES: return "X86ISD::VRANGES";
25176 case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
25177 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
25178 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
25179 case X86ISD::PSADBW: return "X86ISD::PSADBW";
25180 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
25181 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
25182 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
25183 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
25184 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
25185 case X86ISD::MFENCE: return "X86ISD::MFENCE";
25186 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
25187 case X86ISD::SAHF: return "X86ISD::SAHF";
25188 case X86ISD::RDRAND: return "X86ISD::RDRAND";
25189 case X86ISD::RDSEED: return "X86ISD::RDSEED";
25190 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
25191 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
25192 case X86ISD::VPSHA: return "X86ISD::VPSHA";
25193 case X86ISD::VPSHL: return "X86ISD::VPSHL";
25194 case X86ISD::VPCOM: return "X86ISD::VPCOM";
25195 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
25196 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
25197 case X86ISD::FMSUB: return "X86ISD::FMSUB";
25198 case X86ISD::FNMADD: return "X86ISD::FNMADD";
25199 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
25200 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
25201 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
25202 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
25203 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
25204 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
25205 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
25206 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
25207 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
25208 case X86ISD::FMADDS1: return "X86ISD::FMADDS1";
25209 case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";
25210 case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";
25211 case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";
25212 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
25213 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
25214 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
25215 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
25216 case X86ISD::FMADDS3: return "X86ISD::FMADDS3";
25217 case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";
25218 case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";
25219 case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";
25220 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
25221 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
25222 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
25223 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
25224 case X86ISD::FMADD4S: return "X86ISD::FMADD4S";
25225 case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S";
25226 case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S";
25227 case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S";
25228 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
25229 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
25230 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
25231 case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
25232 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
25233 case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
25234 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
25235 case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
25236 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
25237 case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
25238 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
25239 case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
25240 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
25241 case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
25242 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
25243 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
25244 case X86ISD::XTEST: return "X86ISD::XTEST";
25245 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
25246 case X86ISD::EXPAND: return "X86ISD::EXPAND";
25247 case X86ISD::SELECT: return "X86ISD::SELECT";
25248 case X86ISD::SELECTS: return "X86ISD::SELECTS";
25249 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
25250 case X86ISD::RCP14: return "X86ISD::RCP14";
25251 case X86ISD::RCP14S: return "X86ISD::RCP14S";
25252 case X86ISD::RCP28: return "X86ISD::RCP28";
25253 case X86ISD::RCP28S: return "X86ISD::RCP28S";
25254 case X86ISD::EXP2: return "X86ISD::EXP2";
25255 case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
25256 case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
25257 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
25258 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
25259 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
25260 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
25261 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
25262 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
25263 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
25264 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
25265 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
25266 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
25267 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
25268 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
25269 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
25270 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
25271 case X86ISD::SCALEF: return "X86ISD::SCALEF";
25272 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
25273 case X86ISD::ADDS: return "X86ISD::ADDS";
25274 case X86ISD::SUBS: return "X86ISD::SUBS";
25275 case X86ISD::AVG: return "X86ISD::AVG";
25276 case X86ISD::MULHRS: return "X86ISD::MULHRS";
25277 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
25278 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
25279 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
25280 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
25281 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
25282 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
25283 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
25284 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
25285 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
25286 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
25287 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
25288 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
25289 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
25290 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
25291 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
25292 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
25293 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
25294 case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
25295 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
25296 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
25297 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
25298 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
25299 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
25300 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
25301 case X86ISD::LWPINS: return "X86ISD::LWPINS";
25302 case X86ISD::MGATHER: return "X86ISD::MGATHER";
25303 case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
25304 case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
25305 case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
25306 case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
25307 case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
25308 case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
25309 case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
25310 case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
25311 case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
25312 }
25313 return nullptr;
25314}
25315
25316/// Return true if the addressing mode represented by AM is legal for this
25317/// target, for a load/store of the specified type.
25318bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
25319 const AddrMode &AM, Type *Ty,
25320 unsigned AS,
25321 Instruction *I) const {
25322 // X86 supports extremely general addressing modes.
25323 CodeModel::Model M = getTargetMachine().getCodeModel();
25324
25325 // X86 allows a sign-extended 32-bit immediate field as a displacement.
25326 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
25327 return false;
25328
25329 if (AM.BaseGV) {
25330 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
25331
25332 // If a reference to this global requires an extra load, we can't fold it.
25333 if (isGlobalStubReference(GVFlags))
25334 return false;
25335
25336 // If BaseGV requires a register for the PIC base, we cannot also have a
25337 // BaseReg specified.
25338 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
25339 return false;
25340
25341 // If lower 4G is not available, then we must use rip-relative addressing.
25342 if ((M != CodeModel::Small || isPositionIndependent()) &&
25343 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
25344 return false;
25345 }
25346
25347 switch (AM.Scale) {
25348 case 0:
25349 case 1:
25350 case 2:
25351 case 4:
25352 case 8:
25353 // These scales always work.
25354 break;
25355 case 3:
25356 case 5:
25357 case 9:
25358 // These scales are formed with basereg+scalereg. Only accept if there is
25359 // no basereg yet.
25360 if (AM.HasBaseReg)
25361 return false;
25362 break;
25363 default: // Other stuff never works.
25364 return false;
25365 }
25366
25367 return true;
25368}
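// [Editorial sketch, not part of the original file] How a client might probe
// these rules through the hook above. The AddrMode fields (BaseGV, BaseOffs,
// HasBaseReg, Scale) are the real TargetLowering::AddrMode members; TLI, DL
// and Ty stand for a lowering object, data layout and type assumed to be in
// scope.
//
//   TargetLowering::AddrMode AM;
//   AM.BaseGV = nullptr;
//   AM.BaseOffs = 40;      // sign-extended 32-bit displacement: accepted
//   AM.HasBaseReg = true;  // models [base + index*Scale + 40]
//   AM.Scale = 4;          // 0/1/2/4/8 always fold; 3/5/9 only with no base reg
//   bool Legal = TLI.isLegalAddressingMode(DL, AM, Ty, /*AS=*/0, /*I=*/nullptr);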
25369
25370bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
25371 unsigned Bits = Ty->getScalarSizeInBits();
25372
25373 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
25374 // particularly cheaper than those without.
25375 if (Bits == 8)
25376 return false;
25377
25378 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
25379 // shifts just as cheap as scalar ones.
25380 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
25381 return false;
25382
25383 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
25384 // fully general vector.
25385 return true;
25386}
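// [Editorial sketch, hypothetical IR] The distinction drawn above, assuming a
// pre-AVX2 target:
//
//   %r = shl <4 x i32> %v, <i32 5, i32 5, i32 5, i32 5>  ; uniform amount:
//                                                        ; one PSLLD, cheap
//   %r = shl <4 x i32> %v, %amts                         ; per-lane amounts:
//                                                        ; scalarized unless
//                                                        ; AVX2 VPSLLVD exists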
25387
25388bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
25389 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
25390 return false;
25391 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
25392 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
25393 return NumBits1 > NumBits2;
25394}
25395
25396bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
25397 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
25398 return false;
25399
25400 if (!isTypeLegal(EVT::getEVT(Ty1)))
25401 return false;
25402
25403 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
25404
25405 // Assuming the caller doesn't have a zeroext or signext return parameter,
25406 // truncation all the way down to i1 is valid.
25407 return true;
25408}
25409
25410bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
25411 return isInt<32>(Imm);
25412}
25413
25414bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
25415 // Can also use sub to handle negated immediates.
25416 return isInt<32>(Imm);
25417}
25418
25419bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
25420 if (!VT1.isInteger() || !VT2.isInteger())
25421 return false;
25422 unsigned NumBits1 = VT1.getSizeInBits();
25423 unsigned NumBits2 = VT2.getSizeInBits();
25424 return NumBits1 > NumBits2;
25425}
25426
25427bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
25428 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
25429 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
25430}
25431
25432bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
25433 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
25434 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
25435}
25436
25437bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
25438 EVT VT1 = Val.getValueType();
25439 if (isZExtFree(VT1, VT2))
25440 return true;
25441
25442 if (Val.getOpcode() != ISD::LOAD)
25443 return false;
25444
25445 if (!VT1.isSimple() || !VT1.isInteger() ||
25446 !VT2.isSimple() || !VT2.isInteger())
25447 return false;
25448
25449 switch (VT1.getSimpleVT().SimpleTy) {
25450 default: break;
25451 case MVT::i8:
25452 case MVT::i16:
25453 case MVT::i32:
25454 // X86 has 8, 16, and 32-bit zero-extending loads.
25455 return true;
25456 }
25457
25458 return false;
25459}
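// [Editorial note, not part of the original file] Why those widths are free:
// a hypothetical i8 load feeding a zext, e.g.
//
//   %b = load i8, i8* %p
//   %w = zext i8 %b to i32      ; folds into a single movzbl (%rdi), %eax
//
// becomes one zero-extending load, and any write to a 32-bit register already
// clears bits 63:32, so a subsequent i32 -> i64 zext costs nothing.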
25460
25461bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
25462
25463bool
25464X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
25465 if (!Subtarget.hasAnyFMA())
25466 return false;
25467
25468 VT = VT.getScalarType();
25469
25470 if (!VT.isSimple())
25471 return false;
25472
25473 switch (VT.getSimpleVT().SimpleTy) {
25474 case MVT::f32:
25475 case MVT::f64:
25476 return true;
25477 default:
25478 break;
25479 }
25480
25481 return false;
25482}
25483
25484bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
25485 // i16 instructions are longer (0x66 prefix) and potentially slower.
25486 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
25487}
25488
25489/// Targets can use this to indicate that they only support *some*
25490/// VECTOR_SHUFFLE operations, those with specific masks.
25491/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
25492/// are assumed to be legal.
25493bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
25494 if (!VT.isSimple())
25495 return false;
25496
25497 // Not for i1 vectors
25498 if (VT.getSimpleVT().getScalarType() == MVT::i1)
25499 return false;
25500
25501 // Very little shuffling can be done for 64-bit vectors right now.
25502 if (VT.getSimpleVT().getSizeInBits() == 64)
25503 return false;
25504
25505 // We only care that the types being shuffled are legal. The lowering can
25506 // handle any possible shuffle mask that results.
25507 return isTypeLegal(VT.getSimpleVT());
25508}
25509
25510bool
25511X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
25512 EVT VT) const {
25513 // Just delegate to the generic legality, clear masks aren't special.
25514 return isShuffleMaskLegal(Mask, VT);
25515}
25516
25517//===----------------------------------------------------------------------===//
25518// X86 Scheduler Hooks
25519//===----------------------------------------------------------------------===//
25520
25521/// Utility function to emit xbegin specifying the start of an RTM region.
25522static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
25523 const TargetInstrInfo *TII) {
25524 DebugLoc DL = MI.getDebugLoc();
25525
25526 const BasicBlock *BB = MBB->getBasicBlock();
25527 MachineFunction::iterator I = ++MBB->getIterator();
25528
25529 // For the v = xbegin(), we generate
25530 //
25531 // thisMBB:
25532 // xbegin sinkMBB
25533 //
25534 // mainMBB:
25535 // s0 = -1
25536 //
25537 // fallBB:
25538 // eax = # XABORT_DEF
25539 // s1 = eax
25540 //
25541 // sinkMBB:
25542 // v = phi(s0/mainBB, s1/fallBB)
25543
25544 MachineBasicBlock *thisMBB = MBB;
25545 MachineFunction *MF = MBB->getParent();
25546 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25547 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
25548 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25549 MF->insert(I, mainMBB);
25550 MF->insert(I, fallMBB);
25551 MF->insert(I, sinkMBB);
25552
25553 // Transfer the remainder of BB and its successor edges to sinkMBB.
25554 sinkMBB->splice(sinkMBB->begin(), MBB,
25555 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25556 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
25557
25558 MachineRegisterInfo &MRI = MF->getRegInfo();
25559 unsigned DstReg = MI.getOperand(0).getReg();
25560 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25561 unsigned mainDstReg = MRI.createVirtualRegister(RC);
25562 unsigned fallDstReg = MRI.createVirtualRegister(RC);
25563
25564 // thisMBB:
25565 // xbegin fallMBB
25566 // # fallthrough to mainMBB
25567 // # on abort, control transfers to fallMBB
25568 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
25569 thisMBB->addSuccessor(mainMBB);
25570 thisMBB->addSuccessor(fallMBB);
25571
25572 // mainMBB:
25573 // mainDstReg := -1
25574 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
25575 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
25576 mainMBB->addSuccessor(sinkMBB);
25577
25578 // fallMBB:
25579 // ; pseudo instruction to model hardware's definition from XABORT
25580 // EAX := XABORT_DEF
25581 // fallDstReg := EAX
25582 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
25583 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
25584 .addReg(X86::EAX);
25585 fallMBB->addSuccessor(sinkMBB);
25586
25587 // sinkMBB:
25588 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
25589 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
25590 .addReg(mainDstReg).addMBB(mainMBB)
25591 .addReg(fallDstReg).addMBB(fallMBB);
25592
25593 MI.eraseFromParent();
25594 return sinkMBB;
25595}
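// [Editorial sketch, not part of the original file] The source-level pattern
// this expansion serves, written with the RTM intrinsics from <immintrin.h>
// (assumed to be compiled with -mrtm):
//
//   unsigned Status = _xbegin();      // becomes the xbegin/PHI shape above
//   if (Status == _XBEGIN_STARTED) {  // mainMBB path: the PHI sees -1
//     /* transactional work */
//     _xend();
//   }
//   // otherwise Status holds the abort code that fallMBB copies out of EAX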
25596
25597// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
25598// or XMM0_V32I8 in AVX all of this code can be replaced with that
25599// in the .td file.
25600static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
25601 const TargetInstrInfo *TII) {
25602 unsigned Opc;
25603 switch (MI.getOpcode()) {
25604 default: llvm_unreachable("illegal opcode!");
25605 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
25606 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
25607 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
25608 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
25609 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
25610 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
25611 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
25612 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
25613 }
25614
25615 DebugLoc dl = MI.getDebugLoc();
25616 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25617
25618 unsigned NumArgs = MI.getNumOperands();
25619 for (unsigned i = 1; i < NumArgs; ++i) {
25620 MachineOperand &Op = MI.getOperand(i);
25621 if (!(Op.isReg() && Op.isImplicit()))
25622 MIB.add(Op);
25623 }
25624 if (MI.hasOneMemOperand())
25625 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25626
25627 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25628 .addReg(X86::XMM0);
25629
25630 MI.eraseFromParent();
25631 return BB;
25632}
25633
25634// FIXME: Custom handling because TableGen doesn't support multiple implicit
25635// defs in an instruction pattern
25636static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
25637 const TargetInstrInfo *TII) {
25638 unsigned Opc;
25639 switch (MI.getOpcode()) {
25640 default: llvm_unreachable("illegal opcode!");
25641 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
25642 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
25643 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
25644 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
25645 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
25646 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
25647 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
25648 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
25649 }
25650
25651 DebugLoc dl = MI.getDebugLoc();
25652 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25653
25654 unsigned NumArgs = MI.getNumOperands(); // remove the results
25655 for (unsigned i = 1; i < NumArgs; ++i) {
25656 MachineOperand &Op = MI.getOperand(i);
25657 if (!(Op.isReg() && Op.isImplicit()))
25658 MIB.add(Op);
25659 }
25660 if (MI.hasOneMemOperand())
25661 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25662
25663 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25664 .addReg(X86::ECX);
25665
25666 MI.eraseFromParent();
25667 return BB;
25668}
25669
25670static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25671 const X86Subtarget &Subtarget) {
25672 DebugLoc dl = MI.getDebugLoc();
25673 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25674
25675 // insert input VAL into EAX
25676 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
25677 .addReg(MI.getOperand(0).getReg());
25678 // insert zero to ECX
25679 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25680
25681 // insert zero to EDX
25682 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
25683
25684 // insert WRPKRU instruction
25685 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
25686
25687 MI.eraseFromParent(); // The pseudo is gone now.
25688 return BB;
25689}
25690
25691static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25692 const X86Subtarget &Subtarget) {
25693 DebugLoc dl = MI.getDebugLoc();
25694 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25695
25696 // insert zero to ECX
25697 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25698
25699 // insert RDPKRU instruction
25700 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
25701 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25702 .addReg(X86::EAX);
25703
25704 MI.eraseFromParent(); // The pseudo is gone now.
25705 return BB;
25706}
25707
25708static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
25709 const X86Subtarget &Subtarget,
25710 unsigned Opc) {
25711 DebugLoc dl = MI.getDebugLoc();
25712 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25713 // Address into RAX/EAX, other two args into ECX, EDX.
25714 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
25715 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
25716 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
25717 for (int i = 0; i < X86::AddrNumOperands; ++i)
25718 MIB.add(MI.getOperand(i));
25719
25720 unsigned ValOps = X86::AddrNumOperands;
25721 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
25722 .addReg(MI.getOperand(ValOps).getReg());
25723 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
25724 .addReg(MI.getOperand(ValOps + 1).getReg());
25725
25726 // The instruction doesn't actually take any operands though.
25727 BuildMI(*BB, MI, dl, TII->get(Opc));
25728
25729 MI.eraseFromParent(); // The pseudo is gone now.
25730 return BB;
25731}
25732
25733static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
25734 const X86Subtarget &Subtarget) {
25735 DebugLoc dl = MI->getDebugLoc();
25736 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25737 // Address into RAX/EAX
25738 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
25739 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
25740 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
25741 for (int i = 0; i < X86::AddrNumOperands; ++i)
25742 MIB.add(MI->getOperand(i));
25743
25744 // The instruction doesn't actually take any operands though.
25745 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
25746
25747 MI->eraseFromParent(); // The pseudo is gone now.
25748 return BB;
25749}
25750
25751
25752
25753MachineBasicBlock *
25754X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
25755 MachineBasicBlock *MBB) const {
25756 // Emit va_arg instruction on X86-64.
25757
25758 // Operands to this pseudo-instruction:
25759 // 0 ) Output : destination address (reg)
25760 // 1-5) Input : va_list address (addr, i64mem)
25761 // 6 ) ArgSize : Size (in bytes) of vararg type
25762 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
25763 // 8 ) Align : Alignment of type
25764 // 9 ) EFLAGS (implicit-def)
25765
25766 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
25767 static_assert(X86::AddrNumOperands == 5,
25768 "VAARG_64 assumes 5 address operands");
25769
25770 unsigned DestReg = MI.getOperand(0).getReg();
25771 MachineOperand &Base = MI.getOperand(1);
25772 MachineOperand &Scale = MI.getOperand(2);
25773 MachineOperand &Index = MI.getOperand(3);
25774 MachineOperand &Disp = MI.getOperand(4);
25775 MachineOperand &Segment = MI.getOperand(5);
25776 unsigned ArgSize = MI.getOperand(6).getImm();
25777 unsigned ArgMode = MI.getOperand(7).getImm();
25778 unsigned Align = MI.getOperand(8).getImm();
25779
25780 // Memory Reference
25781 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
25782 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25783 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25784
25785 // Machine Information
25786 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25787 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
25788 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
25789 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
25790 DebugLoc DL = MI.getDebugLoc();
25791
25792 // struct va_list {
25793 // i32 gp_offset
25794 // i32 fp_offset
25795 // i64 overflow_area (address)
25796 // i64 reg_save_area (address)
25797 // }
25798 // sizeof(va_list) = 24
25799 // alignment(va_list) = 8
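// (Editorial note: with this SysV x86-64 layout, the displacements used below
//  are gp_offset at +0, fp_offset at +4, overflow_area at +8, and
//  reg_save_area at +16.)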
25800
25801 unsigned TotalNumIntRegs = 6;
25802 unsigned TotalNumXMMRegs = 8;
25803 bool UseGPOffset = (ArgMode == 1);
25804 bool UseFPOffset = (ArgMode == 2);
25805 unsigned MaxOffset = TotalNumIntRegs * 8 +
25806 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
25807
25808 // Align ArgSize to a multiple of 8.
25809 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
25810 bool NeedsAlign = (Align > 8);
25811
25812 MachineBasicBlock *thisMBB = MBB;
25813 MachineBasicBlock *overflowMBB;
25814 MachineBasicBlock *offsetMBB;
25815 MachineBasicBlock *endMBB;
25816
25817 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
25818 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
25819 unsigned OffsetReg = 0;
25820
25821 if (!UseGPOffset && !UseFPOffset) {
25822 // If we only pull from the overflow region, we don't create a branch.
25823 // We don't need to alter control flow.
25824 OffsetDestReg = 0; // unused
25825 OverflowDestReg = DestReg;
25826
25827 offsetMBB = nullptr;
25828 overflowMBB = thisMBB;
25829 endMBB = thisMBB;
25830 } else {
25831 // First emit code to check if gp_offset (or fp_offset) is below the bound.
25832 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
25833 // If not, pull from overflow_area. (branch to overflowMBB)
25834 //
25835 // thisMBB
25836 // | .
25837 // | .
25838 // offsetMBB overflowMBB
25839 // | .
25840 // | .
25841 // endMBB
25842
25843 // Registers for the PHI in endMBB
25844 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
25845 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
25846
25847 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25848 MachineFunction *MF = MBB->getParent();
25849 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25850 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25851 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25852
25853 MachineFunction::iterator MBBIter = ++MBB->getIterator();
25854
25855 // Insert the new basic blocks
25856 MF->insert(MBBIter, offsetMBB);
25857 MF->insert(MBBIter, overflowMBB);
25858 MF->insert(MBBIter, endMBB);
25859
25860 // Transfer the remainder of MBB and its successor edges to endMBB.
25861 endMBB->splice(endMBB->begin(), thisMBB,
25862 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
25863 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
25864
25865 // Make offsetMBB and overflowMBB successors of thisMBB
25866 thisMBB->addSuccessor(offsetMBB);
25867 thisMBB->addSuccessor(overflowMBB);
25868
25869 // endMBB is a successor of both offsetMBB and overflowMBB
25870 offsetMBB->addSuccessor(endMBB);
25871 overflowMBB->addSuccessor(endMBB);
25872
25873 // Load the offset value into a register
25874 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25875 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
25876 .add(Base)
25877 .add(Scale)
25878 .add(Index)
25879 .addDisp(Disp, UseFPOffset ? 4 : 0)
25880 .add(Segment)
25881 .setMemRefs(MMOBegin, MMOEnd);
25882
25883 // Check if there is enough room left to pull this argument.
25884 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
25885 .addReg(OffsetReg)
25886 .addImm(MaxOffset + 8 - ArgSizeA8);
25887
25888 // Branch to "overflowMBB" if offset >= max
25889 // Fall through to "offsetMBB" otherwise
25890 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
25891 .addMBB(overflowMBB);
25892 }
25893
25894 // In offsetMBB, emit code to use the reg_save_area.
25895 if (offsetMBB) {
25896 assert(OffsetReg != 0);
25897
25898 // Read the reg_save_area address.
25899 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
25900 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
25901 .add(Base)
25902 .add(Scale)
25903 .add(Index)
25904 .addDisp(Disp, 16)
25905 .add(Segment)
25906 .setMemRefs(MMOBegin, MMOEnd);
25907
25908 // Zero-extend the offset
25909 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
25910 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
25911 .addImm(0)
25912 .addReg(OffsetReg)
25913 .addImm(X86::sub_32bit);
25914
25915 // Add the offset to the reg_save_area to get the final address.
25916 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
25917 .addReg(OffsetReg64)
25918 .addReg(RegSaveReg);
25919
25920 // Compute the offset for the next argument
25921 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25922 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
25923 .addReg(OffsetReg)
25924 .addImm(UseFPOffset ? 16 : 8);
25925
25926 // Store it back into the va_list.
25927 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
25928 .add(Base)
25929 .add(Scale)
25930 .add(Index)
25931 .addDisp(Disp, UseFPOffset ? 4 : 0)
25932 .add(Segment)
25933 .addReg(NextOffsetReg)
25934 .setMemRefs(MMOBegin, MMOEnd);
25935
25936 // Jump to endMBB
25937 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
25938 .addMBB(endMBB);
25939 }
25940
25941 //
25942 // Emit code to use overflow area
25943 //
25944
25945 // Load the overflow_area address into a register.
25946 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
25947 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
25948 .add(Base)
25949 .add(Scale)
25950 .add(Index)
25951 .addDisp(Disp, 8)
25952 .add(Segment)
25953 .setMemRefs(MMOBegin, MMOEnd);
25954
25955 // If we need to align it, do so. Otherwise, just copy the address
25956 // to OverflowDestReg.
25957 if (NeedsAlign) {
25958 // Align the overflow address
25959 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
25960 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
25961
25962 // aligned_addr = (addr + (align-1)) & ~(align-1)
25963 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
25964 .addReg(OverflowAddrReg)
25965 .addImm(Align-1);
25966
25967 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
25968 .addReg(TmpReg)
25969 .addImm(~(uint64_t)(Align-1));
25970 } else {
25971 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
25972 .addReg(OverflowAddrReg);
25973 }
25974
25975 // Compute the next overflow address after this argument.
25976 // (the overflow address should be kept 8-byte aligned)
25977 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
25978 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
25979 .addReg(OverflowDestReg)
25980 .addImm(ArgSizeA8);
25981
25982 // Store the new overflow address.
25983 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
25984 .add(Base)
25985 .add(Scale)
25986 .add(Index)
25987 .addDisp(Disp, 8)
25988 .add(Segment)
25989 .addReg(NextAddrReg)
25990 .setMemRefs(MMOBegin, MMOEnd);
25991
25992 // If we branched, emit the PHI to the front of endMBB.
25993 if (offsetMBB) {
25994 BuildMI(*endMBB, endMBB->begin(), DL,
25995 TII->get(X86::PHI), DestReg)
25996 .addReg(OffsetDestReg).addMBB(offsetMBB)
25997 .addReg(OverflowDestReg).addMBB(overflowMBB);
25998 }
25999
26000 // Erase the pseudo instruction
26001 MI.eraseFromParent();
26002
26003 return endMBB;
26004}
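// [Editorial sketch, not part of the original file] The control flow emitted
// above, as pseudo-C for a va_arg of double (ArgMode == 2, ArgSizeA8 == 8, so
// the bound is MaxOffset + 8 - 8 == 176); align_up is a hypothetical helper:
//
//   if (ap->fp_offset < 6*8 + 8*16) {               // thisMBB: room left?
//     addr = ap->reg_save_area + ap->fp_offset;     // offsetMBB
//     ap->fp_offset += 16;
//   } else {
//     addr = align_up(ap->overflow_area, Align);    // overflowMBB
//     ap->overflow_area = addr + 8;                 //   (aligned only if > 8)
//   }
//   // endMBB: DestReg = phi(addr from whichever path ran)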
26005
26006MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
26007 MachineInstr &MI, MachineBasicBlock *MBB) const {
26008 // Emit code to save XMM registers to the stack. The ABI says that the
26009 // number of registers to save is given in %al, so it's theoretically
26010 // possible to do an indirect jump trick to avoid saving all of them,
26011 // however this code takes a simpler approach and just executes all
26012 // of the stores if %al is non-zero. It's less code, and it's probably
26013 // easier on the hardware branch predictor, and stores aren't all that
26014 // expensive anyway.
26015
26016 // Create the new basic blocks. One block contains all the XMM stores,
26017 // and one block is the final destination regardless of whether any
26018 // stores were performed.
26019 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26020 MachineFunction *F = MBB->getParent();
26021 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26022 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
26023 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
26024 F->insert(MBBIter, XMMSaveMBB);
26025 F->insert(MBBIter, EndMBB);
26026
26027 // Transfer the remainder of MBB and its successor edges to EndMBB.
26028 EndMBB->splice(EndMBB->begin(), MBB,
26029 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26030 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
26031
26032 // The original block will now fall through to the XMM save block.
26033 MBB->addSuccessor(XMMSaveMBB);
26034 // The XMMSaveMBB will fall through to the end block.
26035 XMMSaveMBB->addSuccessor(EndMBB);
26036
26037 // Now add the instructions.
26038 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26039 DebugLoc DL = MI.getDebugLoc();
26040
26041 unsigned CountReg = MI.getOperand(0).getReg();
26042 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
26043 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
26044
26045 if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
26046 // If %al is 0, branch around the XMM save block.
26047 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
26048 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
26049 MBB->addSuccessor(EndMBB);
26050 }
26051
26052 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
26053 // that was just emitted, but clearly shouldn't be "saved".
26054 assert((MI.getNumOperands() <= 3 ||
26055 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
26056 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
26057 "Expected last argument to be EFLAGS");
26058 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
26059 // In the XMM save block, save all the XMM argument registers.
26060 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
26061 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
26062 MachineMemOperand *MMO = F->getMachineMemOperand(
26063 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
26064 MachineMemOperand::MOStore,
26065 /*Size=*/16, /*Align=*/16);
26066 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
26067 .addFrameIndex(RegSaveFrameIndex)
26068 .addImm(/*Scale=*/1)
26069 .addReg(/*IndexReg=*/0)
26070 .addImm(/*Disp=*/Offset)
26071 .addReg(/*Segment=*/0)
26072 .addReg(MI.getOperand(i).getReg())
26073 .addMemOperand(MMO);
26074 }
26075
26076 MI.eraseFromParent(); // The pseudo instruction is gone now.
26077
26078 return EndMBB;
26079}
26080
26081// The EFLAGS operand of SelectItr might be missing a kill marker
26082// because there were multiple uses of EFLAGS, and ISel didn't know
26083// which to mark. Figure out whether SelectItr should have had a
26084// kill marker, and set it if it should. Returns the correct kill
26085// marker value.
26086static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
26087 MachineBasicBlock* BB,
26088 const TargetRegisterInfo* TRI) {
26089 // Scan forward through BB for a use/def of EFLAGS.
26090 MachineBasicBlock::iterator miI(std::next(SelectItr));
26091 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
26092 const MachineInstr& mi = *miI;
26093 if (mi.readsRegister(X86::EFLAGS))
26094 return false;
26095 if (mi.definesRegister(X86::EFLAGS))
26096 break; // Should have kill-flag - update below.
26097 }
26098
26099 // If we hit the end of the block, check whether EFLAGS is live into a
26100 // successor.
26101 if (miI == BB->end()) {
26102 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
26103 sEnd = BB->succ_end();
26104 sItr != sEnd; ++sItr) {
26105 MachineBasicBlock* succ = *sItr;
26106 if (succ->isLiveIn(X86::EFLAGS))
26107 return false;
26108 }
26109 }
26110
26111 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
26112 // out. SelectMI should have a kill flag on EFLAGS.
26113 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
26114 return true;
26115}
26116
26117// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
26118// together with other CMOV pseudo-opcodes into a single basic-block with
26119// conditional jump around it.
26120static bool isCMOVPseudo(MachineInstr &MI) {
26121 switch (MI.getOpcode()) {
26122 case X86::CMOV_FR32:
26123 case X86::CMOV_FR64:
26124 case X86::CMOV_GR8:
26125 case X86::CMOV_GR16:
26126 case X86::CMOV_GR32:
26127 case X86::CMOV_RFP32:
26128 case X86::CMOV_RFP64:
26129 case X86::CMOV_RFP80:
26130 case X86::CMOV_V2F64:
26131 case X86::CMOV_V2I64:
26132 case X86::CMOV_V4F32:
26133 case X86::CMOV_V4F64:
26134 case X86::CMOV_V4I64:
26135 case X86::CMOV_V16F32:
26136 case X86::CMOV_V8F32:
26137 case X86::CMOV_V8F64:
26138 case X86::CMOV_V8I64:
26139 case X86::CMOV_V8I1:
26140 case X86::CMOV_V16I1:
26141 case X86::CMOV_V32I1:
26142 case X86::CMOV_V64I1:
26143 return true;
26144
26145 default:
26146 return false;
26147 }
26148}
26149
26150 // Helper function which inserts PHI nodes into SinkMBB:
26151 // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
26152 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
26153 // in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
26154 // the last PHI inserted.
26155static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
26156 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
26157 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
26158 MachineBasicBlock *SinkMBB) {
26159 MachineFunction *MF = TrueMBB->getParent();
26160 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
26161 DebugLoc DL = MIItBegin->getDebugLoc();
26162
26163 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
26164 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
26165
26166 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
26167
26168 // As we are creating the PHIs, we have to be careful if there is more than
26169 // one. Later CMOVs may reference the results of earlier CMOVs, but later
26170 // PHIs have to reference the individual true/false inputs from earlier PHIs.
26171 // That also means that PHI construction must work forward from earlier to
26172 // later, and that the code must maintain a mapping from each earlier PHI's
26173 // destination register to the registers that went into that PHI.
26174 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
26175 MachineInstrBuilder MIB;
26176
26177 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
26178 unsigned DestReg = MIIt->getOperand(0).getReg();
26179 unsigned Op1Reg = MIIt->getOperand(1).getReg();
26180 unsigned Op2Reg = MIIt->getOperand(2).getReg();
26181
26182 // If this CMOV we are generating is the opposite condition from
26183 // the jump we generated, then we have to swap the operands for the
26184 // PHI that is going to be generated.
26185 if (MIIt->getOperand(3).getImm() == OppCC)
26186 std::swap(Op1Reg, Op2Reg);
26187
26188 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
26189 Op1Reg = RegRewriteTable[Op1Reg].first;
26190
26191 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
26192 Op2Reg = RegRewriteTable[Op2Reg].second;
26193
26194 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
26195 .addReg(Op1Reg)
26196 .addMBB(FalseMBB)
26197 .addReg(Op2Reg)
26198 .addMBB(TrueMBB);
26199
26200 // Add this PHI to the rewrite table.
26201 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
26202 }
26203
26204 return MIB;
26205}
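// A small worked example of the rewrite table above (invented vregs):
// lowering
//   %t2 = CMOV %t1, %f1, cc
//   %t3 = CMOV %t2, %f2, cc
// the first iteration emits %t2 = PHI [%t1, FalseMBB], [%f1, TrueMBB] and
// records RegRewriteTable[%t2] = (%t1, %f1). The second iteration looks up
// its %t2 operand and substitutes %t1, giving
//   %t3 = PHI [%t1, FalseMBB], [%f2, TrueMBB]
// so no PHI operand ever names a PHI result defined in the same block.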
26206
26207 // Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
26208MachineBasicBlock *
26209X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
26210 MachineInstr &SecondCascadedCMOV,
26211 MachineBasicBlock *ThisMBB) const {
26212 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26213 DebugLoc DL = FirstCMOV.getDebugLoc();
26214
26215 // We lower cascaded CMOVs such as
26216 //
26217 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
26218 //
26219 // to two successive branches.
26220 //
26221 // Without this, we would add a PHI between the two jumps, which ends up
26222 // creating a few copies all around. For instance, for
26223 //
26224 // (sitofp (zext (fcmp une)))
26225 //
26226 // we would generate:
26227 //
26228 // ucomiss %xmm1, %xmm0
26229 // movss <1.0f>, %xmm0
26230 // movaps %xmm0, %xmm1
26231 // jne .LBB5_2
26232 // xorps %xmm1, %xmm1
26233 // .LBB5_2:
26234 // jp .LBB5_4
26235 // movaps %xmm1, %xmm0
26236 // .LBB5_4:
26237 // retq
26238 //
26239 // because this custom-inserter would have generated:
26240 //
26241 // A
26242 // | \
26243 // | B
26244 // | /
26245 // C
26246 // | \
26247 // | D
26248 // | /
26249 // E
26250 //
26251 // A: X = ...; Y = ...
26252 // B: empty
26253 // C: Z = PHI [X, A], [Y, B]
26254 // D: empty
26255 // E: PHI [X, C], [Z, D]
26256 //
26257 // If we lower both CMOVs in a single step, we can instead generate:
26258 //
26259 // A
26260 // | \
26261 // | C
26262 // | /|
26263 // |/ |
26264 // | |
26265 // | D
26266 // | /
26267 // E
26268 //
26269 // A: X = ...; Y = ...
26270 // D: empty
26271 // E: PHI [X, A], [X, C], [Y, D]
26272 //
26273 // Which, in our sitofp/fcmp example, gives us something like:
26274 //
26275 // ucomiss %xmm1, %xmm0
26276 // movss <1.0f>, %xmm0
26277 // jne .LBB5_4
26278 // jp .LBB5_4
26279 // xorps %xmm0, %xmm0
26280 // .LBB5_4:
26281 // retq
26282 //
26283
26284 // We lower cascaded CMOV into two successive branches to the same block.
26285 // EFLAGS is used by both, so mark it as live in the second.
26286 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
26287 MachineFunction *F = ThisMBB->getParent();
26288 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
26289 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
26290 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
26291
26292 MachineFunction::iterator It = ++ThisMBB->getIterator();
26293 F->insert(It, FirstInsertedMBB);
26294 F->insert(It, SecondInsertedMBB);
26295 F->insert(It, SinkMBB);
26296
26297 // For a cascaded CMOV, we lower it to two successive branches to
26298 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
26299 // the FirstInsertedMBB.
26300 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
26301
26302 // If the EFLAGS register isn't dead in the terminator, then claim that it's
26303 // live into the sink and copy blocks.
26304 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26305 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
26306 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
26307 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
26308 SinkMBB->addLiveIn(X86::EFLAGS);
26309 }
26310
26311 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
26312 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
26313 std::next(MachineBasicBlock::iterator(FirstCMOV)),
26314 ThisMBB->end());
26315 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
26316
26317 // Fallthrough block for ThisMBB.
26318 ThisMBB->addSuccessor(FirstInsertedMBB);
26319 // The true block target of the first branch is always SinkMBB.
26320 ThisMBB->addSuccessor(SinkMBB);
26321 // Fallthrough block for FirstInsertedMBB.
26322 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
26323 // The true block for the branch of FirstInsertedMBB.
26324 FirstInsertedMBB->addSuccessor(SinkMBB);
26325 // SecondInsertedMBB falls through to SinkMBB.
26326 SecondInsertedMBB->addSuccessor(SinkMBB);
26327
26328 // Create the conditional branch instructions.
26329 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
26330 unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
26331 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
26332
26333 X86::CondCode SecondCC =
26334 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
26335 unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
26336 BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
26337
26338 // SinkMBB:
26339 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
26340 unsigned DestReg = FirstCMOV.getOperand(0).getReg();
26341 unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
26342 unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
26343 MachineInstrBuilder MIB =
26344 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
26345 .addReg(Op1Reg)
26346 .addMBB(SecondInsertedMBB)
26347 .addReg(Op2Reg)
26348 .addMBB(ThisMBB);
26349
26350 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
26351 // (the True operand of the SELECT_CC/CMOV nodes).
26352 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
26353 // Copy the PHI result to the register defined by the second CMOV.
26354 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
26355 TII->get(TargetOpcode::COPY),
26356 SecondCascadedCMOV.getOperand(0).getReg())
26357 .addReg(FirstCMOV.getOperand(0).getReg());
26358
26359 // Now remove the CMOVs.
26360 FirstCMOV.eraseFromParent();
26361 SecondCascadedCMOV.eraseFromParent();
26362
26363 return SinkMBB;
26364}
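// The cascaded pattern handled above typically originates from a select on an
// unordered floating-point compare, e.g. (illustrative IR only):
//
//   %c = fcmp une float %x, %y
//   %v = select i1 %c, float 1.0, float 0.0
//
// which legalizes to a CMOV on the NE condition feeding a second CMOV on the
// P (unordered) condition, matching the sitofp/fcmp example in the comment
// above.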
26365
26366MachineBasicBlock *
26367X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
26368 MachineBasicBlock *ThisMBB) const {
26369 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26370 DebugLoc DL = MI.getDebugLoc();
26371
26372 // To "insert" a SELECT_CC instruction, we actually have to insert the
26373 // diamond control-flow pattern. The incoming instruction knows the
26374 // destination vreg to set, the condition code register to branch on, the
26375 // true/false values to select between and a branch opcode to use.
26376
26377 // ThisMBB:
26378 // ...
26379 // TrueVal = ...
26380 // cmpTY ccX, r1, r2
26381 // bCC copy1MBB
26382 // fallthrough --> FalseMBB
26383
26384 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
26385 // as described above, by inserting a BB, and then making a PHI at the join
26386 // point to select the true and false operands of the CMOV in the PHI.
26387 //
26388 // The code also handles two different cases of multiple CMOV opcodes
26389 // in a row.
26390 //
26391 // Case 1:
26392 // In this case, there are multiple CMOVs in a row, all of which are based on
26393 // the same condition setting (or the exact opposite condition setting).
26394 // In this case we can lower all the CMOVs using a single inserted BB, and
26395 // then make a number of PHIs at the join point to model the CMOVs. The only
26396 // trickiness here is that in a case like:
26397 //
26398 // t2 = CMOV cond1 t1, f1
26399 // t3 = CMOV cond1 t2, f2
26400 //
26401 // when rewriting this into PHIs, we have to perform some renaming on the
26402 // temps since you cannot have a PHI operand refer to a PHI result earlier
26403 // in the same block. The "simple" but wrong lowering would be:
26404 //
26405 // t2 = PHI t1(BB1), f1(BB2)
26406 // t3 = PHI t2(BB1), f2(BB2)
26407 //
26408 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
26409 // renaming is to note that on the path through BB1, t2 is really just a
26410 // copy of t1, and do that renaming, properly generating:
26411 //
26412 // t2 = PHI t1(BB1), f1(BB2)
26413 // t3 = PHI t1(BB1), f2(BB2)
26414 //
26415 // Case 2:
26416 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
26417 // function - EmitLoweredCascadedSelect.
26418
26419 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
26420 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
26421 MachineInstr *LastCMOV = &MI;
26422 MachineBasicBlock::iterator NextMIIt =
26423 std::next(MachineBasicBlock::iterator(MI));
26424
26425 // Check first for case 1, where there are multiple CMOVs with the same
26426 // condition. Of the two cases of multiple CMOV lowerings, case 1 reduces the
26427 // number of jumps the most.
26428
26429 if (isCMOVPseudo(MI)) {
26430 // See if we have a string of CMOVs with the same condition.
26431 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
26432 (NextMIIt->getOperand(3).getImm() == CC ||
26433 NextMIIt->getOperand(3).getImm() == OppCC)) {
26434 LastCMOV = &*NextMIIt;
26435 ++NextMIIt;
26436 }
26437 }
26438
26439 // This checks for case 2, but only if we didn't already find
26440 // case 1, as indicated by LastCMOV still being &MI.
26441 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
26442 NextMIIt->getOpcode() == MI.getOpcode() &&
26443 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
26444 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
26445 NextMIIt->getOperand(1).isKill()) {
26446 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
26447 }
26448
26449 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
26450 MachineFunction *F = ThisMBB->getParent();
26451 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
26452 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
26453
26454 MachineFunction::iterator It = ++ThisMBB->getIterator();
26455 F->insert(It, FalseMBB);
26456 F->insert(It, SinkMBB);
26457
26458 // If the EFLAGS register isn't dead in the terminator, then claim that it's
26459 // live into the sink and copy blocks.
26460 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26461 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
26462 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
26463 FalseMBB->addLiveIn(X86::EFLAGS);
26464 SinkMBB->addLiveIn(X86::EFLAGS);
26465 }
26466
26467 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
26468 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
26469 std::next(MachineBasicBlock::iterator(LastCMOV)),
26470 ThisMBB->end());
26471 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
26472
26473 // Fallthrough block for ThisMBB.
26474 ThisMBB->addSuccessor(FalseMBB);
26475 // The true block target of the first (or only) branch is always SinkMBB.
26476 ThisMBB->addSuccessor(SinkMBB);
26477 // Fallthrough block for FalseMBB.
26478 FalseMBB->addSuccessor(SinkMBB);
26479
26480 // Create the conditional branch instruction.
26481 unsigned Opc = X86::GetCondBranchFromCond(CC);
26482 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
26483
26484 // SinkMBB:
26485 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
26486 // ...
26487 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
26488 MachineBasicBlock::iterator MIItEnd =
26489 std::next(MachineBasicBlock::iterator(LastCMOV));
26490 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
26491
26492 // Now remove the CMOV(s).
26493 ThisMBB->erase(MIItBegin, MIItEnd);
26494
26495 return SinkMBB;
26496}
26497
26498MachineBasicBlock *
26499X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
26500 MachineBasicBlock *BB) const {
26501 // Combine the following atomic floating-point modification pattern:
26502 // a.store(reg OP a.load(acquire), release)
26503 // Transform it into:
26504 // OPss (%gpr), %xmm
26505 // movss %xmm, (%gpr)
26506 // or the SD equivalent for 64-bit operations.
26507 unsigned MOp, FOp;
26508 switch (MI.getOpcode()) {
26509 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP")::llvm::llvm_unreachable_internal("unexpected instr type for EmitLoweredAtomicFP"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 26509)
;
26510 case X86::RELEASE_FADD32mr:
26511 FOp = X86::ADDSSrm;
26512 MOp = X86::MOVSSmr;
26513 break;
26514 case X86::RELEASE_FADD64mr:
26515 FOp = X86::ADDSDrm;
26516 MOp = X86::MOVSDmr;
26517 break;
26518 }
26519 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26520 DebugLoc DL = MI.getDebugLoc();
26521 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
26522 unsigned ValOpIdx = X86::AddrNumOperands;
26523 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
26524 MachineInstrBuilder MIB =
26525 BuildMI(*BB, MI, DL, TII->get(FOp),
26526 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
26527 .addReg(VSrc);
26528 for (int i = 0; i < X86::AddrNumOperands; ++i) {
26529 MachineOperand &Operand = MI.getOperand(i);
26530 // Clear any kill flags on register operands as we'll create a second
26531 // instruction using the same address operands.
26532 if (Operand.isReg())
26533 Operand.setIsKill(false);
26534 MIB.add(Operand);
26535 }
26536 MachineInstr *FOpMI = MIB;
26537 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
26538 for (int i = 0; i < X86::AddrNumOperands; ++i)
26539 MIB.add(MI.getOperand(i));
26540 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
26541 MI.eraseFromParent(); // The pseudo instruction is gone now.
26542 return BB;
26543}
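// For reference, the kind of source this lowering targets is, roughly (a C++
// sketch, not code from this file):
//
//   std::atomic<float> A;
//   A.store(A.load(std::memory_order_acquire) + X, std::memory_order_release);
//
// Aligned 32/64-bit loads and stores are already atomic on x86, so the whole
// update becomes the ADDSS-from-memory plus MOVSS-to-memory pair built above.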
26544
26545MachineBasicBlock *
26546X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
26547 MachineBasicBlock *BB) const {
26548 MachineFunction *MF = BB->getParent();
26549 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26550 DebugLoc DL = MI.getDebugLoc();
26551 const BasicBlock *LLVM_BB = BB->getBasicBlock();
26552
26553 assert(MF->shouldSplitStack());
26554
26555 const bool Is64Bit = Subtarget.is64Bit();
26556 const bool IsLP64 = Subtarget.isTarget64BitLP64();
26557
26558 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
26559 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
26560
26561 // BB:
26562 // ... [Till the alloca]
26563 // If stacklet is not large enough, jump to mallocMBB
26564 //
26565 // bumpMBB:
26566 // Allocate by subtracting from RSP
26567 // Jump to continueMBB
26568 //
26569 // mallocMBB:
26570 // Allocate by call to runtime
26571 //
26572 // continueMBB:
26573 // ...
26574 // [rest of original BB]
26575 //
26576
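// Roughly the x86-64 LP64 check emitted below (assembly sketch; %t0 stands
// in for the tmpSP/SPLimit vregs and 0x70 is TlsOffset):
//
//   movq %rsp, %t0
//   subq %size, %t0          # candidate new stack pointer
//   cmpq %t0, %fs:0x70       # stacklet limit vs. candidate SP
//   jg   mallocMBB           # limit is higher -> allocate via the runtime
//   # fallthrough to bumpMBB, which simply copies %t0 into %rsp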
26577 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26578 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26579 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26580
26581 MachineRegisterInfo &MRI = MF->getRegInfo();
26582 const TargetRegisterClass *AddrRegClass =
26583 getRegClassFor(getPointerTy(MF->getDataLayout()));
26584
26585 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
26586 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
26587 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
26588 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
26589 sizeVReg = MI.getOperand(1).getReg(),
26590 physSPReg =
26591 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
26592
26593 MachineFunction::iterator MBBIter = ++BB->getIterator();
26594
26595 MF->insert(MBBIter, bumpMBB);
26596 MF->insert(MBBIter, mallocMBB);
26597 MF->insert(MBBIter, continueMBB);
26598
26599 continueMBB->splice(continueMBB->begin(), BB,
26600 std::next(MachineBasicBlock::iterator(MI)), BB->end());
26601 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
26602
26603 // Add code to the main basic block to check if the stack limit has been hit,
26604 // and if so, jump to mallocMBB otherwise to bumpMBB.
26605 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
26606 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
26607 .addReg(tmpSPVReg).addReg(sizeVReg);
26608 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
26609 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
26610 .addReg(SPLimitVReg);
26611 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
26612
26613 // bumpMBB simply decreases the stack pointer, since we know the current
26614 // stacklet has enough space.
26615 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
26616 .addReg(SPLimitVReg);
26617 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
26618 .addReg(SPLimitVReg);
26619 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26620
26621 // Calls into a routine in libgcc to allocate more space from the heap.
26622 const uint32_t *RegMask =
26623 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
26624 if (IsLP64) {
26625 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
26626 .addReg(sizeVReg);
26627 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26628 .addExternalSymbol("__morestack_allocate_stack_space")
26629 .addRegMask(RegMask)
26630 .addReg(X86::RDI, RegState::Implicit)
26631 .addReg(X86::RAX, RegState::ImplicitDefine);
26632 } else if (Is64Bit) {
26633 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
26634 .addReg(sizeVReg);
26635 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26636 .addExternalSymbol("__morestack_allocate_stack_space")
26637 .addRegMask(RegMask)
26638 .addReg(X86::EDI, RegState::Implicit)
26639 .addReg(X86::EAX, RegState::ImplicitDefine);
26640 } else {
26641 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
26642 .addImm(12);
26643 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
26644 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
26645 .addExternalSymbol("__morestack_allocate_stack_space")
26646 .addRegMask(RegMask)
26647 .addReg(X86::EAX, RegState::ImplicitDefine);
26648 }
26649
26650 if (!Is64Bit)
26651 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
26652 .addImm(16);
26653
26654 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
26655 .addReg(IsLP64 ? X86::RAX : X86::EAX);
26656 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26657
26658 // Set up the CFG correctly.
26659 BB->addSuccessor(bumpMBB);
26660 BB->addSuccessor(mallocMBB);
26661 mallocMBB->addSuccessor(continueMBB);
26662 bumpMBB->addSuccessor(continueMBB);
26663
26664 // Take care of the PHI nodes.
26665 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
26666 MI.getOperand(0).getReg())
26667 .addReg(mallocPtrVReg)
26668 .addMBB(mallocMBB)
26669 .addReg(bumpSPPtrVReg)
26670 .addMBB(bumpMBB);
26671
26672 // Delete the original pseudo instruction.
26673 MI.eraseFromParent();
26674
26675 // And we're done.
26676 return continueMBB;
26677}
26678
26679MachineBasicBlock *
26680X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
26681 MachineBasicBlock *BB) const {
26682 MachineFunction *MF = BB->getParent();
26683 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26684 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
26685 DebugLoc DL = MI.getDebugLoc();
26686
26687 assert(!isAsynchronousEHPersonality(
26688 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
26689 "SEH does not use catchret!");
26690
26691 // Only 32-bit EH needs to worry about manually restoring stack pointers.
26692 if (!Subtarget.is32Bit())
26693 return BB;
26694
26695 // C++ EH creates a new target block to hold the restore code, and wires up
26696 // the new block to the return destination with a normal JMP_4.
26697 MachineBasicBlock *RestoreMBB =
26698 MF->CreateMachineBasicBlock(BB->getBasicBlock());
26699 assert(BB->succ_size() == 1);
26700 MF->insert(std::next(BB->getIterator()), RestoreMBB);
26701 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
26702 BB->addSuccessor(RestoreMBB);
26703 MI.getOperand(0).setMBB(RestoreMBB);
26704
26705 auto RestoreMBBI = RestoreMBB->begin();
26706 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
26707 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
26708 return BB;
26709}
26710
26711MachineBasicBlock *
26712X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
26713 MachineBasicBlock *BB) const {
26714 MachineFunction *MF = BB->getParent();
26715 const Constant *PerFn = MF->getFunction()->getPersonalityFn();
26716 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
26717 // Only 32-bit SEH requires special handling for catchpad.
26718 if (IsSEH && Subtarget.is32Bit()) {
26719 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26720 DebugLoc DL = MI.getDebugLoc();
26721 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
26722 }
26723 MI.eraseFromParent();
26724 return BB;
26725}
26726
26727MachineBasicBlock *
26728X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
26729 MachineBasicBlock *BB) const {
26730 // Here we replace TLSADDR with the sequence:
26731 // adjust_stackdown -> TLSADDR -> adjust_stackup.
26732 // We need this because TLSADDR is lowered into a call
26733 // inside MC, so without the two markers shrink-wrapping
26734 // may push the prologue/epilogue past them.
26735 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26736 DebugLoc DL = MI.getDebugLoc();
26737 MachineFunction &MF = *BB->getParent();
26738
26739 // Emit CALLSEQ_START right before the instruction.
26740 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
26741 MachineInstrBuilder CallseqStart =
26742 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
26743 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
26744
26745 // Emit CALLSEQ_END right after the instruction.
26746 // We don't call erase from parent because we want to keep the
26747 // original instruction around.
26748 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
26749 MachineInstrBuilder CallseqEnd =
26750 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
26751 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
26752
26753 return BB;
26754}
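// After this hook the block contains, schematically (a sketch in terms of the
// generic call-frame pseudos, not verbatim MIR):
//
//   ADJCALLSTACKDOWN 0, 0, 0    ; CALLSEQ_START marker
//   TLS_addr64 ...              ; still a pseudo; becomes a call in MC
//   ADJCALLSTACKUP 0, 0         ; CALLSEQ_END marker
//
// so shrink-wrapping sees a call-shaped region it will not split.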
26755
26756MachineBasicBlock *
26757X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
26758 MachineBasicBlock *BB) const {
26759 // This is pretty easy. We're taking the value that we received from
26760 // our load from the relocation, sticking it in either RDI (x86-64)
26761 // or EAX and doing an indirect call. The return value will then
26762 // be in the normal return register.
26763 MachineFunction *F = BB->getParent();
26764 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26765 DebugLoc DL = MI.getDebugLoc();
26766
26767 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
26768 assert(MI.getOperand(3).isGlobal() && "This should be a global");
26769
26770 // Get a register mask for the lowered call.
26771 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
26772 // proper register mask.
26773 const uint32_t *RegMask =
26774 Subtarget.is64Bit() ?
26775 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
26776 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
26777 if (Subtarget.is64Bit()) {
26778 MachineInstrBuilder MIB =
26779 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
26780 .addReg(X86::RIP)
26781 .addImm(0)
26782 .addReg(0)
26783 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26784 MI.getOperand(3).getTargetFlags())
26785 .addReg(0);
26786 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
26787 addDirectMem(MIB, X86::RDI);
26788 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
26789 } else if (!isPositionIndependent()) {
26790 MachineInstrBuilder MIB =
26791 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
26792 .addReg(0)
26793 .addImm(0)
26794 .addReg(0)
26795 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26796 MI.getOperand(3).getTargetFlags())
26797 .addReg(0);
26798 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
26799 addDirectMem(MIB, X86::EAX);
26800 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
26801 } else {
26802 MachineInstrBuilder MIB =
26803 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
26804 .addReg(TII->getGlobalBaseReg(F))
26805 .addImm(0)
26806 .addReg(0)
26807 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26808 MI.getOperand(3).getTargetFlags())
26809 .addReg(0);
26810 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
26811 addDirectMem(MIB, X86::EAX);
26812 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
26813 }
26814
26815 MI.eraseFromParent(); // The pseudo instruction is gone now.
26816 return BB;
26817}
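// On x86-64 Darwin the emitted sequence is essentially (assembly sketch, the
// symbol name is invented):
//
//   movq  _var@TLVP(%rip), %rdi
//   callq *(%rdi)               # thread-local address returned in %rax
//
// i.e. the MOV64rm/CALL64m pair built above: load the TLV descriptor address
// and call through its first slot.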
26818
26819MachineBasicBlock *
26820X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
26821 MachineBasicBlock *MBB) const {
26822 DebugLoc DL = MI.getDebugLoc();
26823 MachineFunction *MF = MBB->getParent();
26824 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26825 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26826 MachineRegisterInfo &MRI = MF->getRegInfo();
26827
26828 const BasicBlock *BB = MBB->getBasicBlock();
26829 MachineFunction::iterator I = ++MBB->getIterator();
26830
26831 // Memory Reference
26832 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26833 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26834
26835 unsigned DstReg;
26836 unsigned MemOpndSlot = 0;
26837
26838 unsigned CurOp = 0;
26839
26840 DstReg = MI.getOperand(CurOp++).getReg();
26841 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
26842 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
26843 (void)TRI;
26844 unsigned mainDstReg = MRI.createVirtualRegister(RC);
26845 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
26846
26847 MemOpndSlot = CurOp;
26848
26849 MVT PVT = getPointerTy(MF->getDataLayout());
26850 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26851 "Invalid Pointer Size!");
26852
26853 // For v = setjmp(buf), we generate
26854 //
26855 // thisMBB:
26856 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
26857 // SjLjSetup restoreMBB
26858 //
26859 // mainMBB:
26860 // v_main = 0
26861 //
26862 // sinkMBB:
26863 // v = phi(main, restore)
26864 //
26865 // restoreMBB:
26866 // if the base pointer is being used, load it from the frame
26867 // v_restore = 1
26868
26869 MachineBasicBlock *thisMBB = MBB;
26870 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
26871 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
26872 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
26873 MF->insert(I, mainMBB);
26874 MF->insert(I, sinkMBB);
26875 MF->push_back(restoreMBB);
26876 restoreMBB->setHasAddressTaken();
26877
26878 MachineInstrBuilder MIB;
26879
26880 // Transfer the remainder of BB and its successor edges to sinkMBB.
26881 sinkMBB->splice(sinkMBB->begin(), MBB,
26882 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26883 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
26884
26885 // thisMBB:
26886 unsigned PtrStoreOpc = 0;
26887 unsigned LabelReg = 0;
26888 const int64_t LabelOffset = 1 * PVT.getStoreSize();
26889 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26890 !isPositionIndependent();
26891
26892 // Prepare IP either in reg or imm.
26893 if (!UseImmLabel) {
26894 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26895 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
26896 LabelReg = MRI.createVirtualRegister(PtrRC);
26897 if (Subtarget.is64Bit()) {
26898 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
26899 .addReg(X86::RIP)
26900 .addImm(0)
26901 .addReg(0)
26902 .addMBB(restoreMBB)
26903 .addReg(0);
26904 } else {
26905 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
26906 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
26907 .addReg(XII->getGlobalBaseReg(MF))
26908 .addImm(0)
26909 .addReg(0)
26910 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
26911 .addReg(0);
26912 }
26913 } else
26914 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26915 // Store IP
26916 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
26917 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26918 if (i == X86::AddrDisp)
26919 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
26920 else
26921 MIB.add(MI.getOperand(MemOpndSlot + i));
26922 }
26923 if (!UseImmLabel)
26924 MIB.addReg(LabelReg);
26925 else
26926 MIB.addMBB(restoreMBB);
26927 MIB.setMemRefs(MMOBegin, MMOEnd);
26928 // Setup
26929 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
26930 .addMBB(restoreMBB);
26931
26932 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26933 MIB.addRegMask(RegInfo->getNoPreservedMask());
26934 thisMBB->addSuccessor(mainMBB);
26935 thisMBB->addSuccessor(restoreMBB);
26936
26937 // mainMBB:
26938 // EAX = 0
26939 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
26940 mainMBB->addSuccessor(sinkMBB);
26941
26942 // sinkMBB:
26943 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
26944 TII->get(X86::PHI), DstReg)
26945 .addReg(mainDstReg).addMBB(mainMBB)
26946 .addReg(restoreDstReg).addMBB(restoreMBB);
26947
26948 // restoreMBB:
26949 if (RegInfo->hasBasePointer(*MF)) {
26950 const bool Uses64BitFramePtr =
26951 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26952 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
26953 X86FI->setRestoreBasePointer(MF);
26954 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
26955 unsigned BasePtr = RegInfo->getBaseRegister();
26956 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
26957 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
26958 FramePtr, true, X86FI->getRestoreBasePointerOffset())
26959 .setMIFlag(MachineInstr::FrameSetup);
26960 }
26961 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
26962 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
26963 restoreMBB->addSuccessor(sinkMBB);
26964
26965 MI.eraseFromParent();
26966 return sinkMBB;
26967}
26968
26969MachineBasicBlock *
26970X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
26971 MachineBasicBlock *MBB) const {
26972 DebugLoc DL = MI.getDebugLoc();
26973 MachineFunction *MF = MBB->getParent();
26974 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26975 MachineRegisterInfo &MRI = MF->getRegInfo();
26976
26977 // Memory Reference
26978 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26979 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26980
26981 MVT PVT = getPointerTy(MF->getDataLayout());
26982 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26983 "Invalid Pointer Size!");
26984
26985 const TargetRegisterClass *RC =
26986 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26987 unsigned Tmp = MRI.createVirtualRegister(RC);
26988 // Since FP is only updated here but NOT referenced, it's treated as GPR.
26989 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26990 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
26991 unsigned SP = RegInfo->getStackRegister();
26992
26993 MachineInstrBuilder MIB;
26994
26995 const int64_t LabelOffset = 1 * PVT.getStoreSize();
26996 const int64_t SPOffset = 2 * PVT.getStoreSize();
26997
26998 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
26999 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
27000
27001 // Reload FP
27002 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
27003 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
27004 MIB.add(MI.getOperand(i));
27005 MIB.setMemRefs(MMOBegin, MMOEnd);
27006 // Reload IP
27007 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
27008 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27009 if (i == X86::AddrDisp)
27010 MIB.addDisp(MI.getOperand(i), LabelOffset);
27011 else
27012 MIB.add(MI.getOperand(i));
27013 }
27014 MIB.setMemRefs(MMOBegin, MMOEnd);
27015 // Reload SP
27016 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
27017 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27018 if (i == X86::AddrDisp)
27019 MIB.addDisp(MI.getOperand(i), SPOffset);
27020 else
27021 MIB.add(MI.getOperand(i));
27022 }
27023 MIB.setMemRefs(MMOBegin, MMOEnd);
27024 // Jump
27025 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
27026
27027 MI.eraseFromParent();
27028 return MBB;
27029}
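// Both SjLj emitters above use the same pointer-sized buffer layout; as a
// summary of the offsets used by the stores and reloads:
//
//   buf + 0*PtrSize   frame pointer   (reloaded into RBP/EBP first)
//   buf + 1*PtrSize   resume address  (LabelOffset; target of the final jump)
//   buf + 2*PtrSize   stack pointer   (SPOffset; reloaded into RSP/ESP)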
27030
27031void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
27032 MachineBasicBlock *MBB,
27033 MachineBasicBlock *DispatchBB,
27034 int FI) const {
27035 DebugLoc DL = MI.getDebugLoc();
27036 MachineFunction *MF = MBB->getParent();
27037 MachineRegisterInfo *MRI = &MF->getRegInfo();
27038 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27039
27040 MVT PVT = getPointerTy(MF->getDataLayout());
27041 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
27042
27043 unsigned Op = 0;
27044 unsigned VR = 0;
27045
27046 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
27047 !isPositionIndependent();
27048
27049 if (UseImmLabel) {
27050 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
27051 } else {
27052 const TargetRegisterClass *TRC =
27053 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
27054 VR = MRI->createVirtualRegister(TRC);
27055 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27056
27057 if (Subtarget.is64Bit())
27058 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
27059 .addReg(X86::RIP)
27060 .addImm(1)
27061 .addReg(0)
27062 .addMBB(DispatchBB)
27063 .addReg(0);
27064 else
27065 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
27066 .addReg(0) /* TII->getGlobalBaseReg(MF) */
27067 .addImm(1)
27068 .addReg(0)
27069 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
27070 .addReg(0);
27071 }
27072
27073 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
27074 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
27075 if (UseImmLabel)
27076 MIB.addMBB(DispatchBB);
27077 else
27078 MIB.addReg(VR);
27079}
27080
27081MachineBasicBlock *
27082X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
27083 MachineBasicBlock *BB) const {
27084 DebugLoc DL = MI.getDebugLoc();
27085 MachineFunction *MF = BB->getParent();
27086 MachineFrameInfo &MFI = MF->getFrameInfo();
27087 MachineRegisterInfo *MRI = &MF->getRegInfo();
27088 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27089 int FI = MFI.getFunctionContextIndex();
27090
27091 // Get a mapping of the call site numbers to all of the landing pads they're
27092 // associated with.
27093 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
27094 unsigned MaxCSNum = 0;
27095 for (auto &MBB : *MF) {
27096 if (!MBB.isEHPad())
27097 continue;
27098
27099 MCSymbol *Sym = nullptr;
27100 for (const auto &MI : MBB) {
27101 if (MI.isDebugValue())
27102 continue;
27103
27104 assert(MI.isEHLabel() && "expected EH_LABEL");
27105 Sym = MI.getOperand(0).getMCSymbol();
27106 break;
27107 }
27108
27109 if (!MF->hasCallSiteLandingPad(Sym))
27110 continue;
27111
27112 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
27113 CallSiteNumToLPad[CSI].push_back(&MBB);
27114 MaxCSNum = std::max(MaxCSNum, CSI);
27115 }
27116 }
27117
27118 // Get an ordered list of the machine basic blocks for the jump table.
27119 std::vector<MachineBasicBlock *> LPadList;
27120 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
27121 LPadList.reserve(CallSiteNumToLPad.size());
27122
27123 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
27124 for (auto &LP : CallSiteNumToLPad[CSI]) {
27125 LPadList.push_back(LP);
27126 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
27127 }
27128 }
27129
27130 assert(!LPadList.empty() &&
27131 "No landing pad destinations for the dispatch jump table!");
27132
27133 // Create the MBBs for the dispatch code.
27134
27135 // Shove the dispatch's address into the return slot in the function context.
27136 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
27137 DispatchBB->setIsEHPad(true);
27138
27139 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
27140 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
27141 DispatchBB->addSuccessor(TrapBB);
27142
27143 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
27144 DispatchBB->addSuccessor(DispContBB);
27145
27146 // Insert MBBs.
27147 MF->push_back(DispatchBB);
27148 MF->push_back(DispContBB);
27149 MF->push_back(TrapBB);
27150
27151 // Insert code into the entry block that creates and registers the function
27152 // context.
27153 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
27154
27155 // Create the jump table and associated information
27156 unsigned JTE = getJumpTableEncoding();
27157 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
27158 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
27159
27160 const X86RegisterInfo &RI = TII->getRegisterInfo();
27161 // Add a register mask with no preserved registers. This results in all
27162 // registers being marked as clobbered.
27163 if (RI.hasBasePointer(*MF)) {
27164 const bool FPIs64Bit =
27165 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
27166 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
27167 MFI->setRestoreBasePointer(MF);
27168
27169 unsigned FP = RI.getFrameRegister(*MF);
27170 unsigned BP = RI.getBaseRegister();
27171 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
27172 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
27173 MFI->getRestoreBasePointerOffset())
27174 .addRegMask(RI.getNoPreservedMask());
27175 } else {
27176 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
27177 .addRegMask(RI.getNoPreservedMask());
27178 }
27179
27180 // IReg is used as an index in a memory operand and therefore can't be SP
27181 unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
27182 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
27183 Subtarget.is64Bit() ? 8 : 4);
27184 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
27185 .addReg(IReg)
27186 .addImm(LPadList.size());
27187 BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);
27188
27189 if (Subtarget.is64Bit()) {
27190 unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
27191 unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
27192
27193 // leaq .LJTI0_0(%rip), BReg
27194 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
27195 .addReg(X86::RIP)
27196 .addImm(1)
27197 .addReg(0)
27198 .addJumpTableIndex(MJTI)
27199 .addReg(0);
27200 // movzx IReg64, IReg
27201 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
27202 .addImm(0)
27203 .addReg(IReg)
27204 .addImm(X86::sub_32bit);
27205
27206 switch (JTE) {
27207 case MachineJumpTableInfo::EK_BlockAddress:
27208 // jmpq *(BReg,IReg64,8)
27209 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
27210 .addReg(BReg)
27211 .addImm(8)
27212 .addReg(IReg64)
27213 .addImm(0)
27214 .addReg(0);
27215 break;
27216 case MachineJumpTableInfo::EK_LabelDifference32: {
27217 unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
27218 unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
27219 unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
27220
27221 // movl (BReg,IReg64,4), OReg
27222 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
27223 .addReg(BReg)
27224 .addImm(4)
27225 .addReg(IReg64)
27226 .addImm(0)
27227 .addReg(0);
27228 // movsx OReg64, OReg
27229 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
27230 // addq BReg, OReg64, TReg
27231 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
27232 .addReg(OReg64)
27233 .addReg(BReg);
27234 // jmpq *TReg
27235 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
27236 break;
27237 }
27238 default:
27239 llvm_unreachable("Unexpected jump table encoding")::llvm::llvm_unreachable_internal("Unexpected jump table encoding"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 27239)
;
27240 }
27241 } else {
27242 // jmpl *.LJTI0_0(,IReg,4)
27243 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
27244 .addReg(0)
27245 .addImm(4)
27246 .addReg(IReg)
27247 .addJumpTableIndex(MJTI)
27248 .addReg(0);
27249 }
27250
27251 // Add the jump table entries as successors to the MBB.
27252 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
27253 for (auto &LP : LPadList)
27254 if (SeenMBBs.insert(LP).second)
27255 DispContBB->addSuccessor(LP);
27256
27257 // N.B. the order the invoke BBs are processed in doesn't matter here.
27258 SmallVector<MachineBasicBlock *, 64> MBBLPads;
27259 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
27260 for (MachineBasicBlock *MBB : InvokeBBs) {
27261 // Remove the landing pad successor from the invoke block and replace it
27262 // with the new dispatch block.
27263 // Keep a copy of Successors since it's modified inside the loop.
27264 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
27265 MBB->succ_rend());
27266 // FIXME: Avoid quadratic complexity.
27267 for (auto MBBS : Successors) {
27268 if (MBBS->isEHPad()) {
27269 MBB->removeSuccessor(MBBS);
27270 MBBLPads.push_back(MBBS);
27271 }
27272 }
27273
27274 MBB->addSuccessor(DispatchBB);
27275
27276 // Find the invoke call and mark all of the callee-saved registers as
27277 // 'implicit defined' so that they're spilled. This prevents instructions
27278 // from being moved to before the EH block, where they would never be
27279 // executed.
27280 for (auto &II : reverse(*MBB)) {
27281 if (!II.isCall())
27282 continue;
27283
27284 DenseMap<unsigned, bool> DefRegs;
27285 for (auto &MOp : II.operands())
27286 if (MOp.isReg())
27287 DefRegs[MOp.getReg()] = true;
27288
27289 MachineInstrBuilder MIB(*MF, &II);
27290 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
27291 unsigned Reg = SavedRegs[RI];
27292 if (!DefRegs[Reg])
27293 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
27294 }
27295
27296 break;
27297 }
27298 }
27299
27300 // Mark all former landing pads as non-landing pads. The dispatch is the only
27301 // landing pad now.
27302 for (auto &LP : MBBLPads)
27303 LP->setIsEHPad(false);
27304
27305 // The instruction is gone now.
27306 MI.eraseFromParent();
27307 return BB;
27308}
27309
27310MachineBasicBlock *
27311X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
27312 MachineBasicBlock *BB) const {
27313 MachineFunction *MF = BB->getParent();
27314 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27315 DebugLoc DL = MI.getDebugLoc();
27316
27317 switch (MI.getOpcode()) {
27318 default: llvm_unreachable("Unexpected instr type to insert")::llvm::llvm_unreachable_internal("Unexpected instr type to insert"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 27318)
;
27319 case X86::TAILJMPd64:
27320 case X86::TAILJMPr64:
27321 case X86::TAILJMPm64:
27322 case X86::TAILJMPr64_REX:
27323 case X86::TAILJMPm64_REX:
27324 llvm_unreachable("TAILJMP64 would not be touched here.")::llvm::llvm_unreachable_internal("TAILJMP64 would not be touched here."
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 27324)
;
27325 case X86::TCRETURNdi64:
27326 case X86::TCRETURNri64:
27327 case X86::TCRETURNmi64:
27328 return BB;
27329 case X86::TLS_addr32:
27330 case X86::TLS_addr64:
27331 case X86::TLS_base_addr32:
27332 case X86::TLS_base_addr64:
27333 return EmitLoweredTLSAddr(MI, BB);
27334 case X86::CATCHRET:
27335 return EmitLoweredCatchRet(MI, BB);
27336 case X86::CATCHPAD:
27337 return EmitLoweredCatchPad(MI, BB);
27338 case X86::SEG_ALLOCA_32:
27339 case X86::SEG_ALLOCA_64:
27340 return EmitLoweredSegAlloca(MI, BB);
27341 case X86::TLSCall_32:
27342 case X86::TLSCall_64:
27343 return EmitLoweredTLSCall(MI, BB);
27344 case X86::CMOV_FR32:
27345 case X86::CMOV_FR64:
27346 case X86::CMOV_FR128:
27347 case X86::CMOV_GR8:
27348 case X86::CMOV_GR16:
27349 case X86::CMOV_GR32:
27350 case X86::CMOV_RFP32:
27351 case X86::CMOV_RFP64:
27352 case X86::CMOV_RFP80:
27353 case X86::CMOV_V2F64:
27354 case X86::CMOV_V2I64:
27355 case X86::CMOV_V4F32:
27356 case X86::CMOV_V4F64:
27357 case X86::CMOV_V4I64:
27358 case X86::CMOV_V16F32:
27359 case X86::CMOV_V8F32:
27360 case X86::CMOV_V8F64:
27361 case X86::CMOV_V8I64:
27362 case X86::CMOV_V8I1:
27363 case X86::CMOV_V16I1:
27364 case X86::CMOV_V32I1:
27365 case X86::CMOV_V64I1:
27366 return EmitLoweredSelect(MI, BB);
27367
27368 case X86::RDFLAGS32:
27369 case X86::RDFLAGS64: {
27370 unsigned PushF =
27371 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
27372 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
27373 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
27374 // Permit reads of the FLAGS register without it being defined.
27375 // This intrinsic exists to read external processor state in flags, such as
27376 // the trap flag, interrupt flag, and direction flag, none of which are
27377 // modeled by the backend.
27378 Push->getOperand(2).setIsUndef();
27379 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
27380
27381 MI.eraseFromParent(); // The pseudo is gone now.
27382 return BB;
27383 }
27384
27385 case X86::WRFLAGS32:
27386 case X86::WRFLAGS64: {
27387 unsigned Push =
27388 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
27389 unsigned PopF =
27390 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
27391 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
27392 BuildMI(*BB, MI, DL, TII->get(PopF));
27393
27394 MI.eraseFromParent(); // The pseudo is gone now.
27395 return BB;
27396 }
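// These two pseudos back the EFLAGS read/write builtins; at the source level
// the usage is roughly (a sketch assuming clang's x86 builtins):
//
//   unsigned long long f = __builtin_ia32_readeflags_u64();  // RDFLAGS64
//   __builtin_ia32_writeeflags_u64(f);                       // WRFLAGS64
//
// each of which expands to the pushf/pop or push/popf pair built above.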
27397
27398 case X86::RELEASE_FADD32mr:
27399 case X86::RELEASE_FADD64mr:
27400 return EmitLoweredAtomicFP(MI, BB);
27401
27402 case X86::FP32_TO_INT16_IN_MEM:
27403 case X86::FP32_TO_INT32_IN_MEM:
27404 case X86::FP32_TO_INT64_IN_MEM:
27405 case X86::FP64_TO_INT16_IN_MEM:
27406 case X86::FP64_TO_INT32_IN_MEM:
27407 case X86::FP64_TO_INT64_IN_MEM:
27408 case X86::FP80_TO_INT16_IN_MEM:
27409 case X86::FP80_TO_INT32_IN_MEM:
27410 case X86::FP80_TO_INT64_IN_MEM: {
27411 // Change the floating point control register to use "round towards zero"
27412 // mode when truncating to an integer value.
27413 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
27414 addFrameReference(BuildMI(*BB, MI, DL,
27415 TII->get(X86::FNSTCW16m)), CWFrameIdx);
27416
27417 // Load the old value of the control word...
27418 unsigned OldCW =
27419 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
27420 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
27421 CWFrameIdx);
27422
27423 // Set the high part to be round to zero...
27424 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
27425 .addImm(0xC7F);
27426
27427 // Reload the modified control word now...
27428 addFrameReference(BuildMI(*BB, MI, DL,
27429 TII->get(X86::FLDCW16m)), CWFrameIdx);
27430
27431 // Restore the memory image of control word to original value
27432 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
27433 .addReg(OldCW);
27434
27435 // Get the X86 opcode to use.
27436 unsigned Opc;
27437 switch (MI.getOpcode()) {
27438 default: llvm_unreachable("illegal opcode!")::llvm::llvm_unreachable_internal("illegal opcode!", "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 27438)
;
27439 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
27440 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
27441 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
27442 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
27443 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
27444 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
27445 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
27446 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
27447 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
27448 }
27449
27450 X86AddressMode AM = getAddressFromInstr(&MI, 0);
27451 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
27452 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
27453
27454 // Reload the original control word now.
27455 addFrameReference(BuildMI(*BB, MI, DL,
27456 TII->get(X86::FLDCW16m)), CWFrameIdx);
27457
27458 MI.eraseFromParent(); // The pseudo instruction is gone now.
27459 return BB;
27460 }
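// The net effect is the classic x87 truncation idiom (assembly sketch; the
// stack slot and register names are illustrative):
//
//   fnstcw  CW(%rsp)         # spill the current control word
//   movw    CW(%rsp), %old   # remember it
//   movw    $0xC7F, CW(%rsp) # rounding control = round toward zero
//   fldcw   CW(%rsp)         # activate it
//   movw    %old, CW(%rsp)   # restore the memory image
//   fistp   dst              # truncating store of the FP value
//   fldcw   CW(%rsp)         # reload the original control word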
27461 // String/text processing lowering.
27462 case X86::PCMPISTRM128REG:
27463 case X86::VPCMPISTRM128REG:
27464 case X86::PCMPISTRM128MEM:
27465 case X86::VPCMPISTRM128MEM:
27466 case X86::PCMPESTRM128REG:
27467 case X86::VPCMPESTRM128REG:
27468 case X86::PCMPESTRM128MEM:
27469 case X86::VPCMPESTRM128MEM:
27470 assert(Subtarget.hasSSE42() &&
27471 "Target must have SSE4.2 or AVX features enabled");
27472 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
27473
27474 // String/text processing lowering.
27475 case X86::PCMPISTRIREG:
27476 case X86::VPCMPISTRIREG:
27477 case X86::PCMPISTRIMEM:
27478 case X86::VPCMPISTRIMEM:
27479 case X86::PCMPESTRIREG:
27480 case X86::VPCMPESTRIREG:
27481 case X86::PCMPESTRIMEM:
27482 case X86::VPCMPESTRIMEM:
27483 assert(Subtarget.hasSSE42() &&
27484 "Target must have SSE4.2 or AVX features enabled");
27485 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
27486
27487 // Thread synchronization.
27488 case X86::MONITOR:
27489 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
27490 case X86::MONITORX:
27491 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
27492
27493 // Cache line zero
27494 case X86::CLZERO:
27495 return emitClzero(&MI, BB, Subtarget);
27496
27497 // PKU feature
27498 case X86::WRPKRU:
27499 return emitWRPKRU(MI, BB, Subtarget);
27500 case X86::RDPKRU:
27501 return emitRDPKRU(MI, BB, Subtarget);
27502 // xbegin
27503 case X86::XBEGIN:
27504 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
27505
27506 case X86::VASTART_SAVE_XMM_REGS:
27507 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
27508
27509 case X86::VAARG_64:
27510 return EmitVAARG64WithCustomInserter(MI, BB);
27511
27512 case X86::EH_SjLj_SetJmp32:
27513 case X86::EH_SjLj_SetJmp64:
27514 return emitEHSjLjSetJmp(MI, BB);
27515
27516 case X86::EH_SjLj_LongJmp32:
27517 case X86::EH_SjLj_LongJmp64:
27518 return emitEHSjLjLongJmp(MI, BB);
27519
27520 case X86::Int_eh_sjlj_setup_dispatch:
27521 return EmitSjLjDispatchBlock(MI, BB);
27522
27523 case TargetOpcode::STATEPOINT:
27524 // As an implementation detail, STATEPOINT shares the STACKMAP format at
27525 // this point in the process. We diverge later.
27526 return emitPatchPoint(MI, BB);
27527
27528 case TargetOpcode::STACKMAP:
27529 case TargetOpcode::PATCHPOINT:
27530 return emitPatchPoint(MI, BB);
27531
27532 case TargetOpcode::PATCHABLE_EVENT_CALL:
27533 // Do nothing here, handle in xray instrumentation pass.
27534 return BB;
27535
27536 case X86::LCMPXCHG8B: {
27537 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
27538 // In addition to the 4 E[ABCD] registers implied by its encoding, CMPXCHG8B
27539 // requires a memory operand. If the current architecture is i686 and the
27540 // current function needs a base pointer - which is ESI on i686 - the
27541 // register allocator would not be able to allocate registers for an address
27542 // of the form X(%reg, %reg, Y): there would never be enough unreserved
27543 // registers during regalloc (without the base pointer the only option would
27544 // be X(%edi, %esi, Y)).
27545 // We give the register allocator a hand by precomputing the address in a
27546 // new vreg using LEA.
27547
27548 // If it is not i686 or there is no base pointer - nothing to do here.
27549 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
27550 return BB;
27551
27552 // Even though this code does not necessarily need the base pointer to
27553 // be ESI, we check for that. The reason: if this assert fails, something
27554 // has changed in the compiler's base pointer handling, and it most
27555 // probably has to be addressed here as well.
27556 assert(TRI->getBaseRegister() == X86::ESI &&
27557 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
27558 "base pointer in mind");
27559
27560 MachineRegisterInfo &MRI = MF->getRegInfo();
27561 MVT SPTy = getPointerTy(MF->getDataLayout());
27562 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27563 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
27564
27565 X86AddressMode AM = getAddressFromInstr(&MI, 0);
27566 // Regalloc does not need any help when the memory operand of CMPXCHG8B
27567 // does not use an index register.
27568 if (AM.IndexReg == X86::NoRegister)
27569 return BB;
27570
27571 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
27572 // four operand definitions that are E[ABCD] registers. We skip them and
27573 // then insert the LEA.
27574 MachineBasicBlock::iterator MBBI(MI);
27575 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
27576 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
27577 --MBBI;
27578 addFullAddress(
27579 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
27580
27581 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
27582
27583 return BB;
27584 }
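The LEA insertion above simply collapses a (base, index, displacement) address into a single virtual register before the CMPXCHG8B, because EAX/EBX/ECX/EDX plus the reserved base pointer leave no spare GPRs for a two-register addressing mode. A rough C++ analogue of what the transformation buys (cas64Slot is a made-up name; on an i686 target a 64-bit compare_exchange typically compiles down to CMPXCHG8B):

#include <atomic>
#include <cstddef>
#include <cstdint>

// Fold base + index into one pointer up front ("LEA"), so the atomic
// 8-byte compare-exchange only needs a single address register.
bool cas64Slot(std::atomic<uint64_t> *Table, std::size_t Index,
               uint64_t Expected, uint64_t Desired) {
  std::atomic<uint64_t> *Slot = &Table[Index]; // precomputed address
  return Slot->compare_exchange_strong(Expected, Desired);
}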
27585 case X86::LCMPXCHG16B:
27586 return BB;
27587 case X86::LCMPXCHG8B_SAVE_EBX:
27588 case X86::LCMPXCHG16B_SAVE_RBX: {
27589 unsigned BasePtr =
27590 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
27591 if (!BB->isLiveIn(BasePtr))
27592 BB->addLiveIn(BasePtr);
27593 return BB;
27594 }
27595 }
27596}
27597
27598//===----------------------------------------------------------------------===//
27599// X86 Optimization Hooks
27600//===----------------------------------------------------------------------===//
27601
27602void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
27603 KnownBits &Known,
27604 const APInt &DemandedElts,
27605 const SelectionDAG &DAG,
27606 unsigned Depth) const {
27607 unsigned BitWidth = Known.getBitWidth();
27608 unsigned Opc = Op.getOpcode();
27609 EVT VT = Op.getValueType();
27610 assert((Opc >= ISD::BUILTIN_OP_END ||
27611 Opc == ISD::INTRINSIC_WO_CHAIN ||
27612 Opc == ISD::INTRINSIC_W_CHAIN ||
27613 Opc == ISD::INTRINSIC_VOID) &&
27614 "Should use MaskedValueIsZero if you don't know whether Op"
27615 " is a target node!");
27616
27617 Known.resetAll();
27618 switch (Opc) {
27619 default: break;
27620 case X86ISD::SETCC:
27621 Known.Zero.setBitsFrom(1);
27622 break;
27623 case X86ISD::MOVMSK: {
27624 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
27625 Known.Zero.setBitsFrom(NumLoBits);
27626 break;
27627 }
27628 case X86ISD::PEXTRB:
27629 case X86ISD::PEXTRW: {
27630 SDValue Src = Op.getOperand(0);
27631 EVT SrcVT = Src.getValueType();
27632 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
27633 Op.getConstantOperandVal(1));
27634 DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);
27635 Known = Known.zextOrTrunc(BitWidth);
27636 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
27637 break;
27638 }
27639 case X86ISD::VSHLI:
27640 case X86ISD::VSRLI: {
27641 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
27642 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
27643 Known.setAllZero();
27644 break;
27645 }
27646
27647 DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
27648 unsigned ShAmt = ShiftImm->getZExtValue();
27649 if (Opc == X86ISD::VSHLI) {
27650 Known.Zero <<= ShAmt;
27651 Known.One <<= ShAmt;
27652 // Low bits are known zero.
27653 Known.Zero.setLowBits(ShAmt);
27654 } else {
27655 Known.Zero.lshrInPlace(ShAmt);
27656 Known.One.lshrInPlace(ShAmt);
27657 // High bits are known zero.
27658 Known.Zero.setHighBits(ShAmt);
27659 }
27660 }
27661 break;
27662 }
27663 case X86ISD::VZEXT: {
27664 // TODO: Add DemandedElts support.
27665 SDValue N0 = Op.getOperand(0);
27666 unsigned NumElts = VT.getVectorNumElements();
27667
27668 EVT SrcVT = N0.getValueType();
27669 unsigned InNumElts = SrcVT.getVectorNumElements();
27670 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
27671 assert(InNumElts >= NumElts && "Illegal VZEXT input");
27672
27673 Known = KnownBits(InBitWidth);
27674 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
27675 DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
27676 Known = Known.zext(BitWidth);
27677 Known.Zero.setBitsFrom(InBitWidth);
27678 break;
27679 }
27680 case X86ISD::CMOV: {
27681 DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
27682 // If we don't know any bits, early out.
27683 if (Known.isUnknown())
27684 break;
27685 KnownBits Known2;
27686 DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);
27687
27688 // Only known if known in both the LHS and RHS.
27689 Known.One &= Known2.One;
27690 Known.Zero &= Known2.Zero;
27691 break;
27692 }
27693 case X86ISD::UDIVREM8_ZEXT_HREG:
27694 // TODO: Support more than just the zero extended bits?
27695 if (Op.getResNo() != 1)
27696 break;
27697 // The remainder is zero extended.
27698 Known.Zero.setBitsFrom(8);
27699 break;
27700 }
27701}
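For the X86ISD::CMOV case above, a bit is known in the result only when it is known, with the same value, in both possible operands, hence the pairwise AND of the Zero and One masks. A tiny self-contained sketch of that intersection rule (the Known struct and knownForSelect are illustrative names, not LLVM's KnownBits API):

#include <cstdint>

struct Known {
  uint64_t Zero; // bits known to be 0
  uint64_t One;  // bits known to be 1
};

// Either operand may be selected at run time, so only bits on which both
// operands agree remain known after the select/cmov.
Known knownForSelect(const Known &A, const Known &B) {
  return {A.Zero & B.Zero, A.One & B.One};
}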
27702
27703unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
27704 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
27705 unsigned Depth) const {
27706 unsigned VTBits = Op.getScalarValueSizeInBits();
27707 unsigned Opcode = Op.getOpcode();
27708 switch (Opcode) {
27709 case X86ISD::SETCC_CARRY:
27710 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
27711 return VTBits;
27712
27713 case X86ISD::VSEXT: {
27714 // TODO: Add DemandedElts support.
27715 SDValue Src = Op.getOperand(0);
27716 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27717 Tmp += VTBits - Src.getScalarValueSizeInBits();
27718 return Tmp;
27719 }
27720
27721 case X86ISD::VTRUNC: {
27722 // TODO: Add DemandedElts support.
27723 SDValue Src = Op.getOperand(0);
27724 unsigned NumSrcBits = Src.getScalarValueSizeInBits();
27725 assert(VTBits < NumSrcBits && "Illegal truncation input type");
27726 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27727 if (Tmp > (NumSrcBits - VTBits))
27728 return Tmp - (NumSrcBits - VTBits);
27729 return 1;
27730 }
27731
27732 case X86ISD::PACKSS: {
27733 // PACKSS is just a truncation if the sign bits extend to the packed size.
27734 // TODO: Add DemandedElts support.
27735 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
27736 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
27737 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
27738 unsigned Tmp = std::min(Tmp0, Tmp1);
27739 if (Tmp > (SrcBits - VTBits))
27740 return Tmp - (SrcBits - VTBits);
27741 return 1;
27742 }
27743
27744 case X86ISD::VSHLI: {
27745 SDValue Src = Op.getOperand(0);
27746 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
27747 if (ShiftVal.uge(VTBits))
27748 return VTBits; // Shifted all bits out --> zero.
27749 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
27750 if (ShiftVal.uge(Tmp))
27751 return 1; // Shifted all sign bits out --> unknown.
27752 return Tmp - ShiftVal.getZExtValue();
27753 }
27754
27755 case X86ISD::VSRAI: {
27756 SDValue Src = Op.getOperand(0);
27757 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
27758 if (ShiftVal.uge(VTBits - 1))
27759 return VTBits; // Sign splat.
27760 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
27761 ShiftVal += Tmp;
27762 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
27763 }
27764
27765 case X86ISD::PCMPGT:
27766 case X86ISD::PCMPEQ:
27767 case X86ISD::CMPP:
27768 case X86ISD::VPCOM:
27769 case X86ISD::VPCOMU:
27770 // Vector compares return zero/all-bits result values.
27771 return VTBits;
27772
27773 case X86ISD::CMOV: {
27774 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
27775 if (Tmp0 == 1) return 1; // Early out.
27776 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
27777 return std::min(Tmp0, Tmp1);
27778 }
27779 case X86ISD::SDIVREM8_SEXT_HREG:
27780 // TODO: Support more than just the sign extended bits?
27781 if (Op.getResNo() != 1)
27782 break;
27783 // The remainder is sign extended.
27784 return VTBits - 7;
27785 }
27786
27787 // Fallback case.
27788 return 1;
27789}
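The VSRAI case above captures the usual rule for arithmetic right shifts: shifting right by N replicates the sign bit N more times, saturating at the element width (and a shift of VTBits-1 or more is a pure sign splat). A one-line sketch of that arithmetic (signBitsAfterAShr is a made-up helper name):

#include <algorithm>
#include <cstdint>

// Known sign bits after an arithmetic shift right by ShiftAmt, capped at
// the element bit width.
unsigned signBitsAfterAShr(unsigned KnownSignBits, unsigned ShiftAmt,
                           unsigned BitWidth) {
  return unsigned(std::min<uint64_t>(uint64_t(KnownSignBits) + ShiftAmt,
                                     BitWidth));
}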
27790
27791SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
27792 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
27793 return N->getOperand(0);
27794 return N;
27795}
27796
27797/// Returns true (and the GlobalValue and the offset) if the node is a
27798/// GlobalAddress + offset.
27799bool X86TargetLowering::isGAPlusOffset(SDNode *N,
27800 const GlobalValue* &GA,
27801 int64_t &Offset) const {
27802 if (N->getOpcode() == X86ISD::Wrapper) {
27803 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
27804 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
27805 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
27806 return true;
27807 }
27808 }
27809 return TargetLowering::isGAPlusOffset(N, GA, Offset);
27810}
27811
27812// Attempt to match a combined shuffle mask against supported unary shuffle
27813// instructions.
27814// TODO: Investigate sharing more of this with shuffle lowering.
27815static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27816 bool AllowFloatDomain, bool AllowIntDomain,
27817 SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
27818 const X86Subtarget &Subtarget,
27819 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
27820 unsigned NumMaskElts = Mask.size();
27821 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
27822
27823 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
27824 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
27825 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
27826 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
27827 unsigned MaxScale = 64 / MaskEltSize;
27828 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
27829 bool Match = true;
27830 unsigned NumDstElts = NumMaskElts / Scale;
27831 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
27832 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
27833 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
27834 }
27835 if (Match) {
27836 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
27837 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
27838 MVT::getIntegerVT(MaskEltSize);
27839 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
27840
27841 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {
27842 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
27843 Shuffle = unsigned(X86ISD::VZEXT);
27844 } else
27845 Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
27846
27847 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
27848 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
27849 return true;
27850 }
27851 }
27852 }
27853
27854 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
27855 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
27856 isUndefOrEqual(Mask[0], 0) &&
27857 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
27858 Shuffle = X86ISD::VZEXT_MOVL;
27859 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
27860 return true;
27861 }
27862
27863 // Check if we have SSE3, which will let us use MOVDDUP etc. These
27864 // instructions are no slower than UNPCKLPD but have the option to
27865 // fold the input operand into even an unaligned memory load.
27866 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
27867 if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
27868 Shuffle = X86ISD::MOVDDUP;
27869 SrcVT = DstVT = MVT::v2f64;
27870 return true;
27871 }
27872 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
27873 Shuffle = X86ISD::MOVSLDUP;
27874 SrcVT = DstVT = MVT::v4f32;
27875 return true;
27876 }
27877 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
27878 Shuffle = X86ISD::MOVSHDUP;
27879 SrcVT = DstVT = MVT::v4f32;
27880 return true;
27881 }
27882 }
27883
27884 if (MaskVT.is256BitVector() && AllowFloatDomain) {
27885 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
27886 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
27887 Shuffle = X86ISD::MOVDDUP;
27888 SrcVT = DstVT = MVT::v4f64;
27889 return true;
27890 }
27891 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
27892 Shuffle = X86ISD::MOVSLDUP;
27893 SrcVT = DstVT = MVT::v8f32;
27894 return true;
27895 }
27896 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
27897 Shuffle = X86ISD::MOVSHDUP;
27898 SrcVT = DstVT = MVT::v8f32;
27899 return true;
27900 }
27901 }
27902
27903 if (MaskVT.is512BitVector() && AllowFloatDomain) {
27904 assert(Subtarget.hasAVX512() &&
27905 "AVX512 required for 512-bit vector shuffles");
27906 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
27907 Shuffle = X86ISD::MOVDDUP;
27908 SrcVT = DstVT = MVT::v8f64;
27909 return true;
27910 }
27911 if (isTargetShuffleEquivalent(
27912 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
27913 Shuffle = X86ISD::MOVSLDUP;
27914 SrcVT = DstVT = MVT::v16f32;
27915 return true;
27916 }
27917 if (isTargetShuffleEquivalent(
27918 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
27919 Shuffle = X86ISD::MOVSHDUP;
27920 SrcVT = DstVT = MVT::v16f32;
27921 return true;
27922 }
27923 }
27924
27925 // Attempt to match against broadcast-from-vector.
27926 if (Subtarget.hasAVX2()) {
27927 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
27928 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
27929 SrcVT = DstVT = MaskVT;
27930 Shuffle = X86ISD::VBROADCAST;
27931 return true;
27932 }
27933 }
27934
27935 return false;
27936}
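All of the pattern checks above funnel through the same idea: a candidate mask matches a reference pattern if every element is either undef or exactly the expected index. A simplified standalone sketch of that test, using -1 as the undef marker (masksEquivalent is an illustrative name, not LLVM's isTargetShuffleEquivalent, which works on ArrayRef<int> and the SM_Sentinel* constants):

#include <cstddef>
#include <vector>

// Simplified equivalence test: an element matches if it is marked undef
// (represented here as -1) or equals the expected index exactly.
bool masksEquivalent(const std::vector<int> &Mask,
                     const std::vector<int> &Expected) {
  if (Mask.size() != Expected.size())
    return false;
  for (std::size_t i = 0; i != Mask.size(); ++i)
    if (Mask[i] != -1 && Mask[i] != Expected[i])
      return false;
  return true;
}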
27937
27938// Attempt to match a combined shuffle mask against supported unary immediate
27939// permute instructions.
27940// TODO: Investigate sharing more of this with shuffle lowering.
27941static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27942 const APInt &Zeroable,
27943 bool AllowFloatDomain,
27944 bool AllowIntDomain,
27945 const X86Subtarget &Subtarget,
27946 unsigned &Shuffle, MVT &ShuffleVT,
27947 unsigned &PermuteImm) {
27948 unsigned NumMaskElts = Mask.size();
27949 unsigned InputSizeInBits = MaskVT.getSizeInBits();
27950 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
27951 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
27952
27953 bool ContainsZeros =
27954 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27955
27956 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
27957 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
27958 // Check for lane crossing permutes.
27959 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
27960 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
27961 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
27962 Shuffle = X86ISD::VPERMI;
27963 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
27964 PermuteImm = getV4X86ShuffleImm(Mask);
27965 return true;
27966 }
27967 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
27968 SmallVector<int, 4> RepeatedMask;
27969 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
27970 Shuffle = X86ISD::VPERMI;
27971 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
27972 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
27973 return true;
27974 }
27975 }
27976 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
27977 // VPERMILPD can permute with a non-repeating shuffle.
27978 Shuffle = X86ISD::VPERMILPI;
27979 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
27980 PermuteImm = 0;
27981 for (int i = 0, e = Mask.size(); i != e; ++i) {
27982 int M = Mask[i];
27983 if (M == SM_SentinelUndef)
27984 continue;
27985 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
27986 PermuteImm |= (M & 1) << i;
27987 }
27988 return true;
27989 }
27990 }
27991
27992 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
27993 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
27994 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
27995 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
27996 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
27997 SmallVector<int, 4> RepeatedMask;
27998 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
27999 // Narrow the repeated mask to create 32-bit element permutes.
28000 SmallVector<int, 4> WordMask = RepeatedMask;
28001 if (MaskScalarSizeInBits == 64)
28002 scaleShuffleMask<int>(2, RepeatedMask, WordMask);
28003
28004 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
28005 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
28006 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
28007 PermuteImm = getV4X86ShuffleImm(WordMask);
28008 return true;
28009 }
28010 }
28011
28012 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
28013 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
28014 SmallVector<int, 4> RepeatedMask;
28015 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
28016 ArrayRef<int> LoMask(Mask.data() + 0, 4);
28017 ArrayRef<int> HiMask(Mask.data() + 4, 4);
28018
28019 // PSHUFLW: permute lower 4 elements only.
28020 if (isUndefOrInRange(LoMask, 0, 4) &&
28021 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
28022 Shuffle = X86ISD::PSHUFLW;
28023 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
28024 PermuteImm = getV4X86ShuffleImm(LoMask);
28025 return true;
28026 }
28027
28028 // PSHUFHW: permute upper 4 elements only.
28029 if (isUndefOrInRange(HiMask, 4, 8) &&
28030 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
28031 // Offset the HiMask so that we can create the shuffle immediate.
28032 int OffsetHiMask[4];
28033 for (int i = 0; i != 4; ++i)
28034 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
28035
28036 Shuffle = X86ISD::PSHUFHW;
28037 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
28038 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
28039 return true;
28040 }
28041 }
28042 }
28043
28044 // Attempt to match against byte/bit shifts.
28045 // FIXME: Add 512-bit support.
28046 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28047 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
28048 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
28049 MaskScalarSizeInBits, Mask,
28050 0, Zeroable, Subtarget);
28051 if (0 < ShiftAmt) {
28052 PermuteImm = (unsigned)ShiftAmt;
28053 return true;
28054 }
28055 }
28056
28057 return false;
28058}
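The PermuteImm values built above via getV4X86ShuffleImm use the standard 4-element shuffle immediate: each destination lane takes a 2-bit source index, packed from bit 0 upward. A minimal sketch of that packing for fully-defined masks (makeV4ShuffleImm is a made-up name and, unlike the real helper, does not handle undef elements):

#include <cassert>
#include <cstdint>

// Pack four 2-bit lane indices into the 8-bit immediate used by
// PSHUFD/PSHUFLW/PSHUFHW/VPERMILPI-style shuffles.
uint8_t makeV4ShuffleImm(const int Mask[4]) {
  uint8_t Imm = 0;
  for (int i = 0; i != 4; ++i) {
    assert(0 <= Mask[i] && Mask[i] < 4 && "lane index must fit in 2 bits");
    Imm |= uint8_t(Mask[i]) << (2 * i);
  }
  return Imm; // e.g. {0, 0, 2, 2} -> 0xA0
}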
28059
28060// Attempt to match a combined unary shuffle mask against supported binary
28061// shuffle instructions.
28062// TODO: Investigate sharing more of this with shuffle lowering.
28063static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28064 bool AllowFloatDomain, bool AllowIntDomain,
28065 SDValue &V1, SDValue &V2, SDLoc &DL,
28066 SelectionDAG &DAG,
28067 const X86Subtarget &Subtarget,
28068 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
28069 bool IsUnary) {
28070 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
28071
28072 if (MaskVT.is128BitVector()) {
28073 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
28074 V2 = V1;
28075 Shuffle = X86ISD::MOVLHPS;
28076 SrcVT = DstVT = MVT::v4f32;
28077 return true;
28078 }
28079 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
28080 V2 = V1;
28081 Shuffle = X86ISD::MOVHLPS;
28082 SrcVT = DstVT = MVT::v4f32;
28083 return true;
28084 }
28085 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
28086 (AllowFloatDomain || !Subtarget.hasSSE41())) {
28087 std::swap(V1, V2);
28088 Shuffle = X86ISD::MOVSD;
28089 SrcVT = DstVT = MaskVT;
28090 return true;
28091 }
28092 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
28093 (AllowFloatDomain || !Subtarget.hasSSE41())) {
28094 Shuffle = X86ISD::MOVSS;
28095 SrcVT = DstVT = MaskVT;
28096 return true;
28097 }
28098 }
28099
28100 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
28101 // TODO add support for 256/512-bit types.
28102 if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) {
28103 if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
28104 Subtarget)) {
28105 DstVT = MaskVT;
28106 return true;
28107 }
28108 }
28109
28110 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
28111 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
28112 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28113 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
28114 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
28115 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
28116 if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
28117 DAG, Subtarget)) {
28118 SrcVT = DstVT = MaskVT;
28119 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
28120 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
28121 return true;
28122 }
28123 }
28124
28125 return false;
28126}
28127
28128static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28129 const APInt &Zeroable,
28130 bool AllowFloatDomain,
28131 bool AllowIntDomain,
28132 SDValue &V1, SDValue &V2, SDLoc &DL,
28133 SelectionDAG &DAG,
28134 const X86Subtarget &Subtarget,
28135 unsigned &Shuffle, MVT &ShuffleVT,
28136 unsigned &PermuteImm) {
28137 unsigned NumMaskElts = Mask.size();
28138 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
28139
28140 // Attempt to match against PALIGNR byte rotate.
28141 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
28142 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
28143 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
28144 if (0 < ByteRotation) {
28145 Shuffle = X86ISD::PALIGNR;
28146 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
28147 PermuteImm = ByteRotation;
28148 return true;
28149 }
28150 }
28151
28152 // Attempt to combine to X86ISD::BLENDI.
28153 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
28154 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
28155 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
28156 uint64_t BlendMask = 0;
28157 bool ForceV1Zero = false, ForceV2Zero = false;
28158 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
28159 if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
28160 BlendMask)) {
28161 if (MaskVT == MVT::v16i16) {
28162 // We can only use v16i16 PBLENDW if the lanes are repeated.
28163 SmallVector<int, 8> RepeatedMask;
28164 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
28165 RepeatedMask)) {
28166 assert(RepeatedMask.size() == 8 &&
28167 "Repeated mask size doesn't match!");
28168 PermuteImm = 0;
28169 for (int i = 0; i < 8; ++i)
28170 if (RepeatedMask[i] >= 8)
28171 PermuteImm |= 1 << i;
28172 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
28173 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
28174 Shuffle = X86ISD::BLENDI;
28175 ShuffleVT = MaskVT;
28176 return true;
28177 }
28178 } else {
28179 // Determine a type compatible with X86ISD::BLENDI.
28180 ShuffleVT = MaskVT;
28181 if (Subtarget.hasAVX2()) {
28182 if (ShuffleVT == MVT::v4i64)
28183 ShuffleVT = MVT::v8i32;
28184 else if (ShuffleVT == MVT::v2i64)
28185 ShuffleVT = MVT::v4i32;
28186 } else {
28187 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
28188 ShuffleVT = MVT::v8i16;
28189 else if (ShuffleVT == MVT::v4i64)
28190 ShuffleVT = MVT::v4f64;
28191 else if (ShuffleVT == MVT::v8i32)
28192 ShuffleVT = MVT::v8f32;
28193 }
28194
28195 if (!ShuffleVT.isFloatingPoint()) {
28196 int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
28197 BlendMask =
28198 scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
28199 ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
28200 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
28201 }
28202
28203 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
28204 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
28205 PermuteImm = (unsigned)BlendMask;
28206 Shuffle = X86ISD::BLENDI;
28207 return true;
28208 }
28209 }
28210 }
28211
28212 // Attempt to combine to INSERTPS.
28213 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
28214 MaskVT.is128BitVector()) {
28215 if (Zeroable.getBoolValue() &&
28216 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
28217 Shuffle = X86ISD::INSERTPS;
28218 ShuffleVT = MVT::v4f32;
28219 return true;
28220 }
28221 }
28222
28223 // Attempt to combine to SHUFPD.
28224 if (AllowFloatDomain && EltSizeInBits == 64 &&
28225 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28226 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
28227 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
28228 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
28229 Shuffle = X86ISD::SHUFP;
28230 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
28231 return true;
28232 }
28233 }
28234
28235 // Attempt to combine to SHUFPS.
28236 if (AllowFloatDomain && EltSizeInBits == 32 &&
28237 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
28238 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
28239 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
28240 SmallVector<int, 4> RepeatedMask;
28241 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
28242 // Match each half of the repeated mask to determine if it's just
28243 // referencing one of the vectors, is zeroable, or is entirely undef.
28244 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
28245 int M0 = RepeatedMask[Offset];
28246 int M1 = RepeatedMask[Offset + 1];
28247
28248 if (isUndefInRange(RepeatedMask, Offset, 2)) {
28249 return DAG.getUNDEF(MaskVT);
28250 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
28251 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
28252 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
28253 return getZeroVector(MaskVT, Subtarget, DAG, DL);
28254 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
28255 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
28256 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
28257 return V1;
28258 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
28259 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
28260 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
28261 return V2;
28262 }
28263
28264 return SDValue();
28265 };
28266
28267 int ShufMask[4] = {-1, -1, -1, -1};
28268 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
28269 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
28270
28271 if (Lo && Hi) {
28272 V1 = Lo;
28273 V2 = Hi;
28274 Shuffle = X86ISD::SHUFP;
28275 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
28276 PermuteImm = getV4X86ShuffleImm(ShufMask);
28277 return true;
28278 }
28279 }
28280 }
28281
28282 return false;
28283}
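The v16i16 PBLENDW path above builds its immediate the way all BLENDI immediates work: bit i of the immediate selects lane i from the second source, otherwise the lane comes from the first. A small sketch of that construction for an 8-lane repeated mask (makeBlendImm is an illustrative name):

#include <cstdint>

// Bit i set in the immediate means "take lane i from V2"; mask indices
// >= 8 refer into the second 8-element source.
uint8_t makeBlendImm(const int RepeatedMask[8]) {
  uint8_t Imm = 0;
  for (int i = 0; i != 8; ++i)
    if (RepeatedMask[i] >= 8)
      Imm |= uint8_t(1) << unsigned(i);
  return Imm;
}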
28284
28285/// \brief Combine an arbitrary chain of shuffles into a single instruction if
28286/// possible.
28287///
28288/// This is the leaf of the recursive combine below. When we have found some
28289/// chain of single-use x86 shuffle instructions and accumulated the combined
28290/// shuffle mask represented by them, this will try to pattern match that mask
28291/// into either a single instruction if there is a special purpose instruction
28292/// for this operation, or into a PSHUFB instruction which is a fully general
28293/// instruction but should only be used to replace chains over a certain depth.
28294static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
28295 ArrayRef<int> BaseMask, int Depth,
28296 bool HasVariableMask, SelectionDAG &DAG,
28297 TargetLowering::DAGCombinerInfo &DCI,
28298 const X86Subtarget &Subtarget) {
28299 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
28300 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
28301 "Unexpected number of shuffle inputs!");
28302
28303 // Find the inputs that enter the chain. Note that multiple uses are OK
28304 // here, we're not going to remove the operands we find.
28305 bool UnaryShuffle = (Inputs.size() == 1);
28306 SDValue V1 = peekThroughBitcasts(Inputs[0]);
28307 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
28308 : peekThroughBitcasts(Inputs[1]));
28309
28310 MVT VT1 = V1.getSimpleValueType();
28311 MVT VT2 = V2.getSimpleValueType();
28312 MVT RootVT = Root.getSimpleValueType();
28313 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
28314 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
28315 "Vector size mismatch");
28316
28317 SDLoc DL(Root);
28318 SDValue Res;
28319
28320 unsigned NumBaseMaskElts = BaseMask.size();
28321 if (NumBaseMaskElts == 1) {
28322 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
28323 return DAG.getBitcast(RootVT, V1);
28324 }
28325
28326 unsigned RootSizeInBits = RootVT.getSizeInBits();
28327 unsigned NumRootElts = RootVT.getVectorNumElements();
28328 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
28329 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
28330 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
28331
28332 // Don't combine if we are an AVX512/EVEX target and the mask element size
28333 // is different from the root element size - this would prevent writemasks
28334 // from being reused.
28335 // TODO - this currently prevents all lane shuffles from occurring.
28336 // TODO - check for writemasks usage instead of always preventing combining.
28337 // TODO - attempt to narrow Mask back to writemask size.
28338 bool IsEVEXShuffle =
28339 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
28340 if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
28341 return SDValue();
28342
28343 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
28344
28345 // Handle 128-bit lane shuffles of 256-bit vectors.
28346 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
28347 // we need to use the zeroing feature.
28348 // TODO - this should support binary shuffles.
28349 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
28350 !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
28351 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
28352 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
28353 return SDValue(); // Nothing to do!
28354 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
28355 unsigned PermMask = 0;
28356 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
28357 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
28358
28359 Res = DAG.getBitcast(ShuffleVT, V1);
28360 DCI.AddToWorklist(Res.getNode());
28361 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
28362 DAG.getUNDEF(ShuffleVT),
28363 DAG.getConstant(PermMask, DL, MVT::i8));
28364 DCI.AddToWorklist(Res.getNode());
28365 return DAG.getBitcast(RootVT, Res);
28366 }
28367
28368 // For masks that have been widened to 128-bit elements or more,
28369 // narrow back down to 64-bit elements.
28370 SmallVector<int, 64> Mask;
28371 if (BaseMaskEltSizeInBits > 64) {
28372 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
28373 int MaskScale = BaseMaskEltSizeInBits / 64;
28374 scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
28375 } else {
28376 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
28377 }
28378
28379 unsigned NumMaskElts = Mask.size();
28380 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
28381
28382 // Determine the effective mask value type.
28383 FloatDomain &= (32 <= MaskEltSizeInBits);
28384 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
28385 : MVT::getIntegerVT(MaskEltSizeInBits);
28386 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
28387
28388 // Only allow legal mask types.
28389 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
28390 return SDValue();
28391
28392 // Attempt to match the mask against known shuffle patterns.
28393 MVT ShuffleSrcVT, ShuffleVT;
28394 unsigned Shuffle, PermuteImm;
28395
28396 // Which shuffle domains are permitted?
28397 // Permit domain crossing at higher combine depths.
28398 bool AllowFloatDomain = FloatDomain || (Depth > 3);
28399 bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
28400 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
28401
28402 // Determine zeroable mask elements.
28403 APInt Zeroable(NumMaskElts, 0);
28404 for (unsigned i = 0; i != NumMaskElts; ++i)
28405 if (isUndefOrZero(Mask[i]))
28406 Zeroable.setBit(i);
28407
28408 if (UnaryShuffle) {
28409 // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
28410 // directly if we don't shuffle the lower element and we shuffle the upper
28411 // (zero) elements within themselves.
28412 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
28413 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
28414 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
28415 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
28416 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
28417 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
28418 return DAG.getBitcast(RootVT, V1);
28419 }
28420 }
28421
28422 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
28423 V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
28424 ShuffleVT)) {
28425 if (Depth == 1 && Root.getOpcode() == Shuffle)
28426 return SDValue(); // Nothing to do!
28427 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
28428 return SDValue(); // AVX512 Writemask clash.
28429 Res = DAG.getBitcast(ShuffleSrcVT, V1);
28430 DCI.AddToWorklist(Res.getNode());
28431 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
28432 DCI.AddToWorklist(Res.getNode());
28433 return DAG.getBitcast(RootVT, Res);
28434 }
28435
28436 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
28437 AllowIntDomain, Subtarget, Shuffle,
28438 ShuffleVT, PermuteImm)) {
28439 if (Depth == 1 && Root.getOpcode() == Shuffle)
28440 return SDValue(); // Nothing to do!
28441 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
28442 return SDValue(); // AVX512 Writemask clash.
28443 Res = DAG.getBitcast(ShuffleVT, V1);
28444 DCI.AddToWorklist(Res.getNode());
28445 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
28446 DAG.getConstant(PermuteImm, DL, MVT::i8));
28447 DCI.AddToWorklist(Res.getNode());
28448 return DAG.getBitcast(RootVT, Res);
28449 }
28450 }
28451
28452 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
28453 V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
28454 ShuffleVT, UnaryShuffle)) {
28455 if (Depth == 1 && Root.getOpcode() == Shuffle)
28456 return SDValue(); // Nothing to do!
28457 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
28458 return SDValue(); // AVX512 Writemask clash.
28459 V1 = DAG.getBitcast(ShuffleSrcVT, V1);
28460 DCI.AddToWorklist(V1.getNode());
28461 V2 = DAG.getBitcast(ShuffleSrcVT, V2);
28462 DCI.AddToWorklist(V2.getNode());
28463 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
28464 DCI.AddToWorklist(Res.getNode());
28465 return DAG.getBitcast(RootVT, Res);
28466 }
28467
28468 if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
28469 AllowIntDomain, V1, V2, DL, DAG,
28470 Subtarget, Shuffle, ShuffleVT,
28471 PermuteImm)) {
28472 if (Depth == 1 && Root.getOpcode() == Shuffle)
28473 return SDValue(); // Nothing to do!
28474 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
28475 return SDValue(); // AVX512 Writemask clash.
28476 V1 = DAG.getBitcast(ShuffleVT, V1);
28477 DCI.AddToWorklist(V1.getNode());
28478 V2 = DAG.getBitcast(ShuffleVT, V2);
28479 DCI.AddToWorklist(V2.getNode());
28480 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
28481 DAG.getConstant(PermuteImm, DL, MVT::i8));
28482 DCI.AddToWorklist(Res.getNode());
28483 return DAG.getBitcast(RootVT, Res);
28484 }
28485
28486 // Typically from here on, we need an integer version of MaskVT.
28487 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
28488 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
28489
28490 // Annoyingly, SSE4A instructions don't map into the above match helpers.
28491 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
28492 uint64_t BitLen, BitIdx;
28493 if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
28494 Zeroable)) {
28495 if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
28496 return SDValue(); // Nothing to do!
28497 V1 = DAG.getBitcast(IntMaskVT, V1);
28498 DCI.AddToWorklist(V1.getNode());
28499 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
28500 DAG.getConstant(BitLen, DL, MVT::i8),
28501 DAG.getConstant(BitIdx, DL, MVT::i8));
28502 DCI.AddToWorklist(Res.getNode());
28503 return DAG.getBitcast(RootVT, Res);
28504 }
28505
28506 if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
28507 if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
28508 return SDValue(); // Nothing to do!
28509 V1 = DAG.getBitcast(IntMaskVT, V1);
28510 DCI.AddToWorklist(V1.getNode());
28511 V2 = DAG.getBitcast(IntMaskVT, V2);
28512 DCI.AddToWorklist(V2.getNode());
28513 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
28514 DAG.getConstant(BitLen, DL, MVT::i8),
28515 DAG.getConstant(BitIdx, DL, MVT::i8));
28516 DCI.AddToWorklist(Res.getNode());
28517 return DAG.getBitcast(RootVT, Res);
28518 }
28519 }
28520
28521 // Don't try to re-form single instruction chains under any circumstances now
28522 // that we've done encoding canonicalization for them.
28523 if (Depth < 2)
28524 return SDValue();
28525
28526 // Depth threshold above which we can efficiently use variable mask shuffles.
28527 // TODO This should probably be target specific.
28528 bool AllowVariableMask = (Depth >= 3) || HasVariableMask;
28529
28530 bool MaskContainsZeros =
28531 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
28532
28533 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
28534 // If we have a single input lane-crossing shuffle then lower to VPERMV.
28535 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
28536 ((Subtarget.hasAVX2() &&
28537 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
28538 (Subtarget.hasAVX512() &&
28539 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
28540 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
28541 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
28542 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
28543 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
28544 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
28545 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
28546 DCI.AddToWorklist(VPermMask.getNode());
28547 Res = DAG.getBitcast(MaskVT, V1);
28548 DCI.AddToWorklist(Res.getNode());
28549 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
28550 DCI.AddToWorklist(Res.getNode());
28551 return DAG.getBitcast(RootVT, Res);
28552 }
28553
28554 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
28555 // vector as the second source.
28556 if (UnaryShuffle && AllowVariableMask &&
28557 ((Subtarget.hasAVX512() &&
28558 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
28559 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
28560 (Subtarget.hasVLX() &&
28561 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
28562 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
28563 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
28564 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
28565 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
28566 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
28567 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
28568 for (unsigned i = 0; i != NumMaskElts; ++i)
28569 if (Mask[i] == SM_SentinelZero)
28570 Mask[i] = NumMaskElts + i;
28571
28572 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
28573 DCI.AddToWorklist(VPermMask.getNode());
28574 Res = DAG.getBitcast(MaskVT, V1);
28575 DCI.AddToWorklist(Res.getNode());
28576 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
28577 DCI.AddToWorklist(Zero.getNode());
28578 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
28579 DCI.AddToWorklist(Res.getNode());
28580 return DAG.getBitcast(RootVT, Res);
28581 }
28582
28583 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
28584 if (AllowVariableMask && !MaskContainsZeros &&
28585 ((Subtarget.hasAVX512() &&
28586 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
28587 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
28588 (Subtarget.hasVLX() &&
28589 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
28590 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
28591 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
28592 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
28593 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
28594 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
28595 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
28596 DCI.AddToWorklist(VPermMask.getNode());
28597 V1 = DAG.getBitcast(MaskVT, V1);
28598 DCI.AddToWorklist(V1.getNode());
28599 V2 = DAG.getBitcast(MaskVT, V2);
28600 DCI.AddToWorklist(V2.getNode());
28601 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
28602 DCI.AddToWorklist(Res.getNode());
28603 return DAG.getBitcast(RootVT, Res);
28604 }
28605 return SDValue();
28606 }
28607
28608 // See if we can combine a single input shuffle with zeros to a bit-mask,
28609 // which is much simpler than any shuffle.
28610 if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
28611 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
28612 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
28613 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
28614 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
28615 APInt UndefElts(NumMaskElts, 0);
28616 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
28617 for (unsigned i = 0; i != NumMaskElts; ++i) {
28618 int M = Mask[i];
28619 if (M == SM_SentinelUndef) {
28620 UndefElts.setBit(i);
28621 continue;
28622 }
28623 if (M == SM_SentinelZero)
28624 continue;
28625 EltBits[i] = AllOnes;
28626 }
28627 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
28628 DCI.AddToWorklist(BitMask.getNode());
28629 Res = DAG.getBitcast(MaskVT, V1);
28630 DCI.AddToWorklist(Res.getNode());
28631 unsigned AndOpcode =
28632 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
28633 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
28634 DCI.AddToWorklist(Res.getNode());
28635 return DAG.getBitcast(RootVT, Res);
28636 }
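// Worked example (illustrative): a 4-element mask of
// { 0, SM_SentinelZero, 2, SM_SentinelUndef } passes the sequential-or-zero
// check, so EltBits becomes { all-ones, 0, all-ones, 0 } with UndefElts bit 3
// set, and the whole shuffle is emitted as a single AND with that constant.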
28637
28638 // If we have a single input shuffle with different shuffle patterns in the
28639 // 128-bit lanes, use the variable mask to VPERMILPS.
28640 // TODO: Combine other mask types at higher depths.
28641 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
28642 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
28643 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
28644 SmallVector<SDValue, 16> VPermIdx;
28645 for (int M : Mask) {
28646 SDValue Idx =
28647 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
28648 VPermIdx.push_back(Idx);
28649 }
28650 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
28651 DCI.AddToWorklist(VPermMask.getNode());
28652 Res = DAG.getBitcast(MaskVT, V1);
28653 DCI.AddToWorklist(Res.getNode());
28654 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
28655 DCI.AddToWorklist(Res.getNode());
28656 return DAG.getBitcast(RootVT, Res);
28657 }
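// Worked example (illustrative): a non-lane-crossing v8f32 mask of
// { 3, 2, 1, 0, 6, 7, 4, 5 } is reduced modulo 4 per element, so the
// VPERMILPV index vector built above is { 3, 2, 1, 0, 2, 3, 0, 1 }, one
// independent per-lane selector for each element.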
28658
28659 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
28660 // to VPERMIL2PD/VPERMIL2PS.
28661 if (AllowVariableMask && Subtarget.hasXOP() &&
28662 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
28663 MaskVT == MVT::v8f32)) {
28664 // VPERMIL2 Operation.
28665 // Bits[3] - Match Bit.
28666 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
28667 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
28668 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
28669 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
28670 SmallVector<int, 8> VPerm2Idx;
28671 unsigned M2ZImm = 0;
28672 for (int M : Mask) {
28673 if (M == SM_SentinelUndef) {
28674 VPerm2Idx.push_back(-1);
28675 continue;
28676 }
28677 if (M == SM_SentinelZero) {
28678 M2ZImm = 2;
28679 VPerm2Idx.push_back(8);
28680 continue;
28681 }
28682 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
28683 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
28684 VPerm2Idx.push_back(Index);
28685 }
28686 V1 = DAG.getBitcast(MaskVT, V1);
28687 DCI.AddToWorklist(V1.getNode());
28688 V2 = DAG.getBitcast(MaskVT, V2);
28689 DCI.AddToWorklist(V2.getNode());
28690 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
28691 DCI.AddToWorklist(VPerm2MaskOp.getNode());
28692 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
28693 DAG.getConstant(M2ZImm, DL, MVT::i8));
28694 DCI.AddToWorklist(Res.getNode());
28695 return DAG.getBitcast(RootVT, Res);
28696 }
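// Worked example (illustrative) of the VPERMIL2 index encoding above: for a
// v4f64 mask (NumMaskElts = 4, NumEltsPerLane = 2), element M = 5 (lane 1 of
// the second source) gives Index = (5 % 2) + (5 / 4) * 2 = 3, shifted left to
// 6 for the 64-bit element width, while an SM_SentinelZero element pushes 8
// and sets M2ZImm to 2 so the match bit selects zero.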
28697
28698 // If we have 3 or more shuffle instructions or a chain involving a variable
28699 // mask, we can replace them with a single PSHUFB instruction profitably.
28700 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
28701 // instructions, but in practice PSHUFB tends to be *very* fast so we're
28702 // more aggressive.
28703 if (UnaryShuffle && AllowVariableMask &&
28704 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
28705 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
28706 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
28707 SmallVector<SDValue, 16> PSHUFBMask;
28708 int NumBytes = RootVT.getSizeInBits() / 8;
28709 int Ratio = NumBytes / NumMaskElts;
28710 for (int i = 0; i < NumBytes; ++i) {
28711 int M = Mask[i / Ratio];
28712 if (M == SM_SentinelUndef) {
28713 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
28714 continue;
28715 }
28716 if (M == SM_SentinelZero) {
28717 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
28718 continue;
28719 }
28720 M = Ratio * M + i % Ratio;
28721 assert((M / 16) == (i / 16) && "Lane crossing detected");
28722 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
28723 }
28724 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
28725 Res = DAG.getBitcast(ByteVT, V1);
28726 DCI.AddToWorklist(Res.getNode());
28727 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
28728 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
28729 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
28730 DCI.AddToWorklist(Res.getNode());
28731 return DAG.getBitcast(RootVT, Res);
28732 }
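// Worked example (illustrative): for a unary v4i32 mask of
// { 2, SM_SentinelZero, 0, SM_SentinelUndef } on a 128-bit root, Ratio is
// 16 / 4 = 4 and the byte mask expands to
// { 8,9,10,11, 255,255,255,255, 0,1,2,3, u,u,u,u }; bytes with the high bit
// set (255) are zeroed by PSHUFB.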
28733
28734 // With XOP, if we have a 128-bit binary input shuffle we can always combine
28735 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
28736 // slower than PSHUFB on targets that support both.
28737 if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
28738 // VPPERM Mask Operation
28739 // Bits[4:0] - Byte Index (0 - 31)
28740 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
28741 SmallVector<SDValue, 16> VPPERMMask;
28742 int NumBytes = 16;
28743 int Ratio = NumBytes / NumMaskElts;
28744 for (int i = 0; i < NumBytes; ++i) {
28745 int M = Mask[i / Ratio];
28746 if (M == SM_SentinelUndef) {
28747 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
28748 continue;
28749 }
28750 if (M == SM_SentinelZero) {
28751 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
28752 continue;
28753 }
28754 M = Ratio * M + i % Ratio;
28755 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
28756 }
28757 MVT ByteVT = MVT::v16i8;
28758 V1 = DAG.getBitcast(ByteVT, V1);
28759 DCI.AddToWorklist(V1.getNode());
28760 V2 = DAG.getBitcast(ByteVT, V2);
28761 DCI.AddToWorklist(V2.getNode());
28762 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
28763 DCI.AddToWorklist(VPPERMMaskOp.getNode());
28764 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
28765 DCI.AddToWorklist(Res.getNode());
28766 return DAG.getBitcast(RootVT, Res);
28767 }
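// Worked example (illustrative): VPPERM addresses the 32-byte concatenation
// of both sources through bits [4:0], so a v16i8 mask element of 20 becomes
// the byte constant 20 (byte 4 of V2), while SM_SentinelZero uses 128, i.e.
// the "zero" permute operation in bits [7:5].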
28768
28769 // Failed to find any combines.
28770 return SDValue();
28771}
28772
28773// Attempt to constant fold all of the constant source ops.
28774 // Returns the folded constant if the entire shuffle folds to a constant.
28775// TODO: Extend this to merge multiple constant Ops and update the mask.
28776static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
28777 ArrayRef<int> Mask, SDValue Root,
28778 bool HasVariableMask,
28779 SelectionDAG &DAG,
28780 TargetLowering::DAGCombinerInfo &DCI,
28781 const X86Subtarget &Subtarget) {
28782 MVT VT = Root.getSimpleValueType();
28783
28784 unsigned SizeInBits = VT.getSizeInBits();
28785 unsigned NumMaskElts = Mask.size();
28786 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
28787 unsigned NumOps = Ops.size();
28788
28789 // Extract constant bits from each source op.
28790 bool OneUseConstantOp = false;
28791 SmallVector<APInt, 16> UndefEltsOps(NumOps);
28792 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
28793 for (unsigned i = 0; i != NumOps; ++i) {
28794 SDValue SrcOp = Ops[i];
28795 OneUseConstantOp |= SrcOp.hasOneUse();
28796 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
28797 RawBitsOps[i]))
28798 return SDValue();
28799 }
28800
28801 // Only fold if at least one of the constants is only used once or
28802 // the combined shuffle has included a variable mask shuffle; this
28803 // avoids constant pool bloat.
28804 if (!OneUseConstantOp && !HasVariableMask)
28805 return SDValue();
28806
28807 // Shuffle the constant bits according to the mask.
28808 APInt UndefElts(NumMaskElts, 0);
28809 APInt ZeroElts(NumMaskElts, 0);
28810 APInt ConstantElts(NumMaskElts, 0);
28811 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
28812 APInt::getNullValue(MaskSizeInBits));
28813 for (unsigned i = 0; i != NumMaskElts; ++i) {
28814 int M = Mask[i];
28815 if (M == SM_SentinelUndef) {
28816 UndefElts.setBit(i);
28817 continue;
28818 } else if (M == SM_SentinelZero) {
28819 ZeroElts.setBit(i);
28820 continue;
28821 }
28822 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
28823
28824 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
28825 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
28826
28827 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
28828 if (SrcUndefElts[SrcMaskIdx]) {
28829 UndefElts.setBit(i);
28830 continue;
28831 }
28832
28833 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
28834 APInt &Bits = SrcEltBits[SrcMaskIdx];
28835 if (!Bits) {
28836 ZeroElts.setBit(i);
28837 continue;
28838 }
28839
28840 ConstantElts.setBit(i);
28841 ConstantBitData[i] = Bits;
28842 }
28843 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
28844
28845 // Create the constant data.
28846 MVT MaskSVT;
28847 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
28848 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
28849 else
28850 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
28851
28852 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
28853
28854 SDLoc DL(Root);
28855 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
28856 DCI.AddToWorklist(CstOp.getNode());
28857 return DAG.getBitcast(VT, CstOp);
28858}
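// Minimal standalone sketch of the folding step above (illustrative only;
// plain uint64_t elements instead of APInt, and the undef/zero bookkeeping is
// omitted). The helper name is hypothetical and not part of this file.
static SmallVector<uint64_t, 16>
sketchFoldShuffleOfConstants(ArrayRef<SmallVector<uint64_t, 16>> Ops,
                             ArrayRef<int> Mask) {
  unsigned NumElts = Mask.size();
  SmallVector<uint64_t, 16> Out(NumElts, 0);
  for (unsigned i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // Undef/zero sentinel: leave the element as zero here.
    // Mask element M selects lane M % NumElts of source op M / NumElts.
    Out[i] = Ops[M / NumElts][M % NumElts];
  }
  return Out;
}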
28859
28860/// \brief Fully generic combining of x86 shuffle instructions.
28861///
28862/// This should be the last combine run over the x86 shuffle instructions. Once
28863/// they have been fully optimized, this will recursively consider all chains
28864/// of single-use shuffle instructions, build a generic model of the cumulative
28865/// shuffle operation, and check for simpler instructions which implement this
28866/// operation. We use this primarily for two purposes:
28867///
28868/// 1) Collapse generic shuffles to specialized single instructions when
28869/// equivalent. In most cases, this is just an encoding size win, but
28870/// sometimes we will collapse multiple generic shuffles into a single
28871/// special-purpose shuffle.
28872/// 2) Look for sequences of shuffle instructions with 3 or more total
28873/// instructions, and replace them with the slightly more expensive SSSE3
28874/// PSHUFB instruction if available. We do this as the last combining step
28875/// to ensure we avoid using PSHUFB if we can implement the shuffle with
28876/// a suitable short sequence of other instructions. The PSHUFB will either
28877/// use a register or have to read from memory and so is slightly (but only
28878/// slightly) more expensive than the other shuffle instructions.
28879///
28880/// Because this is inherently a quadratic operation (for each shuffle in
28881/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
28882/// This should never be an issue in practice as the shuffle lowering doesn't
28883/// produce sequences of more than 8 instructions.
28884///
28885/// FIXME: We will currently miss some cases where the redundant shuffling
28886/// would simplify under the threshold for PSHUFB formation because of
28887/// combine-ordering. To fix this, we should do the redundant instruction
28888/// combining in this recursive walk.
28889static SDValue combineX86ShufflesRecursively(
28890 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
28891 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, int Depth,
28892 bool HasVariableMask, SelectionDAG &DAG,
28893 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
28894 // Bound the depth of our recursive combine because this is ultimately
28895 // quadratic in nature.
28896 if (Depth > 8)
28897 return SDValue();
28898
28899 // Directly rip through bitcasts to find the underlying operand.
28900 SDValue Op = SrcOps[SrcOpIndex];
28901 Op = peekThroughOneUseBitcasts(Op);
28902
28903 MVT VT = Op.getSimpleValueType();
28904 if (!VT.isVector())
28905 return SDValue(); // Bail if we hit a non-vector.
28906
28907 assert(Root.getSimpleValueType().isVector() &&
28908 "Shuffles operate on vector types!");
28909 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
28910 "Can only combine shuffles of the same vector register size.");
28911
28912 // Extract target shuffle mask and resolve sentinels and inputs.
28913 SmallVector<int, 64> OpMask;
28914 SmallVector<SDValue, 2> OpInputs;
28915 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
28916 return SDValue();
28917
28918 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
28919 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
28920 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
28921
28922 // Add the inputs to the Ops list, avoiding duplicates.
28923 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
28924
28925 int InputIdx0 = -1, InputIdx1 = -1;
28926 for (int i = 0, e = Ops.size(); i < e; ++i) {
28927 SDValue BC = peekThroughBitcasts(Ops[i]);
28928 if (Input0 && BC == peekThroughBitcasts(Input0))
28929 InputIdx0 = i;
28930 if (Input1 && BC == peekThroughBitcasts(Input1))
28931 InputIdx1 = i;
28932 }
28933
28934 if (Input0 && InputIdx0 < 0) {
28935 InputIdx0 = SrcOpIndex;
28936 Ops[SrcOpIndex] = Input0;
28937 }
28938 if (Input1 && InputIdx1 < 0) {
28939 InputIdx1 = Ops.size();
28940 Ops.push_back(Input1);
28941 }
28942
28943 assert(((RootMask.size() > OpMask.size() &&
28944 RootMask.size() % OpMask.size() == 0) ||
28945 (OpMask.size() > RootMask.size() &&
28946 OpMask.size() % RootMask.size() == 0) ||
28947 OpMask.size() == RootMask.size()) &&
28948 "The smaller number of elements must divide the larger.");
28949
28950 // This function can be performance-critical, so we rely on the power-of-2
28951 // knowledge that we have about the mask sizes to replace div/rem ops with
28952 // bit-masks and shifts.
28953 assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
28954 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
28955 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
28956 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
28957
28958 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
28959 unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
28960 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
28961 assert((RootRatio == 1 || OpRatio == 1) &&
28962 "Must not have a ratio for both incoming and op masks!");
28963
28964 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
28965 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
28966 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
28967 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
28968 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
28969
28970 SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
28971
28972 // Merge this shuffle operation's mask into our accumulated mask. Note that
28973 // this shuffle's mask will be the first applied to the input, followed by the
28974 // root mask to get us all the way to the root value arrangement. The reason
28975 // for this order is that we are recursing up the operation chain.
28976 for (unsigned i = 0; i < MaskWidth; ++i) {
28977 unsigned RootIdx = i >> RootRatioLog2;
28978 if (RootMask[RootIdx] < 0) {
28979 // This is a zero or undef lane, we're done.
28980 Mask[i] = RootMask[RootIdx];
28981 continue;
28982 }
28983
28984 unsigned RootMaskedIdx =
28985 RootRatio == 1
28986 ? RootMask[RootIdx]
28987 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
28988
28989 // Just insert the scaled root mask value if it references an input other
28990 // than the SrcOp we're currently inserting.
28991 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
28992 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
28993 Mask[i] = RootMaskedIdx;
28994 continue;
28995 }
28996
28997 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
28998 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
28999 if (OpMask[OpIdx] < 0) {
29000 // The incoming lanes are zero or undef, it doesn't matter which ones we
29001 // are using.
29002 Mask[i] = OpMask[OpIdx];
29003 continue;
29004 }
29005
29006 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
29007 unsigned OpMaskedIdx =
29008 OpRatio == 1
29009 ? OpMask[OpIdx]
29010 : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
29011
29012 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
29013 if (OpMask[OpIdx] < (int)OpMask.size()) {
29014 assert(0 <= InputIdx0 && "Unknown target shuffle input");
29015 OpMaskedIdx += InputIdx0 * MaskWidth;
29016 } else {
29017 assert(0 <= InputIdx1 && "Unknown target shuffle input");
29018 OpMaskedIdx += InputIdx1 * MaskWidth;
29019 }
29020
29021 Mask[i] = OpMaskedIdx;
29022 }
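// Worked example (illustrative) of the ratio scaling above: a 4-element
// RootMask of { 2, 3, 0, 1 } feeding an op with an 8-element OpMask gives
// MaskWidth = 8, RootRatio = 2 and OpRatio = 1, so root element 0 expands to
// the scaled indices 4 and 5, which are then looked up in OpMask and offset
// by the matching input's position in Ops to form the merged 8-element mask.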
29023
29024 // Handle the all undef/zero cases early.
29025 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
29026 return DAG.getUNDEF(Root.getValueType());
29027
29028 // TODO - should we handle the mixed zero/undef case as well? Just returning
29029 // a zero mask will lose information on undef elements possibly reducing
29030 // future combine possibilities.
29031 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
29032 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
29033 SDLoc(Root));
29034
29035 // Remove unused shuffle source ops.
29036 resolveTargetShuffleInputsAndMask(Ops, Mask);
29037 assert(!Ops.empty() && "Shuffle with no inputs detected");
29038
29039 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
29040
29041 // Update the list of shuffle nodes that have been combined so far.
29042 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
29043 SrcNodes.end());
29044 CombinedNodes.push_back(Op.getNode());
29045
29046 // See if we can recurse into each shuffle source op (if it's a target
29047 // shuffle). The source op should only be combined if it either has a
29048 // single use (i.e. current Op) or all its users have already been combined.
29049 for (int i = 0, e = Ops.size(); i < e; ++i)
29050 if (Ops[i].getNode()->hasOneUse() ||
29051 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
29052 if (SDValue Res = combineX86ShufflesRecursively(
29053 Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
29054 DAG, DCI, Subtarget))
29055 return Res;
29056
29057 // Attempt to constant fold all of the constant source ops.
29058 if (SDValue Cst = combineX86ShufflesConstants(
29059 Ops, Mask, Root, HasVariableMask, DAG, DCI, Subtarget))
29060 return Cst;
29061
29062 // We can only combine unary and binary shuffle mask cases.
29063 if (Ops.size() > 2)
29064 return SDValue();
29065
29066 // Minor canonicalization of the accumulated shuffle mask to make it easier
29067 // to match below. All this does is detect masks with sequential pairs of
29068 // elements, and shrink them to the half-width mask. It does this in a loop
29069 // so it will reduce the size of the mask to the minimal width mask which
29070 // performs an equivalent shuffle.
29071 SmallVector<int, 64> WidenedMask;
29072 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
29073 Mask = std::move(WidenedMask);
29074 }
29075
29076 // Canonicalization of binary shuffle masks to improve pattern matching by
29077 // commuting the inputs.
29078 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
29079 ShuffleVectorSDNode::commuteMask(Mask);
29080 std::swap(Ops[0], Ops[1]);
29081 }
29082
29083 // Finally, try to combine into a single shuffle instruction.
29084 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
29085 DCI, Subtarget);
29086}
29087
29088/// \brief Get the PSHUF-style mask from PSHUF node.
29089///
29090 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
29091/// PSHUF-style masks that can be reused with such instructions.
29092static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
29093 MVT VT = N.getSimpleValueType();
29094 SmallVector<int, 4> Mask;
29095 SmallVector<SDValue, 2> Ops;
29096 bool IsUnary;
29097 bool HaveMask =
29098 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
29099 (void)HaveMask;
29100 assert(HaveMask);
29101
29102 // If we have more than 128-bits, only the low 128-bits of shuffle mask
29103 // matter. Check that the upper masks are repeats and remove them.
29104 if (VT.getSizeInBits() > 128) {
29105 int LaneElts = 128 / VT.getScalarSizeInBits();
29106#ifndef NDEBUG
29107 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
29108 for (int j = 0; j < LaneElts; ++j)
29109 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
29110 "Mask doesn't repeat in high 128-bit lanes!");
29111#endif
29112 Mask.resize(LaneElts);
29113 }
29114
29115 switch (N.getOpcode()) {
29116 case X86ISD::PSHUFD:
29117 return Mask;
29118 case X86ISD::PSHUFLW:
29119 Mask.resize(4);
29120 return Mask;
29121 case X86ISD::PSHUFHW:
29122 Mask.erase(Mask.begin(), Mask.begin() + 4);
29123 for (int &M : Mask)
29124 M -= 4;
29125 return Mask;
29126 default:
29127 llvm_unreachable("No valid shuffle instruction found!")::llvm::llvm_unreachable_internal("No valid shuffle instruction found!"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29127)
;
29128 }
29129}
29130
29131/// \brief Search for a combinable shuffle across a chain ending in pshufd.
29132///
29133/// We walk up the chain and look for a combinable shuffle, skipping over
29134/// shuffles that we could hoist this shuffle's transformation past without
29135/// altering anything.
29136static SDValue
29137combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
29138 SelectionDAG &DAG) {
29139 assert(N.getOpcode() == X86ISD::PSHUFD &&
29140 "Called with something other than an x86 128-bit half shuffle!");
29141 SDLoc DL(N);
29142
29143 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
29144 // of the shuffles in the chain so that we can form a fresh chain to replace
29145 // this one.
29146 SmallVector<SDValue, 8> Chain;
29147 SDValue V = N.getOperand(0);
29148 for (; V.hasOneUse(); V = V.getOperand(0)) {
29149 switch (V.getOpcode()) {
29150 default:
29151 return SDValue(); // Nothing combined!
29152
29153 case ISD::BITCAST:
29154 // Skip bitcasts as we always know the type for the target specific
29155 // instructions.
29156 continue;
29157
29158 case X86ISD::PSHUFD:
29159 // Found another dword shuffle.
29160 break;
29161
29162 case X86ISD::PSHUFLW:
29163 // Check that the low words (being shuffled) are the identity in the
29164 // dword shuffle, and the high words are self-contained.
29165 if (Mask[0] != 0 || Mask[1] != 1 ||
29166 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
29167 return SDValue();
29168
29169 Chain.push_back(V);
29170 continue;
29171
29172 case X86ISD::PSHUFHW:
29173 // Check that the high words (being shuffled) are the identity in the
29174 // dword shuffle, and the low words are self-contained.
29175 if (Mask[2] != 2 || Mask[3] != 3 ||
29176 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
29177 return SDValue();
29178
29179 Chain.push_back(V);
29180 continue;
29181
29182 case X86ISD::UNPCKL:
29183 case X86ISD::UNPCKH:
29184 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
29185 // shuffle into a preceding word shuffle.
29186 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
29187 V.getSimpleValueType().getVectorElementType() != MVT::i16)
29188 return SDValue();
29189
29190 // Search for a half-shuffle which we can combine with.
29191 unsigned CombineOp =
29192 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
29193 if (V.getOperand(0) != V.getOperand(1) ||
29194 !V->isOnlyUserOf(V.getOperand(0).getNode()))
29195 return SDValue();
29196 Chain.push_back(V);
29197 V = V.getOperand(0);
29198 do {
29199 switch (V.getOpcode()) {
29200 default:
29201 return SDValue(); // Nothing to combine.
29202
29203 case X86ISD::PSHUFLW:
29204 case X86ISD::PSHUFHW:
29205 if (V.getOpcode() == CombineOp)
29206 break;
29207
29208 Chain.push_back(V);
29209
29210 LLVM_FALLTHROUGH;
29211 case ISD::BITCAST:
29212 V = V.getOperand(0);
29213 continue;
29214 }
29215 break;
29216 } while (V.hasOneUse());
29217 break;
29218 }
29219 // Break out of the loop if we break out of the switch.
29220 break;
29221 }
29222
29223 if (!V.hasOneUse())
29224 // We fell out of the loop without finding a viable combining instruction.
29225 return SDValue();
29226
29227 // Merge this node's mask and our incoming mask.
29228 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
29229 for (int &M : Mask)
29230 M = VMask[M];
29231 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
29232 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
29233
29234 // Rebuild the chain around this new shuffle.
29235 while (!Chain.empty()) {
29236 SDValue W = Chain.pop_back_val();
29237
29238 if (V.getValueType() != W.getOperand(0).getValueType())
29239 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
29240
29241 switch (W.getOpcode()) {
29242 default:
29243 llvm_unreachable("Only PSHUF and UNPCK instructions get here!")::llvm::llvm_unreachable_internal("Only PSHUF and UNPCK instructions get here!"
, "/build/llvm-toolchain-snapshot-6.0~svn320613/lib/Target/X86/X86ISelLowering.cpp"
, 29243)
;
29244
29245 case X86ISD::UNPCKL:
29246 case X86ISD::UNPCKH:
29247 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
29248 break;
29249
29250 case X86ISD::PSHUFD:
29251 case X86ISD::PSHUFLW:
29252 case X86ISD::PSHUFHW:
29253 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
29254 break;
29255 }
29256 }
29257 if (V.getValueType() != N.getValueType())
29258 V = DAG.getBitcast(N.getValueType(), V);
29259
29260 // Return the new chain to replace N.
29261 return V;
29262}
29263
29264/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
29265/// pshufhw.
29266///
29267/// We walk up the chain, skipping shuffles of the other half and looking
29268/// through shuffles which switch halves trying to find a shuffle of the same
29269/// pair of dwords.
29270static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
29271 SelectionDAG &DAG,
29272 TargetLowering::DAGCombinerInfo &DCI) {
29273 assert(
29274 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
29275 "Called with something other than an x86 128-bit half shuffle!");
29276 SDLoc DL(N);
29277 unsigned CombineOpcode = N.getOpcode();
29278
29279 // Walk up a single-use chain looking for a combinable shuffle.
29280 SDValue V = N.getOperand(0);
29281 for (; V.hasOneUse(); V = V.getOperand(0)) {
29282 switch (V.getOpcode()) {
29283 default:
29284 return false; // Nothing combined!
29285
29286 case ISD::BITCAST:
29287 // Skip bitcasts as we always know the type for the target specific
29288 // instructions.
29289 continue;
29290
29291 case X86ISD::PSHUFLW:
29292 case X86ISD::PSHUFHW:
29293 if (V.getOpcode() == CombineOpcode)
29294 break;
29295
29296 // Other-half shuffles are no-ops.
29297 continue;
29298 }
29299 // Break out of the loop if we break out of the switch.
29300 break;
29301 }
29302
29303 if (!V.hasOneUse())
29304 // We fell out of the loop without finding a viable combining instruction.
29305 return false;
29306
29307 // Combine away the bottom node as its shuffle will be accumulated into
29308 // a preceding shuffle.
29309 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
29310
29311 // Record the old value.
29312 SDValue Old = V;
29313
29314 // Merge this node's mask and our incoming mask (adjusted to account for all
29315 // the pshufd instructions encountered).
29316 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
29317 for (int &M : Mask)
29318 M = VMask[M];
29319 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
29320 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
29321
29322 // Check that the shuffles didn't cancel each other out. If not, we need to
29323 // combine to the new one.
29324 if (Old != V)
29325 // Replace the combinable shuffle with the combined one, updating all users
29326 // so that we re-evaluate the chain here.
29327 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
29328
29329 return true;
29330}
29331
29332/// \brief Try to combine x86 target specific shuffles.
29333static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
29334 TargetLowering::DAGCombinerInfo &DCI,
29335 const X86Subtarget &Subtarget) {
29336 SDLoc DL(N);
29337 MVT VT = N.getSimpleValueType();
29338 SmallVector<int, 4> Mask;
29339 unsigned Opcode = N.getOpcode();
29340
29341 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
29342 // single instruction.
29343 if (VT.getScalarSizeInBits() == 64 &&
29344 (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
29345 Opcode == X86ISD::UNPCKL)) {
29346 auto BC0 = peekThroughBitcasts(N.getOperand(0));
29347 auto BC1 = peekThroughBitcasts(N.getOperand(1));
29348 EVT VT0 = BC0.getValueType();
29349 EVT VT1 = BC1.getValueType();
29350 unsigned Opcode0 = BC0.getOpcode();
29351 unsigned Opcode1 = BC1.getOpcode();
29352 if (Opcode0 == Opcode1 && VT0 == VT1 &&
29353 (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
29354 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
29355 Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
29356 SDValue Lo, Hi;
29357 if (Opcode == X86ISD::MOVSD) {
29358 Lo = BC1.getOperand(0);
29359 Hi = BC0.getOperand(1);
29360 } else {
29361 Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
29362 Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
29363 }
29364 SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
29365 DCI.AddToWorklist(Horiz.getNode());
29366 return DAG.getBitcast(VT, Horiz);
29367 }
29368 }
29369
29370 switch (Opcode) {
29371 case X86ISD::PSHUFD:
29372 case X86ISD::PSHUFLW:
29373 case X86ISD::PSHUFHW:
29374 Mask = getPSHUFShuffleMask(N);
29375 assert(Mask.size() == 4);
29376 break;
29377 case X86ISD::UNPCKL: {
29378 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
29379 // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
29380 // moves upper half elements into the lower half part. For example:
29381 //
29382 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
29383 // undef:v16i8
29384 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
29385 //
29386 // will be combined to:
29387 //
29388 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
29389
29390 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
29391 // happen due to advanced instructions.
29392 if (!VT.is128BitVector())
29393 return SDValue();
29394
29395 auto Op0 = N.getOperand(0);
29396 auto Op1 = N.getOperand(1);
29397 if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {
29398 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
29399
29400 unsigned NumElts = VT.getVectorNumElements();
29401 SmallVector<int, 8> ExpectedMask(NumElts, -1);
29402 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
29403 NumElts / 2);
29404
29405 auto ShufOp = Op1.getOperand(0);
29406 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
29407 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
29408 }
29409 return SDValue();
29410 }
29411 case X86ISD::BLENDI: {
29412 SDValue V0 = N->getOperand(0);
29413 SDValue V1 = N->getOperand(1);
29414 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
29415 "Unexpected input vector types");
29416
29417 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
29418 // operands and changing the mask to 1. This saves us a bunch of
29419 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
29420 // x86InstrInfo knows how to commute this back after instruction selection
29421 // if it would help register allocation.
29422
29423 // TODO: If optimizing for size or a processor that doesn't suffer from
29424 // partial register update stalls, this should be transformed into a MOVSD
29425 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
29426
29427 if (VT == MVT::v2f64)
29428 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
29429 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
29430 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
29431 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
29432 }
29433
29434 return SDValue();
29435 }
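// Worked example (illustrative): for v2f64, blend(V0, V1, imm = 2) takes
// lane 0 from V0 and lane 1 from V1; after the swap above, blend(V1, V0,
// imm = 1) selects exactly the same lanes, so the result is unchanged.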
29436 case X86ISD::MOVSD:
29437 case X86ISD::MOVSS: {
29438 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
29439 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
29440 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
29441 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
29442 if (isZero0 && isZero1)
29443 return SDValue();
29444
29445 // We often lower to MOVSD/MOVSS from integer as well as native float
29446 // types; remove unnecessary domain-crossing bitcasts if we can to make it
29447 // easier to combine shuffles later on. We've already accounted for the
29448 // domain switching cost when we decided to lower with it.
29449 bool isFloat = VT.isFloatingPoint();
29450 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
29451 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
29452 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
29453 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
29454 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
29455 V0 = DAG.getBitcast(NewVT, V0);
29456 V1 = DAG.getBitcast(NewVT, V1);
29457 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
29458 }
29459
29460 return SDValue();
29461 }
29462 case X86ISD::INSERTPS: {
29463 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
29464 SDValue Op0 = N.getOperand(0);
29465 SDValue Op1 = N.getOperand(1);
29466 SDValue Op2 = N.getOperand(2);
29467 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
29468 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
29469 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
29470 unsigned ZeroMask = InsertPSMask & 0xF;
29471
29472 // If we zero out all elements from Op0 then we don't need to reference it.
29473 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
29474 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
29475 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29476
29477 // If we zero out the element from Op1 then we don't need to reference it.
29478 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
29479 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
29480 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29481
29482 // Attempt to merge insertps Op1 with an inner target shuffle node.
29483 SmallVector<int, 8> TargetMask1;
29484 SmallVector<SDValue, 2> Ops1;
29485 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
29486 int M = TargetMask1[SrcIdx];
29487 if (isUndefOrZero(M)) {
29488 // Zero/UNDEF insertion - zero out element and remove dependency.
29489 InsertPSMask |= (1u << DstIdx);
29490 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
29491 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29492 }
29493 // Update insertps mask srcidx and reference the source input directly.
29494 assert(0 <= M && M < 8 && "Shuffle index out of range");
29495 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
29496 Op1 = Ops1[M < 4 ? 0 : 1];
29497 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
29498 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29499 }
29500
29501 // Attempt to merge insertps Op0 with an inner target shuffle node.
29502 SmallVector<int, 8> TargetMask0;
29503 SmallVector<SDValue, 2> Ops0;
29504 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
29505 return SDValue();
29506
29507 bool Updated = false;
29508 bool UseInput00 = false;
29509 bool UseInput01 = false;
29510 for (int i = 0; i != 4; ++i) {
29511 int M = TargetMask0[i];
29512 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
29513 // No change if element is already zero or the inserted element.
29514 continue;
29515 } else if (isUndefOrZero(M)) {
29516 // If the target mask is undef/zero then we must zero the element.
29517 InsertPSMask |= (1u << i);
29518 Updated = true;
29519 continue;
29520 }
29521
29522 // The input vector element must be inline.
29523 if (M != i && M != (i + 4))
29524 return SDValue();
29525
29526 // Determine which inputs of the target shuffle we're using.
29527 UseInput00 |= (0 <= M && M < 4);
29528 UseInput01 |= (4 <= M);
29529 }
29530
29531 // If we're not using both inputs of the target shuffle then use the
29532 // referenced input directly.
29533 if (UseInput00 && !UseInput01) {
29534 Updated = true;
29535 Op0 = Ops0[0];
29536 } else if (!UseInput00 && UseInput01) {
29537 Updated = true;
29538 Op0 = Ops0[1];
29539 }
29540
29541 if (Updated)
29542 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
29543 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29544
29545 return SDValue();
29546 }
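// Worked example (illustrative) of the INSERTPS immediate decoding above: a
// mask of 0x98 (binary 10'01'1000) means SrcIdx = 2, DstIdx = 1 and
// ZeroMask = 0b1000, i.e. copy element 2 of Op1 into element 1 of Op0 and
// zero element 3 of the result.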
29547 default:
29548 return SDValue();
29549 }
29550
29551 // Nuke no-op shuffles that show up after combining.
29552 if (isNoopShuffleMask(Mask))
29553 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
29554
29555 // Look for simplifications involving one or two shuffle instructions.
29556 SDValue V = N.getOperand(0);
29557 switch (N.getOpcode()) {
29558 default:
29559 break;
29560 case X86ISD::PSHUFLW:
29561 case X86ISD::PSHUFHW:
29562 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
29563
29564 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
29565 return SDValue(); // We combined away this shuffle, so we're done.
29566
29567 // See if this reduces to a PSHUFD which is no more expensive and can
29568 // combine with more operations. Note that it has to at least flip the
29569 // dwords as otherwise it would have been removed as a no-op.
29570 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
29571 int DMask[] = {0, 1, 2, 3};
29572 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
29573 DMask[DOffset + 0] = DOffset + 1;
29574 DMask[DOffset + 1] = DOffset + 0;
29575 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
29576 V = DAG.getBitcast(DVT, V);
29577 DCI.AddToWorklist(V.getNode());
29578 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
29579 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
29580 DCI.AddToWorklist(V.getNode());
29581 return DAG.getBitcast(VT, V);
29582 }
29583
29584 // Look for shuffle patterns which can be implemented as a single unpack.
29585 // FIXME: This doesn't handle the location of the PSHUFD generically, and
29586 // only works when we have a PSHUFD followed by two half-shuffles.
29587 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
29588 (V.getOpcode() == X86ISD::PSHUFLW ||
29589 V.getOpcode() == X86ISD::PSHUFHW) &&
29590 V.getOpcode() != N.getOpcode() &&
29591 V.hasOneUse()) {
29592 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
29593 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
29594 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
29595 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
29596 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
29597 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
29598 int WordMask[8];
29599 for (int i = 0; i < 4; ++i) {
29600 WordMask[i + NOffset] = Mask[i] + NOffset;
29601 WordMask[i + VOffset] = VMask[i] + VOffset;
29602 }
29603 // Map the word mask through the DWord mask.
29604 int MappedMask[8];
29605 for (int i = 0; i < 8; ++i)
29606 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
29607 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
29608 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
29609 // We can replace all three shuffles with an unpack.
29610 V = DAG.getBitcast(VT, D.getOperand(0));
29611 DCI.AddToWorklist(V.getNode());
29612 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
29613 : X86ISD::UNPCKH,
29614 DL, VT, V, V);
29615 }
29616 }
29617 }
29618
29619 break;
29620
29621 case X86ISD::PSHUFD:
29622 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
29623 return NewN;
29624
29625 break;
29626 }
29627
29628 return SDValue();
29629}
29630
29631 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB
29632 /// operation. If true is returned then the operands of the ADDSUB operation
29633/// are written to the parameters \p Opnd0 and \p Opnd1.
29634///
29635/// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
29636/// so it is easier to generically match. We also insert dummy vector shuffle
29637/// nodes for the operands which explicitly discard the lanes which are unused
29638/// by this operation to try to flow through the rest of the combiner
29639/// the fact that they're unused.
29640static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
29641 SDValue &Opnd0, SDValue &Opnd1) {
29642
29643 EVT VT = N->getValueType(0);
29644 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
29645 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
29646 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
29647 return false;
29648
29649 // We only handle target-independent shuffles.
29650 // FIXME: It would be easy and harmless to use the target shuffle mask
29651 // extraction tool to support more.
29652 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
29653 return false;
29654
29655 ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
29656 SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
29657
29658 SDValue V1 = N->getOperand(0);
29659 SDValue V2 = N->getOperand(1);
29660
29661 // We require the first shuffle operand to be the FSUB node, and the second to
29662 // be the FADD node.
29663 if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
29664 ShuffleVectorSDNode::commuteMask(Mask);
29665 std::swap(V1, V2);
29666 } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
29667 return false;
29668
29669 // If there are other uses of these operations we can't fold them.
29670 if (!V1->hasOneUse() || !V2->hasOneUse())
29671 return false;
29672
29673 // Ensure that both operations have the same operands. Note that we can
29674 // commute the FADD operands.
29675 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
29676 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
29677 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
29678 return false;
29679
29680 // We're looking for blends between FADD and FSUB nodes. We insist on these
29681 // nodes being lined up in a specific expected pattern.
29682 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
29683 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
29684 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
29685 isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
29686 8, 25, 10, 27, 12, 29, 14, 31})))
29687 return false;
29688
29689 Opnd0 = LHS;
29690 Opnd1 = RHS;
29691 return true;
29692}
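// [editor sketch, not part of the analyzed file] A minimal scalar model of
// the v4f32 ADDSUB matched above: even lanes take the FSUB result and odd
// lanes take the FADD result, which is what the {0, 5, 2, 7} blend of an
// FSUB (operand 0) and an FADD (operand 1) selects.
static void addsubV4F32Model(const float A[4], const float B[4], float R[4]) {
  for (int i = 0; i != 4; ++i)
    R[i] = (i & 1) ? A[i] + B[i] : A[i] - B[i];
}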
29693
29694/// \brief Try to combine a shuffle into a target-specific add-sub or
29695/// mul-add-sub node.
29696static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
29697 const X86Subtarget &Subtarget,
29698 SelectionDAG &DAG) {
29699 SDValue Opnd0, Opnd1;
29700 if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
29701 return SDValue();
29702
29703 EVT VT = N->getValueType(0);
29704 SDLoc DL(N);
29705
29706 // Try to generate X86ISD::FMADDSUB node here.
29707 SDValue Opnd2;
29708 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
29709 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
29710
29711 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
29712 // the ADDSUB idiom has been successfully recognized. There are no known
29713 // X86 targets with 512-bit ADDSUB instructions!
29714 if (VT.is512BitVector())
29715 return SDValue();
29716
29717 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
29718}
29719
29720// We are looking for a shuffle where both sources are concatenated with undef
29721// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
29722// if we can express this as a single-source shuffle, that's preferable.
29723static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
29724 const X86Subtarget &Subtarget) {
29725 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
29726 return SDValue();
29727
29728 EVT VT = N->getValueType(0);
29729
29730 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
29731 if (!VT.is128BitVector() && !VT.is256BitVector())
29732 return SDValue();
29733
29734 if (VT.getVectorElementType() != MVT::i32 &&
29735 VT.getVectorElementType() != MVT::i64 &&
29736 VT.getVectorElementType() != MVT::f32 &&
29737 VT.getVectorElementType() != MVT::f64)
29738 return SDValue();
29739
29740 SDValue N0 = N->getOperand(0);
29741 SDValue N1 = N->getOperand(1);
29742
29743 // Check that both sources are concats with undef.
29744 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
29745 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
29746 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
29747 !N1.getOperand(1).isUndef())
29748 return SDValue();
29749
29750 // Construct the new shuffle mask. Elements from the first source retain their
29751 // index, but elements from the second source no longer need to skip an undef.
29752 SmallVector<int, 8> Mask;
29753 int NumElts = VT.getVectorNumElements();
29754
29755 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
29756 for (int Elt : SVOp->getMask())
29757 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
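// [editor example] For a v8i32 shuffle of concat(t1, undef) and
// concat(t2, undef): a mask element 9 originally names t2[1], since the
// second source starts at index 8; in the shuffle of concat(t1, t2) that
// same element sits at index 9 - NumElts / 2 = 5, while indices below
// NumElts are unchanged.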
29758
29759 SDLoc DL(N);
29760 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
29761 N1.getOperand(0));
29762 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
29763}
29764
29765/// Eliminate a redundant shuffle of a horizontal math op.
29766static SDValue foldShuffleOfHorizOp(SDNode *N) {
29767 if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
29768 return SDValue();
29769
29770 SDValue HOp = N->getOperand(0);
29771 if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
29772 HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
29773 return SDValue();
29774
29775 // 128-bit horizontal math instructions are defined to operate on adjacent
29776 // lanes of each operand as:
29777 // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
29778 // ...similarly for v2f64 and v8i16.
29779 // TODO: 256-bit is not the same because...x86.
29780 if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128)
29781 return SDValue();
29782
29783 // When the operands of a horizontal math op are identical, the low half of
29784 // the result is the same as the high half. If the shuffle is also replicating
29785 // low and high halves, we don't need the shuffle.
29786 // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
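// [editor example] With X = <x0, x1, x2, x3>, hadd X, X produces
// <x0+x1, x2+x3, x0+x1, x2+x3>; a following shuffle with mask {0, 1, 0, 1}
// reproduces exactly that value, so it can be dropped.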
29787 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
29788 // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
29789 // but this should be tied to whatever horizontal op matching and shuffle
29790 // canonicalization are producing.
29791 if (isTargetShuffleEquivalent(Mask, { 0, 0 }) ||
29792 isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) ||
29793 isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 }))
29794 return HOp;
29795
29796 return SDValue();
29797}
29798
29799static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
29800 TargetLowering::DAGCombinerInfo &DCI,
29801 const X86Subtarget &Subtarget) {
29802 SDLoc dl(N);
29803 EVT VT = N->getValueType(0);
29804 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29805 // If we have legalized the vector types, look for blends of FADD and FSUB
29806 // nodes that we can fuse into an ADDSUB node.
29807 if (TLI.isTypeLegal(VT)) {
1. Taking false branch
29808 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
29809 return AddSub;
29810
29811 if (SDValue HAddSub = foldShuffleOfHorizOp(N))
29812 return HAddSub;
29813 }
29814
29815 // During Type Legalization, when promoting illegal vector types,
29816 // the backend might introduce new shuffle dag nodes and bitcasts.
29817 //
29818 // This code performs the following transformation:
29819 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
29820 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
29821 //
29822 // We do this only if both the bitcast and the BINOP dag nodes have
29823 // one use. Also, perform this transformation only if the new binary
29824 // operation is legal. This is to avoid introducing dag nodes that
29825 // potentially need to be further expanded (or custom lowered) into a
29826 // less optimal sequence of dag nodes.
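// [editor example] One instance: a v4i32 ADD whose result is bitcast to
// v8i16 and shuffled with <0,2,4,6,u,u,u,u>. Performing the ADD directly on
// the v8i16 bitcasts of A and B gives the same even elements (the low 16
// bits of each 32-bit sum), so the bitcasts can be pushed onto the operands
// and the ADD done in the shuffled type.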
29827 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
29828 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
29829 N->getOperand(0).getOpcode() == ISD::BITCAST &&
29830 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
29831 SDValue N0 = N->getOperand(0);
29832 SDValue N1 = N->getOperand(1);
29833
29834 SDValue BC0 = N0.getOperand(0);
29835 EVT SVT = BC0.getValueType();
29836 unsigned Opcode = BC0.getOpcode();
29837 unsigned NumElts = VT.getVectorNumElements();
29838
29839 if (BC0.hasOneUse() && SVT.isVector() &&
29840 SVT.getVectorNumElements() * 2 == NumElts &&
29841 TLI.isOperationLegal(Opcode, VT)) {
29842 bool CanFold = false;
29843 switch (Opcode) {
29844 default : break;
29845 case ISD::ADD:
29846 case ISD::SUB:
29847 case ISD::MUL:
29848 // isOperationLegal lies for integer ops on floating point types.
29849 CanFold = VT.isInteger();
29850 break;
29851 case ISD::FADD:
29852 case ISD::FSUB:
29853 case ISD::FMUL:
29854 // isOperationLegal lies for floating point ops on integer types.
29855 CanFold = VT.isFloatingPoint();
29856 break;
29857 }
29858
29859 unsigned SVTNumElts = SVT.getVectorNumElements();
29860 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
29861 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
29862 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
29863 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
29864 CanFold = SVOp->getMaskElt(i) < 0;
29865
29866 if (CanFold) {
29867 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
29868 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
29869 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
29870 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
29871 }
29872 }
29873 }
29874
29875 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
29876 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
29877 // consecutive, non-overlapping, and in the right order.
29878 SmallVector<SDValue, 16> Elts;
29879 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
2. Assuming 'i' is equal to 'e'
3. Loop condition is false. Execution continues on line 29888
29880 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
29881 Elts.push_back(Elt);
29882 continue;
29883 }
29884 Elts.clear();
29885 break;
29886 }
29887
29888 if (Elts.size() == VT.getVectorNumElements())
4. Taking true branch
29889 if (SDValue LD =
29890 EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
5. Calling 'EltsFromConsecutiveLoads'
29891 return LD;
29892
29893 // For AVX2, we sometimes want to combine
29894 // (vector_shuffle <mask> (concat_vectors t1, undef)
29895 // (concat_vectors t2, undef))
29896 // Into:
29897 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
29898 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
29899 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
29900 return ShufConcat;
29901
29902 if (isTargetShuffle(N->getOpcode())) {
29903 SDValue Op(N, 0);
29904 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
29905 return Shuffle;
29906
29907 // Try recursively combining arbitrary sequences of x86 shuffle
29908 // instructions into higher-order shuffles. We do this after combining
29909 // specific PSHUF instruction sequences into their minimal form so that we
29910 // can evaluate how many specialized shuffle instructions are involved in
29911 // a particular chain.
29912 if (SDValue Res = combineX86ShufflesRecursively(
29913 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
29914 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
29915 DCI.CombineTo(N, Res);
29916 return SDValue();
29917 }
29918 }
29919
29920 return SDValue();
29921}
29922
29923/// Check if a vector extract from a target-specific shuffle of a load can be
29924/// folded into a single element load.
29925/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
29926/// shuffles have been custom lowered so we need to handle those here.
29927static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
29928 TargetLowering::DAGCombinerInfo &DCI) {
29929 if (DCI.isBeforeLegalizeOps())
29930 return SDValue();
29931
29932 SDValue InVec = N->getOperand(0);
29933 SDValue EltNo = N->getOperand(1);
29934 EVT EltVT = N->getValueType(0);
29935
29936 if (!isa<ConstantSDNode>(EltNo))
29937 return SDValue();
29938
29939 EVT OriginalVT = InVec.getValueType();
29940
29941 // Peek through bitcasts, don't duplicate a load with other uses.
29942 InVec = peekThroughOneUseBitcasts(InVec);
29943
29944 EVT CurrentVT = InVec.getValueType();
29945 if (!CurrentVT.isVector() ||
29946 CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
29947 return SDValue();
29948
29949 if (!isTargetShuffle(InVec.getOpcode()))
29950 return SDValue();
29951
29952 // Don't duplicate a load with other uses.
29953 if (!InVec.hasOneUse())
29954 return SDValue();
29955
29956 SmallVector<int, 16> ShuffleMask;
29957 SmallVector<SDValue, 2> ShuffleOps;
29958 bool UnaryShuffle;
29959 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
29960 ShuffleOps, ShuffleMask, UnaryShuffle))
29961 return SDValue();
29962
29963 // Select the input vector, guarding against out of range extract vector.
29964 unsigned NumElems = CurrentVT.getVectorNumElements();
29965 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
29966 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
29967
29968 if (Idx == SM_SentinelZero)
29969 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
29970 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
29971 if (Idx == SM_SentinelUndef)
29972 return DAG.getUNDEF(EltVT);
29973
29974 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
29975 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
29976 : ShuffleOps[1];
29977
29978 // If inputs to shuffle are the same for both ops, then allow 2 uses
29979 unsigned AllowedUses =
29980 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
29981
29982 if (LdNode.getOpcode() == ISD::BITCAST) {
29983 // Don't duplicate a load with other uses.
29984 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
29985 return SDValue();
29986
29987 AllowedUses = 1; // only allow 1 load use if we have a bitcast
29988 LdNode = LdNode.getOperand(0);
29989 }
29990
29991 if (!ISD::isNormalLoad(LdNode.getNode()))
29992 return SDValue();
29993
29994 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
29995
29996 if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
29997 return SDValue();
29998
29999 // If there's a bitcast before the shuffle, check if the load type and
30000 // alignment are valid.
30001 unsigned Align = LN0->getAlignment();
30002 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30003 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
30004 EltVT.getTypeForEVT(*DAG.getContext()));
30005
30006 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
30007 return SDValue();
30008
30009 // All checks match, so transform back to vector_shuffle so that the DAG
30010 // combiner can finish the job.
30011 SDLoc dl(N);
30012
30013 // Create a shuffle node, taking into account the case that it's a unary shuffle.
30014 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
30015 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
30016 ShuffleMask);
30017 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
30018 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
30019 EltNo);
30020}
30021
30022// Try to match patterns such as
30023// (i16 bitcast (v16i1 x))
30024// ->
30025// (i16 movmsk (16i8 sext (v16i1 x)))
30026// before the illegal vector is scalarized on subtargets that don't have legal
30027// vxi1 types.
30028static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
30029 const X86Subtarget &Subtarget) {
30030 EVT VT = BitCast.getValueType();
30031 SDValue N0 = BitCast.getOperand(0);
30032 EVT VecVT = N0->getValueType(0);
30033
30034 if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() &&
30035 N0->getOpcode() == ISD::OR) {
30036 SDValue Op0 = N0->getOperand(0);
30037 SDValue Op1 = N0->getOperand(1);
30038 MVT TrunckVT;
30039 MVT BitcastVT;
30040 switch (VT.getSimpleVT().SimpleTy) {
30041 default:
30042 return SDValue();
30043 case MVT::v16i1:
30044 TrunckVT = MVT::i8;
30045 BitcastVT = MVT::v8i1;
30046 break;
30047 case MVT::v32i1:
30048 TrunckVT = MVT::i16;
30049 BitcastVT = MVT::v16i1;
30050 break;
30051 case MVT::v64i1:
30052 TrunckVT = MVT::i32;
30053 BitcastVT = MVT::v32i1;
30054 break;
30055 }
30056 bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL;
30057 bool isArg0UndefLeft =
30058 Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND;
30059 bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL;
30060 bool isArg1UndefLeft =
30061 Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND;
30062 SDValue OpLeft;
30063 SDValue OpRight;
30064 if (isArg0UndefRight && isArg1UndefLeft) {
30065 OpLeft = Op0;
30066 OpRight = Op1;
30067 } else if (isArg1UndefRight && isArg0UndefLeft) {
30068 OpLeft = Op1;
30069 OpRight = Op0;
30070 } else
30071 return SDValue();
30072 SDLoc DL(BitCast);
30073 SDValue Shr = OpLeft->getOperand(0);
30074 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr);
30075 SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1);
30076 SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight);
30077 SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2);
30078 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2);
30079 }
30080
30081 if (!VT.isScalarInteger() || !VecVT.isSimple())
30082 return SDValue();
30083
30084 // With AVX512 vxi1 types are legal and we prefer using k-regs.
30085 // MOVMSK is supported in SSE2 or later.
30086 if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
30087 return SDValue();
30088
30089 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
30090 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
30091 // v8i16 and v16i16.
30092 // For these two cases, we can shuffle the upper element bytes to a
30093 // consecutive sequence at the start of the vector and treat the results as
30094 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
30095 // for v16i16 this is not the case, because the shuffle is expensive, so we
30096 // avoid sign-extending to this type entirely.
30097 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
30098 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
30099 MVT SExtVT;
30100 MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
30101 switch (VecVT.getSimpleVT().SimpleTy) {
30102 default:
30103 return SDValue();
30104 case MVT::v2i1:
30105 SExtVT = MVT::v2i64;
30106 FPCastVT = MVT::v2f64;
30107 break;
30108 case MVT::v4i1:
30109 SExtVT = MVT::v4i32;
30110 FPCastVT = MVT::v4f32;
30111 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
30112 // sign-extend to a 256-bit operation to avoid truncation.
30113 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
30114 N0->getOperand(0)->getValueType(0).is256BitVector()) {
30115 SExtVT = MVT::v4i64;
30116 FPCastVT = MVT::v4f64;
30117 }
30118 break;
30119 case MVT::v8i1:
30120 SExtVT = MVT::v8i16;
30121 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
30122 // sign-extend to a 256-bit operation to match the compare.
30123 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
30124 // 256-bit because the shuffle is cheaper than sign extending the result of
30125 // the compare.
30126 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
30127 (N0->getOperand(0)->getValueType(0).is256BitVector() ||
30128 N0->getOperand(0)->getValueType(0).is512BitVector())) {
30129 SExtVT = MVT::v8i32;
30130 FPCastVT = MVT::v8f32;
30131 }
30132 break;
30133 case MVT::v16i1:
30134 SExtVT = MVT::v16i8;
30135 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
30136 // it is not profitable to sign-extend to 256-bit because this will
30137 // require an extra cross-lane shuffle which is more expensive than
30138 // truncating the result of the compare to 128-bits.
30139 break;
30140 case MVT::v32i1:
30141 SExtVT = MVT::v32i8;
30142 break;
30143 };
30144
30145 SDLoc DL(BitCast);
30146 SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
30147
30148 if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) {
30149 // Handle pre-AVX2 cases by splitting to two v16i1's.
30150 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30151 MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32);
30152 SDValue Lo = extract128BitVector(V, 0, DAG, DL);
30153 SDValue Hi = extract128BitVector(V, 16, DAG, DL);
30154 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
30155 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
30156 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
30157 DAG.getConstant(16, DL, ShiftTy));
30158 V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
30159 return DAG.getZExtOrTrunc(V, DL, VT);
30160 }
30161
30162 if (SExtVT == MVT::v8i16) {
30163 assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector");
30164 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
30165 DAG.getUNDEF(MVT::v8i16));
30166 } else
30167 assert(SExtVT.getScalarType() != MVT::i16 &&
30168 "Vectors of i16 must be packed");
30169 if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
30170 V = DAG.getBitcast(FPCastVT, V);
30171 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
30172 return DAG.getZExtOrTrunc(V, DL, VT);
30173}
30174
30175static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
30176 TargetLowering::DAGCombinerInfo &DCI,
30177 const X86Subtarget &Subtarget) {
30178 SDValue N0 = N->getOperand(0);
30179 EVT VT = N->getValueType(0);
30180 EVT SrcVT = N0.getValueType();
30181
30182 // Try to match patterns such as
30183 // (i16 bitcast (v16i1 x))
30184 // ->
30185 // (i16 movmsk (16i8 sext (v16i1 x)))
30186 // before the setcc result is scalarized on subtargets that don't have legal
30187 // vxi1 types.
30188 if (DCI.isBeforeLegalize())
30189 if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
30190 return V;
30191 // Since MMX types are special and don't usually play with other vector types,
30192 // it's better to handle them early to be sure we emit efficient code by
30193 // avoiding store-load conversions.
30194
30195 // Detect bitcasts between i32 to x86mmx low word.
30196 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
30197 SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
30198 SDValue N00 = N0->getOperand(0);
30199 if (N00.getValueType() == MVT::i32)
30200 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
30201 }
30202
30203 // Detect bitcasts between element or subvector extraction to x86mmx.
30204 if (VT == MVT::x86mmx &&
30205 (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
30206 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
30207 isNullConstant(N0.getOperand(1))) {
30208 SDValue N00 = N0->getOperand(0);
30209 if (N00.getValueType().is128BitVector())
30210 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
30211 DAG.getBitcast(MVT::v2i64, N00));
30212 }
30213
30214 // Detect bitcasts from FP_TO_SINT to x86mmx.
30215 if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
30216 N0.getOpcode() == ISD::FP_TO_SINT) {
30217 SDLoc DL(N0);
30218 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
30219 DAG.getUNDEF(MVT::v2i32));
30220 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
30221 DAG.getBitcast(MVT::v2i64, Res));
30222 }
30223
30224 // Convert a bitcasted integer logic operation that has one bitcasted
30225 // floating-point operand into a floating-point logic operation. This may
30226 // create a load of a constant, but that is cheaper than materializing the
30227 // constant in an integer register and transferring it to an SSE register or
30228 // transferring the SSE operand to integer register and back.
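// [editor example] A familiar instance is fabs-style masking:
// (f32 bitcast (and (i32 bitcast X), 0x7fffffff)) becomes
// (FAND X, (f32 bitcast 0x7fffffff)), so X never leaves the SSE domain.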
30229 unsigned FPOpcode;
30230 switch (N0.getOpcode()) {
30231 case ISD::AND: FPOpcode = X86ISD::FAND; break;
30232 case ISD::OR: FPOpcode = X86ISD::FOR; break;
30233 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
30234 default: return SDValue();
30235 }
30236
30237 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
30238 (Subtarget.hasSSE2() && VT == MVT::f64)))
30239 return SDValue();
30240
30241 SDValue LogicOp0 = N0.getOperand(0);
30242 SDValue LogicOp1 = N0.getOperand(1);
30243 SDLoc DL0(N0);
30244
30245 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
30246 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
30247 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
30248 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
30249 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
30250 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
30251 }
30252 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
30253 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
30254 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
30255 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
30256 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
30257 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
30258 }
30259
30260 return SDValue();
30261}
30262
30263// Match a binop + shuffle pyramid that represents a horizontal reduction over
30264// the elements of a vector.
30265// Returns the vector that is being reduced on, or SDValue() if a reduction
30266// was not matched.
30267static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
30268 ArrayRef<ISD::NodeType> CandidateBinOps) {
30269 // The pattern must end in an extract from index 0.
30270 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
30271 !isNullConstant(Extract->getOperand(1)))
30272 return SDValue();
30273
30274 SDValue Op = Extract->getOperand(0);
30275 unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
30276
30277 // Match against one of the candidate binary ops.
30278 if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
30279 return Op.getOpcode() == BinOp;
30280 }))
30281 return SDValue();
30282
30283 // At each stage, we're looking for something that looks like:
30284 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
30285 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
30286 // i32 undef, i32 undef, i32 undef, i32 undef>
30287 // %a = binop <8 x i32> %op, %s
30288 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
30289 // we expect something like:
30290 // <4,5,6,7,u,u,u,u>
30291 // <2,3,u,u,u,u,u,u>
30292 // <1,u,u,u,u,u,u,u>
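// [editor example] In scalar form, for 8 elements and a binary op 'op',
// the three stages compute:
//   a[i] = op(a[i], a[i + 4])   for i in [0, 4)
//   a[i] = op(a[i], a[i + 2])   for i in [0, 2)
//   a[0] = op(a[0], a[1])
// and the extract at index 0 reads the fully reduced value.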
30293 unsigned CandidateBinOp = Op.getOpcode();
30294 for (unsigned i = 0; i < Stages; ++i) {
30295 if (Op.getOpcode() != CandidateBinOp)
30296 return SDValue();
30297
30298 ShuffleVectorSDNode *Shuffle =
30299 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
30300 if (Shuffle) {
30301 Op = Op.getOperand(1);
30302 } else {
30303 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
30304 Op = Op.getOperand(0);
30305 }
30306
30307 // The first operand of the shuffle should be the same as the other operand
30308 // of the binop.
30309 if (!Shuffle || Shuffle->getOperand(0) != Op)
30310 return SDValue();
30311
30312 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
30313 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
30314 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
30315 return SDValue();
30316 }
30317
30318 BinOp = CandidateBinOp;
30319 return Op;
30320}
30321
30322// Given a select, detect the following pattern:
30323// 1: %2 = zext <N x i8> %0 to <N x i32>
30324// 2: %3 = zext <N x i8> %1 to <N x i32>
30325// 3: %4 = sub nsw <N x i32> %2, %3
30326// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
30327// 5: %6 = sub nsw <N x i32> zeroinitializer, %4
30328// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
30329// This is useful as it is the input into a SAD pattern.
30330static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
30331 SDValue &Op1) {
30332 // Check the condition of the select instruction is greater-than.
30333 SDValue SetCC = Select->getOperand(0);
30334 if (SetCC.getOpcode() != ISD::SETCC)
30335 return false;
30336 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
30337 if (CC != ISD::SETGT && CC != ISD::SETLT)
30338 return false;
30339
30340 SDValue SelectOp1 = Select->getOperand(1);
30341 SDValue SelectOp2 = Select->getOperand(2);
30342
30343 // The following instructions assume SelectOp1 is the subtraction operand
30344 // and SelectOp2 is the negation operand.
30345 // In the case of SETLT this is the other way around.
30346 if (CC == ISD::SETLT)
30347 std::swap(SelectOp1, SelectOp2);
30348
30349 // The second operand of the select should be the negation of the first
30350 // operand, which is implemented as 0 - SelectOp1.
30351 if (!(SelectOp2.getOpcode() == ISD::SUB &&
30352 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
30353 SelectOp2.getOperand(1) == SelectOp1))
30354 return false;
30355
30356 // The first operand of SetCC is the first operand of the select, which is the
30357 // difference between the two input vectors.
30358 if (SetCC.getOperand(0) != SelectOp1)
30359 return false;
30360
30361 // In the SETLT case, the second operand of the comparison can be either 1 or 0.
30362 APInt SplatVal;
30363 if ((CC == ISD::SETLT) &&
30364 !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
30365 SplatVal.isOneValue()) ||
30366 (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
30367 return false;
30368
30369 // In the SETGT case, the second operand of the comparison can be either -1 or 0.
30370 if ((CC == ISD::SETGT) &&
30371 !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
30372 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
30373 return false;
30374
30375 // The first operand of the select is the difference between the two input
30376 // vectors.
30377 if (SelectOp1.getOpcode() != ISD::SUB)
30378 return false;
30379
30380 Op0 = SelectOp1.getOperand(0);
30381 Op1 = SelectOp1.getOperand(1);
30382
30383 // Check if the operands of the sub are zero-extended from vectors of i8.
30384 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
30385 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
30386 Op1.getOpcode() != ISD::ZERO_EXTEND ||
30387 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
30388 return false;
30389
30390 return true;
30391}
30392
30393// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
30394// to these zexts.
30395static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
30396 const SDValue &Zext1, const SDLoc &DL) {
30397
30398 // Find the appropriate width for the PSADBW.
30399 EVT InVT = Zext0.getOperand(0).getValueType();
30400 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
30401
30402 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
30403 // fill in the missing vector elements with 0.
30404 unsigned NumConcat = RegSize / InVT.getSizeInBits();
30405 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
30406 Ops[0] = Zext0.getOperand(0);
30407 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
30408 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30409 Ops[0] = Zext1.getOperand(0);
30410 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30411
30412 // Actually build the SAD
30413 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
30414 return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
30415}
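// [editor example] For <8 x i8> inputs: InVT is 64 bits wide, so RegSize
// stays at 128, NumConcat = 2, each input is widened to v16i8 by
// concatenating one zero vector, and the PSADBW result type is v2i64
// (one 64-bit sum of absolute differences per 8-byte group).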
30416
30417// Attempt to replace a min/max v8i16 horizontal reduction with PHMINPOSUW.
30418static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
30419 const X86Subtarget &Subtarget) {
30420 // Bail without SSE41.
30421 if (!Subtarget.hasSSE41())
30422 return SDValue();
30423
30424 EVT ExtractVT = Extract->getValueType(0);
30425 if (ExtractVT != MVT::i16)
30426 return SDValue();
30427
30428 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
30429 unsigned BinOp;
30430 SDValue Src = matchBinOpReduction(
30431 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
30432 if (!Src)
30433 return SDValue();
30434
30435 EVT SrcVT = Src.getValueType();
30436 EVT SrcSVT = SrcVT.getScalarType();
30437 if (SrcSVT != MVT::i16 || (SrcVT.getSizeInBits() % 128) != 0)
30438 return SDValue();
30439
30440 SDLoc DL(Extract);
30441 SDValue MinPos = Src;
30442
30443 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
30444 while (SrcVT.getSizeInBits() > 128) {
30445 unsigned NumElts = SrcVT.getVectorNumElements();
30446 unsigned NumSubElts = NumElts / 2;
30447 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
30448 unsigned SubSizeInBits = SrcVT.getSizeInBits();
30449 SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
30450 SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
30451 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
30452 }
30453 assert(SrcVT == MVT::v8i16 && "Unexpected value type");
30454
30455 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
30456 // to flip the value accordingly.
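// [editor note] The flips work because, over i16: x ^ 0x8000 maps signed
// order onto unsigned order (so SMIN becomes UMIN), x ^ 0x7FFF reverses
// signed order within the unsigned domain (SMAX becomes UMIN), and
// x ^ 0xFFFF reverses unsigned order (UMAX becomes UMIN); applying the same
// XOR again afterwards recovers the original value.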
30457 SDValue Mask;
30458 if (BinOp == ISD::SMAX)
30459 Mask = DAG.getConstant(APInt::getSignedMaxValue(16), DL, SrcVT);
30460 else if (BinOp == ISD::SMIN)
30461 Mask = DAG.getConstant(APInt::getSignedMinValue(16), DL, SrcVT);
30462 else if (BinOp == ISD::UMAX)
30463 Mask = DAG.getConstant(APInt::getAllOnesValue(16), DL, SrcVT);
30464
30465 if (Mask)
30466 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
30467
30468 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, SrcVT, MinPos);
30469
30470 if (Mask)
30471 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
30472
30473 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
30474 DAG.getIntPtrConstant(0, DL));
30475}
30476
30477// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
30478static SDValue combineHorizontalPredicateResult(SDNode *Extract,
30479 SelectionDAG &DAG,
30480 const X86Subtarget &Subtarget) {
30481 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
30482 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
30483 return SDValue();
30484
30485 EVT ExtractVT = Extract->getValueType(0);
30486 unsigned BitWidth = ExtractVT.getSizeInBits();
30487 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
30488 ExtractVT != MVT::i8)
30489 return SDValue();
30490
30491 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
30492 unsigned BinOp = 0;
30493 SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
30494 if (!Match)
30495 return SDValue();
30496
30497 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
30498 // which we can't support here for now.
30499 if (Match.getScalarValueSizeInBits() != BitWidth)
30500 return SDValue();
30501
30502 // We require AVX2 for PMOVMSKB for v16i16/v32i8.
30503 unsigned MatchSizeInBits = Match.getValueSizeInBits();
30504 if (!(MatchSizeInBits == 128 ||
30505 (MatchSizeInBits == 256 &&
30506 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
30507 return SDValue();
30508
30509 // Don't bother performing this for 2-element vectors.
30510 if (Match.getValueType().getVectorNumElements() <= 2)
30511 return SDValue();
30512
30513 // Check that we are extracting a reduction of all sign bits.
30514 if (DAG.ComputeNumSignBits(Match) != BitWidth)
30515 return SDValue();
30516
30517 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
30518 MVT MaskVT;
30519 if (64 == BitWidth || 32 == BitWidth)
30520 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
30521 MatchSizeInBits / BitWidth);
30522 else
30523 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
30524
30525 APInt CompareBits;
30526 ISD::CondCode CondCode;
30527 if (BinOp == ISD::OR) {
30528 // any_of -> MOVMSK != 0
30529 CompareBits = APInt::getNullValue(32);
30530 CondCode = ISD::CondCode::SETNE;
30531 } else {
30532 // all_of -> MOVMSK == ((1 << NumElts) - 1)
30533 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
30534 CondCode = ISD::CondCode::SETEQ;
30535 }
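// [editor example] For a 128-bit match of 32-bit elements, MaskVT is v4f32:
// all_of compares the MOVMSKPS result against 0xF and any_of tests it for
// being non-zero; for a v16i8 mask the all_of constant is 0xFFFF.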
30536
30537 // Perform the select as i32/i64 and then truncate to avoid partial register
30538 // stalls.
30539 unsigned ResWidth = std::max(BitWidth, 32u);
30540 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
30541 SDLoc DL(Extract);
30542 SDValue Zero = DAG.getConstant(0, DL, ResVT);
30543 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
30544 SDValue Res = DAG.getBitcast(MaskVT, Match);
30545 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
30546 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
30547 Ones, Zero, CondCode);
30548 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
30549}
30550
30551static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
30552 const X86Subtarget &Subtarget) {
30553 // PSADBW is only supported on SSE2 and up.
30554 if (!Subtarget.hasSSE2())
30555 return SDValue();
30556
30557 // Verify the type we're extracting from is a vector with elements wider than i16.
30558 EVT VT = Extract->getOperand(0).getValueType();
30559 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
30560 return SDValue();
30561
30562 unsigned RegSize = 128;
30563 if (Subtarget.hasBWI())
30564 RegSize = 512;
30565 else if (Subtarget.hasAVX2())
30566 RegSize = 256;
30567
30568 // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
30569 // TODO: We should be able to handle larger vectors by splitting them before
30570 // feeding them into several SADs, and then reducing over those.
30571 if (RegSize / VT.getVectorNumElements() < 8)
30572 return SDValue();
30573
30574 // Match shuffle + add pyramid.
30575 unsigned BinOp = 0;
30576 SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});
30577
30578 // The operand is expected to be zero extended from i8
30579 // (verified in detectZextAbsDiff).
30580 // In order to convert to i64 and above, additional any/zero/sign
30581 // extend is expected.
30582 // The zero extend from 32 bit has no mathematical effect on the result.
30583 // Also, the sign extend is effectively a zero extend here
30584 // (it extends the sign bit, which is zero).
30585 // So it is correct to skip the sign/zero extend instruction.
30586 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
30587 Root.getOpcode() == ISD::ZERO_EXTEND ||
30588 Root.getOpcode() == ISD::ANY_EXTEND))
30589 Root = Root.getOperand(0);
30590
30591 // If there was a match, we want Root to be a select that is the root of an
30592 // abs-diff pattern.
30593 if (!Root || (Root.getOpcode() != ISD::VSELECT))
30594 return SDValue();
30595
30596 // Check whether we have an abs-diff pattern feeding into the select.
30597 SDValue Zext0, Zext1;
30598 if (!detectZextAbsDiff(Root, Zext0, Zext1))
30599 return SDValue();
30600
30601 // Create the SAD instruction.
30602 SDLoc DL(Extract);
30603 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
30604
30605 // If the original vector was wider than 8 elements, sum over the results
30606 // in the SAD vector.
30607 unsigned Stages = Log2_32(VT.getVectorNumElements());
30608 MVT SadVT = SAD.getSimpleValueType();
30609 if (Stages > 3) {
30610 unsigned SadElems = SadVT.getVectorNumElements();
30611
30612 for(unsigned i = Stages - 3; i > 0; --i) {
30613 SmallVector<int, 16> Mask(SadElems, -1);
30614 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
30615 Mask[j] = MaskEnd + j;
30616
30617 SDValue Shuffle =
30618 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
30619 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
30620 }
30621 }
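// [editor example] For a 16-element reduction the SAD value is v2i64 and
// Stages is 4, so exactly one extra round runs here with Mask = {1, -1}:
// lane 1 is shuffled down and added into lane 0, which then holds the
// complete sum before the final extract.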
30622
30623 MVT Type = Extract->getSimpleValueType(0);
30624 unsigned TypeSizeInBits = Type.getSizeInBits();
30625 // Return the lowest TypeSizeInBits bits.
30626 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
30627 SAD = DAG.getBitcast(ResVT, SAD);
30628 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
30629 Extract->getOperand(1));
30630}
30631
30632// Attempt to peek through a target shuffle and extract the scalar from the
30633// source.
30634static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
30635 TargetLowering::DAGCombinerInfo &DCI,
30636 const X86Subtarget &Subtarget) {
30637 if (DCI.isBeforeLegalizeOps())
30638 return SDValue();
30639
30640 SDValue Src = N->getOperand(0);
30641 SDValue Idx = N->getOperand(1);
30642
30643 EVT VT = N->getValueType(0);
30644 EVT SrcVT = Src.getValueType();
30645 EVT SrcSVT = SrcVT.getVectorElementType();
30646 unsigned NumSrcElts = SrcVT.getVectorNumElements();
30647
30648 // Don't attempt this for boolean mask vectors or unknown extraction indices.
30649 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
30650 return SDValue();
30651
30652 // Resolve the target shuffle inputs and mask.
30653 SmallVector<int, 16> Mask;
30654 SmallVector<SDValue, 2> Ops;
30655 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
30656 return SDValue();
30657
30658 // Attempt to narrow/widen the shuffle mask to the correct size.
30659 if (Mask.size() != NumSrcElts) {
30660 if ((NumSrcElts % Mask.size()) == 0) {
30661 SmallVector<int, 16> ScaledMask;
30662 int Scale = NumSrcElts / Mask.size();
30663 scaleShuffleMask<int>(Scale, Mask, ScaledMask);
30664 Mask = std::move(ScaledMask);
30665 } else if ((Mask.size() % NumSrcElts) == 0) {
30666 SmallVector<int, 16> WidenedMask;
30667 while (Mask.size() > NumSrcElts &&
30668 canWidenShuffleElements(Mask, WidenedMask))
30669 Mask = std::move(WidenedMask);
30670 // TODO - investigate support for wider shuffle masks with known upper
30671 // undef/zero elements for implicit zero-extension.
30672 }
30673 }
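// [editor example] If the resolved mask has 4 entries but the source has 8
// elements, Scale = 2 and each entry is split in two: {0, 1, 4, 5} becomes
// {0, 1, 2, 3, 8, 9, 10, 11}; in the opposite direction adjacent pairs are
// merged while they remain compatible.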
30674
30675 // Check if narrowing/widening failed.
30676 if (Mask.size() != NumSrcElts)
30677 return SDValue();
30678
30679 int SrcIdx = Mask[N->getConstantOperandVal(1)];
30680 SDLoc dl(N);
30681
30682 // If the shuffle source element is undef/zero then we can just accept it.
30683 if (SrcIdx == SM_SentinelUndef)
30684 return DAG.getUNDEF(VT);
30685
30686 if (SrcIdx == SM_SentinelZero)
30687 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
30688 : DAG.getConstant(0, dl, VT);
30689
30690 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
30691 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
30692 SrcIdx = SrcIdx % Mask.size();
30693
30694 // We can only extract other elements from 128-bit vectors and in certain
30695 // circumstances, depending on SSE-level.
30696 // TODO: Investigate using extract_subvector for larger vectors.
30697 // TODO: Investigate float/double extraction if it will be just stored.
30698 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
30699 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
30700 assert(SrcSVT == VT && "Unexpected extraction type");
30701 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
30702 DAG.getIntPtrConstant(SrcIdx, dl));
30703 }
30704
30705 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
30706 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
30707 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
30708 "Unexpected extraction type");
30709 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
30710 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
30711 DAG.getIntPtrConstant(SrcIdx, dl));
30712 return DAG.getZExtOrTrunc(ExtOp, dl, VT);
30713 }
30714
30715 return SDValue();
30716}
30717
30718/// Detect vector gather/scatter index generation and convert it from being a
30719/// bunch of shuffles and extracts into a somewhat faster sequence.
30720/// For i686, the best sequence is apparently storing the value and loading
30721/// scalars back, while for x64 we should use 64-bit extracts and shifts.
30722static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
30723 TargetLowering::DAGCombinerInfo &DCI,
30724 const X86Subtarget &Subtarget) {
30725 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
30726 return NewOp;
30727
30728 // TODO - Remove this once we can handle the implicit zero-extension of
30729 // X86ISD::PEXTRW/X86ISD::PEXTRB in:
30730 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
30731 // combineBasicSADPattern.
30732 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
30733 return SDValue();
30734
30735 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
30736 return NewOp;
30737
30738 SDValue InputVector = N->getOperand(0);
30739 SDValue EltIdx = N->getOperand(1);
30740
30741 EVT SrcVT = InputVector.getValueType();
30742 EVT VT = N->getValueType(0);
30743 SDLoc dl(InputVector);
30744
30745 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
30746 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
30747 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
30748 SDValue MMXSrc = InputVector.getOperand(0);
30749
30750 // The bitcast source is a direct mmx result.
30751 if (MMXSrc.getValueType() == MVT::x86mmx)
30752 return DAG.getBitcast(VT, InputVector);
30753 }
30754
30755 // Detect mmx to i32 conversion through a v2i32 elt extract.
30756 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
30757 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
30758 SDValue MMXSrc = InputVector.getOperand(0);
30759
30760 // The bitcast source is a direct mmx result.
30761 if (MMXSrc.getValueType() == MVT::x86mmx)
30762 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
30763 }
30764
30765 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
30766 isa<ConstantSDNode>(EltIdx) &&
30767 isa<ConstantSDNode>(InputVector.getOperand(0))) {
30768 uint64_t ExtractedElt = N->getConstantOperandVal(1);
30769 uint64_t InputValue = InputVector.getConstantOperandVal(0);
30770 uint64_t Res = (InputValue >> ExtractedElt) & 1;
30771 return DAG.getConstant(Res, dl, MVT::i1);
30772 }
30773
30774 // Check whether this extract is the root of a sum of absolute differences
30775 // pattern. This has to be done here because we really want it to happen
30776 // pre-legalization.
30777 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
30778 return SAD;
30779
30780 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
30781 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
30782 return Cmp;
30783
30784 // Attempt to replace min/max v8i16 reductions with PHMINPOSUW.
30785 if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
30786 return MinMax;
30787
30788 // Only operate on vectors of 4 elements, where the alternative shuffling
30789 // gets to be more expensive.
30790 if (SrcVT != MVT::v4i32)
30791 return SDValue();
30792
30793 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
30794 // single use which is a sign-extend or zero-extend, and all elements are
30795 // used.
30796 SmallVector<SDNode *, 4> Uses;
30797 unsigned ExtractedElements = 0;
30798 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
30799 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
30800 if (UI.getUse().getResNo() != InputVector.getResNo())
30801 return SDValue();
30802
30803 SDNode *Extract = *UI;
30804 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
30805 return SDValue();
30806
30807 if (Extract->getValueType(0) != MVT::i32)
30808 return SDValue();
30809 if (!Extract->hasOneUse())
30810 return SDValue();
30811 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
30812 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
30813 return SDValue();
30814 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
30815 return SDValue();
30816
30817 // Record which element was extracted.
30818 ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
30819 Uses.push_back(Extract);
30820 }
30821
30822 // If not all the elements were used, this may not be worthwhile.
30823 if (ExtractedElements != 15)
30824 return SDValue();
30825
30826 // Ok, we've now decided to do the transformation.
30827 // If 64-bit shifts are legal, use the extract-shift sequence,
30828 // otherwise bounce the vector off the cache.
30829 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30830 SDValue Vals[4];
30831
30832 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
30833 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
30834 auto &DL = DAG.getDataLayout();
30835 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
30836 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
30837 DAG.getConstant(0, dl, VecIdxTy));
30838 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
30839 DAG.getConstant(1, dl, VecIdxTy));
30840
30841 SDValue ShAmt = DAG.getConstant(
30842 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
30843 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
30844 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
30845 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
30846 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
30847 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
30848 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
30849 } else {
30850 // Store the value to a temporary stack slot.
30851 SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
30852 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
30853 MachinePointerInfo());
30854
30855 EVT ElementType = SrcVT.getVectorElementType();
30856 unsigned EltSize = ElementType.getSizeInBits() / 8;
30857
30858 // Replace each use (extract) with a load of the appropriate element.
30859 for (unsigned i = 0; i < 4; ++i) {
30860 uint64_t Offset = EltSize * i;
30861 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
30862 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
30863
30864 SDValue ScalarAddr =
30865 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
30866
30867 // Load the scalar.
30868 Vals[i] =
30869 DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
30870 }
30871 }
30872
30873 // Replace the extracts
30874 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
30875 UE = Uses.end(); UI != UE; ++UI) {
30876 SDNode *Extract = *UI;
30877
30878 uint64_t IdxVal = Extract->getConstantOperandVal(1);
30879 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
30880 }
30881
30882 // The replacement was made in place; don't return anything.
30883 return SDValue();
30884}
30885
30886/// If a vector select has an operand that is -1 or 0, try to simplify the
30887/// select to a bitwise logic operation.
30888/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
30889static SDValue
30890combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
30891 TargetLowering::DAGCombinerInfo &DCI,
30892 const X86Subtarget &Subtarget) {
30893 SDValue Cond = N->getOperand(0);
30894 SDValue LHS = N->getOperand(1);
30895 SDValue RHS = N->getOperand(2);
30896 EVT VT = LHS.getValueType();
30897 EVT CondVT = Cond.getValueType();
30898 SDLoc DL(N);
30899 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30900
30901 if (N->getOpcode() != ISD::VSELECT)
30902 return SDValue();
30903
30904 assert(CondVT.isVector() && "Vector select expects a vector selector!");
30905
30906 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
30907 // Check if the first operand is all zeros and Cond type is vXi1.
30908 // This situation only applies to avx512.
30909 if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
30910 CondVT.getVectorElementType() == MVT::i1) {
30911 // Invert the cond to not(cond) : xor(op,allones)=not(op)
30912 SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
30913 DAG.getAllOnesConstant(DL, CondVT));
30914 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
30915 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
30916 }
30917
30918 // To use the condition operand as a bitwise mask, it must have elements that
30919 // are the same size as the select elements. I.e., the condition operand must
30920 // have already been promoted from the IR select condition type <N x i1>.
30921 // Don't check if the types themselves are equal because that excludes
30922 // vector floating-point selects.
30923 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
30924 return SDValue();
30925
30926 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
30927 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
30928
30929 // Try to invert the condition if true value is not all 1s and false value is
30930 // not all 0s.
30931 if (!TValIsAllOnes && !FValIsAllZeros &&
30932 // Check if the selector will be produced by CMPP*/PCMP*.
30933 Cond.getOpcode() == ISD::SETCC &&
30934 // Check if SETCC has already been promoted.
30935 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
30936 CondVT) {
30937 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
30938
30939 if (TValIsAllZeros || FValIsAllOnes) {
30940 SDValue CC = Cond.getOperand(2);
30941 ISD::CondCode NewCC =
30942 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
30943 Cond.getOperand(0).getValueType().isInteger());
30944 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
30945 NewCC);
30946 std::swap(LHS, RHS);
30947 TValIsAllOnes = FValIsAllOnes;
30948 FValIsAllZeros = TValIsAllZeros;
30949 }
30950 }
30951
30952 // Cond value must be 'sign splat' to be converted to a logical op.
30953 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
30954 return SDValue();
30955
30956 // vselect Cond, 111..., 000... -> Cond
30957 if (TValIsAllOnes && FValIsAllZeros)
30958 return DAG.getBitcast(VT, Cond);
30959
30960 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
30961 return SDValue();
30962
30963 // vselect Cond, 111..., X -> or Cond, X
30964 if (TValIsAllOnes) {
30965 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
30966 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
30967 return DAG.getBitcast(VT, Or);
30968 }
30969
30970 // vselect Cond, X, 000... -> and Cond, X
30971 if (FValIsAllZeros) {
30972 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
30973 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
30974 return DAG.getBitcast(VT, And);
30975 }
30976
30977 // vselect Cond, 000..., X -> andn Cond, X
30978 if (TValIsAllZeros) {
30979 MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
30980 SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
30981 SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
30982 SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
30983 return DAG.getBitcast(VT, AndN);
30984 }
30985
30986 return SDValue();
30987}
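
For reference, a minimal standalone sketch (not part of X86ISelLowering.cpp) that checks the per-lane bitwise identities the combine above relies on, assuming each condition lane is a sign-splat mask (either all ones or all zeros):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Lanes[] = {0x00000000u, 0xFFFFFFFFu}; // legal mask lanes only
  const uint32_t X = 0x12345678u;
  for (uint32_t C : Lanes) {
    uint32_t Sel = C ? 0xFFFFFFFFu : X;   // vselect C, 111..., X
    assert(Sel == (C | X));               //   --> or C, X
    Sel = C ? X : 0x00000000u;            // vselect C, X, 000...
    assert(Sel == (C & X));               //   --> and C, X
    Sel = C ? 0x00000000u : X;            // vselect C, 000..., X
    assert(Sel == (~C & X));              //   --> andn C, X
  }
  return 0;
}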
30988
30989static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
30990 SDValue Cond = N->getOperand(0);
30991 SDValue LHS = N->getOperand(1);
30992 SDValue RHS = N->getOperand(2);
30993 SDLoc DL(N);
30994
30995 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
30996 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
30997 if (!TrueC || !FalseC)
30998 return SDValue();
30999
31000 // Don't do this for crazy integer types.
31001 EVT VT = N->getValueType(0);
31002 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
31003 return SDValue();
31004
31005 // We're going to use the condition bit in math or logic ops. We could allow
31006 // this with a wider condition value (post-legalization it becomes an i8),
31007 // but if nothing is creating selects that late, it doesn't matter.
31008 if (Cond.getValueType() != MVT::i1)
31009 return SDValue();
31010
31011 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
31012 // 3, 5, or 9 with i32/i64, so those get transformed too.
31013 // TODO: For constants that overflow or do not differ by power-of-2 or small
31014 // multiplier, convert to 'and' + 'add'.
31015 const APInt &TrueVal = TrueC->getAPIntValue();
31016 const APInt &FalseVal = FalseC->getAPIntValue();
31017 bool OV;
31018 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
31019 if (OV)
31020 return SDValue();
31021
31022 APInt AbsDiff = Diff.abs();
31023 if (AbsDiff.isPowerOf2() ||
31024 ((VT == MVT::i32 || VT == MVT::i64) &&
31025 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
31026
31027 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
31028 // of the condition can usually be folded into a compare predicate, but even
31029 // without that, the sequence should be cheaper than a CMOV alternative.
31030 if (TrueVal.slt(FalseVal)) {
31031 Cond = DAG.getNOT(DL, Cond, MVT::i1);
31032 std::swap(TrueC, FalseC);
31033 }
31034
31035 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
31036 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
31037
31038 // Multiply condition by the difference if non-one.
31039 if (!AbsDiff.isOneValue())
31040 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
31041
31042 // Add the base if non-zero.
31043 if (!FalseC->isNullValue())
31044 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
31045
31046 return R;
31047 }
31048
31049 return SDValue();
31050}
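
For reference, a minimal standalone sketch (not part of X86ISelLowering.cpp) of the scalar identity used above; selectViaMath is a hypothetical helper, and signed overflow in TC - FC is assumed not to occur, mirroring the ssub_ov bail-out:

#include <cassert>
#include <cstdint>
#include <utility>

static int64_t selectViaMath(bool Cond, int64_t TC, int64_t FC) {
  if (TC < FC) {                 // mirror the combine: invert Cond, swap arms
    Cond = !Cond;
    std::swap(TC, FC);
  }
  return int64_t(Cond) * (TC - FC) + FC;  // zext(Cond) * Diff + Base
}

int main() {
  assert(selectViaMath(true, 7, 2) == 7);   // select 1, 7, 2
  assert(selectViaMath(false, 7, 2) == 2);  // select 0, 7, 2
  assert(selectViaMath(true, 2, 7) == 2);   // swapped-arm case
  assert(selectViaMath(false, 2, 7) == 7);
  return 0;
}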
31051
31052// If this is a bitcasted op that can be represented as another type, push
31053// the bitcast to the inputs. This allows more opportunities for pattern
31054// matching masked instructions. This is called when we know that the operation
31055// is used as one of the inputs of a vselect.
31056static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
31057 TargetLowering::DAGCombinerInfo &DCI) {
31058 // Make sure we have a bitcast.
31059 if (OrigOp.getOpcode() != ISD::BITCAST)
31060 return false;
31061
31062 SDValue Op = OrigOp.getOperand(0);
31063
31064 // If the operation is used by anything other than the bitcast, we shouldn't
31065 // do this combine as that would replicate the operation.
31066 if (!Op.hasOneUse())
31067 return false;
31068
31069 MVT VT = OrigOp.getSimpleValueType();
31070 MVT EltVT = VT.getVectorElementType();
31071 SDLoc DL(Op.getNode());
31072
31073 auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
31074 SDValue Op2) {
31075 Op0 = DAG.getBitcast(VT, Op0);
31076 DCI.AddToWorklist(Op0.getNode());
31077 Op1 = DAG.getBitcast(VT, Op1);
31078 DCI.AddToWorklist(Op1.getNode());
31079 DCI.CombineTo(OrigOp.getNode(),
31080 DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
31081 return true;
31082 };
31083
31084 unsigned Opcode = Op.getOpcode();
31085 switch (Opcode) {
31086 case X86ISD::SHUF128: {
31087 if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
31088 return false;
31089 // Only change element size, not type.
31090 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
31091 return false;
31092 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
31093 Op.getOperand(2));
31094 }
31095 case X86ISD::SUBV_BROADCAST: {
31096 unsigned EltSize = EltVT.getSizeInBits();
31097 if (EltSize != 32 && EltSize != 64)
31098 return false;
31099 // Only change element size, not type.
31100 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
31101 return false;
31102 SDValue Op0 = Op.getOperand(0);
31103 MVT Op0VT = MVT::getVectorVT(EltVT,
31104 Op0.getSimpleValueType().getSizeInBits() / EltSize);
31105 Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
31106 DCI.AddToWorklist(Op0.getNode());
31107 DCI.CombineTo(OrigOp.getNode(),
31108 DAG.getNode(Opcode, DL, VT, Op0));
31109 return true;
31110 }
31111 }
31112
31113 return false;
31114}
31115
31116/// Do target-specific dag combines on SELECT and VSELECT nodes.
31117static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
31118 TargetLowering::DAGCombinerInfo &DCI,
31119 const X86Subtarget &Subtarget) {
31120 SDLoc DL(N);
31121 SDValue Cond = N->getOperand(0);
31122 // Get the LHS/RHS of the select.
31123 SDValue LHS = N->getOperand(1);
31124 SDValue RHS = N->getOperand(2);
31125 EVT VT = LHS.getValueType();
31126 EVT CondVT = Cond.getValueType();
31127 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31128
31129 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
31130 // instructions match the semantics of the common C idiom x<y?x:y but not
31131 // x<=y?x:y, because of how they handle negative zero (which can be
31132 // ignored in unsafe-math mode).
31133 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
31134 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
31135 VT != MVT::f80 && VT != MVT::f128 &&
31136 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
31137 (Subtarget.hasSSE2() ||
31138 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
31139 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31140
31141 unsigned Opcode = 0;
31142 // Check for x CC y ? x : y.
31143 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
31144 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
31145 switch (CC) {
31146 default: break;
31147 case ISD::SETULT:
31148 // Converting this to a min would handle NaNs incorrectly, and swapping
31149 // the operands would cause it to handle comparisons between positive
31150 // and negative zero incorrectly.
31151 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
31152 if (!DAG.getTarget().Options.UnsafeFPMath &&
31153 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
31154 break;
31155 std::swap(LHS, RHS);
31156 }
31157 Opcode = X86ISD::FMIN;
31158 break;
31159 case ISD::SETOLE:
31160 // Converting this to a min would handle comparisons between positive
31161 // and negative zero incorrectly.
31162 if (!DAG.getTarget().Options.UnsafeFPMath &&
31163 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
31164 break;
31165 Opcode = X86ISD::FMIN;
31166 break;
31167 case ISD::SETULE:
31168 // Converting this to a min would handle both negative zeros and NaNs
31169 // incorrectly, but we can swap the operands to fix both.
31170 std::swap(LHS, RHS);
31171 LLVM_FALLTHROUGH;
31172 case ISD::SETOLT:
31173 case ISD::SETLT:
31174 case ISD::SETLE:
31175 Opcode = X86ISD::FMIN;
31176 break;
31177
31178 case ISD::SETOGE:
31179 // Converting this to a max would handle comparisons between positive
31180 // and negative zero incorrectly.
31181 if (!DAG.getTarget().Options.UnsafeFPMath &&
31182 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
31183 break;
31184 Opcode = X86ISD::FMAX;
31185 break;
31186 case ISD::SETUGT:
31187 // Converting this to a max would handle NaNs incorrectly, and swapping
31188 // the operands would cause it to handle comparisons between positive
31189 // and negative zero incorrectly.
31190 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
31191 if (!DAG.getTarget().Options.UnsafeFPMath &&
31192 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
31193 break;
31194 std::swap(LHS, RHS);
31195 }
31196 Opcode = X86ISD::FMAX;
31197 break;
31198 case ISD::SETUGE:
31199 // Converting this to a max would handle both negative zeros and NaNs
31200 // incorrectly, but we can swap the operands to fix both.
31201 std::swap(LHS, RHS);
31202 LLVM_FALLTHROUGH;
31203 case ISD::SETOGT:
31204 case ISD::SETGT:
31205 case ISD::SETGE:
31206 Opcode = X86ISD::FMAX;
31207 break;
31208 }
31209 // Check for x CC y ? y : x -- a min/max with reversed arms.
31210 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
31211 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
31212 switch (CC) {
31213 default: break;
31214 case ISD::SETOGE:
31215 // Converting this to a min would handle comparisons between positive
31216 // and negative zero incorrectly, and swapping the operands would
31217 // cause it to handle NaNs incorrectly.
31218 if (!DAG.getTarget().Options.UnsafeFPMath &&
31219 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
31220 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
31221 break;
31222 std::swap(LHS, RHS);
31223 }
31224 Opcode = X86ISD::FMIN;
31225 break;
31226 case ISD::SETUGT:
31227 // Converting this to a min would handle NaNs incorrectly.
31228 if (!DAG.getTarget().Options.UnsafeFPMath &&
31229 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
31230 break;
31231 Opcode = X86ISD::FMIN;
31232 break;
31233 case ISD::SETUGE:
31234 // Converting this to a min would handle both negative zeros and NaNs
31235 // incorrectly, but we can swap the operands to fix both.
31236 std::swap(LHS, RHS);
31237 LLVM_FALLTHROUGH;
31238 case ISD::SETOGT:
31239 case ISD::SETGT:
31240 case ISD::SETGE:
31241 Opcode = X86ISD::FMIN;
31242 break;
31243
31244 case ISD::SETULT:
31245 // Converting this to a max would handle NaNs incorrectly.
31246 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
31247 break;
31248 Opcode = X86ISD::FMAX;
31249 break;
31250 case ISD::SETOLE:
31251 // Converting this to a max would handle comparisons between positive
31252 // and negative zero incorrectly, and swapping the operands would
31253 // cause it to handle NaNs incorrectly.
31254 if (!DAG.getTarget().Options.UnsafeFPMath &&
31255 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
31256 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
31257 break;
31258 std::swap(LHS, RHS);
31259 }
31260 Opcode = X86ISD::FMAX;
31261 break;
31262 case ISD::SETULE:
31263 // Converting this to a max would handle both negative zeros and NaNs
31264 // incorrectly, but we can swap the operands to fix both.
31265 std::swap(LHS, RHS);
31266 LLVM_FALLTHROUGH;
31267 case ISD::SETOLT:
31268 case ISD::SETLT:
31269 case ISD::SETLE:
31270 Opcode = X86ISD::FMAX;
31271 break;
31272 }
31273 }
31274
31275 if (Opcode)
31276 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
31277 }
31278
31279 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
31280 // lowering on KNL. In this case we convert it to
31281 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
31282 // The same applies to all 128 and 256-bit vectors of i8 and i16.
31283 // Starting with SKX, these selects have a proper lowering.
31284 if (Subtarget.hasAVX512() && CondVT.isVector() &&
31285 CondVT.getVectorElementType() == MVT::i1 &&
31286 (VT.is128BitVector() || VT.is256BitVector()) &&
31287 (VT.getVectorElementType() == MVT::i8 ||
31288 VT.getVectorElementType() == MVT::i16) &&
31289 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
31290 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
31291 DCI.AddToWorklist(Cond.getNode());
31292 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
31293 }
31294
31295 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
31296 return V;
31297
31298 // Canonicalize max and min:
31299 // (x > y) ? x : y -> (x >= y) ? x : y
31300 // (x < y) ? x : y -> (x <= y) ? x : y
31301 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
31302 // the need for an extra compare
31303 // against zero. e.g.
31304 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
31305 // subl %esi, %edi
31306 // testl %edi, %edi
31307 // movl $0, %eax
31308 // cmovgl %edi, %eax
31309 // =>
31310 // xorl %eax, %eax
31311 // subl %esi, %edi
31312 // cmovsl %eax, %edi
31313 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
31314 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
31315 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
31316 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31317 switch (CC) {
31318 default: break;
31319 case ISD::SETLT:
31320 case ISD::SETGT: {
31321 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
31322 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
31323 Cond.getOperand(0), Cond.getOperand(1), NewCC);
31324 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
31325 }
31326 }
31327 }
31328
31329 // Early exit check
31330 if (!TLI.isTypeLegal(VT))
31331 return SDValue();
31332
31333 // Match VSELECTs into subs with unsigned saturation.
31334 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
31335 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
31336 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
31337 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
31338 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31339
31340 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
31341 // left side invert the predicate to simplify logic below.
31342 SDValue Other;
31343 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
31344 Other = RHS;
31345 CC = ISD::getSetCCInverse(CC, true);
31346 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
31347 Other = LHS;
31348 }
31349
31350 if (Other.getNode() && Other->getNumOperands() == 2 &&
31351 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
31352 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
31353 SDValue CondRHS = Cond->getOperand(1);
31354
31355 // Look for a general sub with unsigned saturation first.
31356 // x >= y ? x-y : 0 --> subus x, y
31357 // x > y ? x-y : 0 --> subus x, y
31358 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
31359 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
31360 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
31361
31362 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
31363 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
31364 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
31365 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
31366 // If the RHS is a constant we have to reverse the const
31367 // canonicalization.
31368 // x > C-1 ? x+(-C) : 0 --> subus x, C
31369 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
31370 CondRHSConst->getAPIntValue() ==
31371 (-OpRHSConst->getAPIntValue() - 1))
31372 return DAG.getNode(
31373 X86ISD::SUBUS, DL, VT, OpLHS,
31374 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
31375
31376 // Another special case: If C was a sign bit, the sub has been
31377 // canonicalized into a xor.
31378 // FIXME: Would it be better to use computeKnownBits to determine
31379 // whether it's safe to decanonicalize the xor?
31380 // x s< 0 ? x^C : 0 --> subus x, C
31381 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
31382 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
31383 OpRHSConst->getAPIntValue().isSignMask())
31384 // Note that we have to rebuild the RHS constant here to ensure we
31385 // don't rely on particular values of undef lanes.
31386 return DAG.getNode(
31387 X86ISD::SUBUS, DL, VT, OpLHS,
31388 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
31389 }
31390 }
31391 }
31392
31393 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
31394 return V;
31395
31396 // If this is a *dynamic* select (non-constant condition) and we can match
31397 // this node with one of the variable blend instructions, restructure the
31398 // condition so that blends can use the high (sign) bit of each element and
31399 // use SimplifyDemandedBits to simplify the condition operand.
31400 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
31401 !DCI.isBeforeLegalize() &&
31402 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
31403 unsigned BitWidth = Cond.getScalarValueSizeInBits();
31404
31405 // Don't optimize vector selects that map to mask-registers.
31406 if (BitWidth == 1)
31407 return SDValue();
31408
31409 // We can only handle the cases where VSELECT is directly legal on the
31410 // subtarget. We custom lower VSELECT nodes with constant conditions and
31411 // this makes it hard to see whether a dynamic VSELECT will correctly
31412 // lower, so we both check the operation's status and explicitly handle the
31413 // cases where a *dynamic* blend will fail even though a constant-condition
31414 // blend could be custom lowered.
31415 // FIXME: We should find a better way to handle this class of problems.
31416 // Potentially, we should combine constant-condition vselect nodes
31417 // pre-legalization into shuffles and not mark as many types as custom
31418 // lowered.
31419 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
31420 return SDValue();
31421 // FIXME: We don't support i16-element blends currently. We could and
31422 // should support them by making *all* the bits in the condition be set
31423 // rather than just the high bit and using an i8-element blend.
31424 if (VT.getVectorElementType() == MVT::i16)
31425 return SDValue();
31426 // Dynamic blending was only available from SSE4.1 onward.
31427 if (VT.is128BitVector() && !Subtarget.hasSSE41())
31428 return SDValue();
31429 // Byte blends are only available in AVX2
31430 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
31431 return SDValue();
31432 // There are no 512-bit blend instructions that use sign bits.
31433 if (VT.is512BitVector())
31434 return SDValue();
31435
31436 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
31437 APInt DemandedMask(APInt::getSignMask(BitWidth));
31438 KnownBits Known;
31439 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
31440 !DCI.isBeforeLegalizeOps());
31441 if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
31442 TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
31443 // If we changed the computation somewhere in the DAG, this change will
31444 // affect all users of Cond. Make sure it is fine and update all the nodes
31445 // so that we do not use the generic VSELECT anymore. Otherwise, we may
31446 // perform wrong optimizations as we messed with the actual expectation
31447 // for the vector boolean values.
31448 if (Cond != TLO.Old) {
31449 // Check all uses of the condition operand to check whether it will be
31450 // consumed by non-BLEND instructions. Those may require that all bits
31451 // are set properly.
31452 for (SDNode *U : Cond->uses()) {
31453 // TODO: Add other opcodes eventually lowered into BLEND.
31454 if (U->getOpcode() != ISD::VSELECT)
31455 return SDValue();
31456 }
31457
31458 // Update all users of the condition before committing the change, so
31459 // that the VSELECT optimizations that expect the correct vector boolean
31460 // value will not be triggered.
31461 for (SDNode *U : Cond->uses()) {
31462 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
31463 U->getValueType(0), Cond, U->getOperand(1),
31464 U->getOperand(2));
31465 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
31466 }
31467 DCI.CommitTargetLoweringOpt(TLO);
31468 return SDValue();
31469 }
31470 // Only Cond (rather than other nodes in the computation chain) was
31471 // changed. Change the condition just for N to keep the opportunity to
31472 // optimize all other users their own way.
31473 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
31474 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
31475 return SDValue();
31476 }
31477 }
31478
31479 // Look for vselects with LHS/RHS being bitcasted from an operation that
31480 // can be executed on another type. Push the bitcast to the inputs of
31481 // the operation. This exposes opportunities for using masking instructions.
31482 if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
31483 CondVT.getVectorElementType() == MVT::i1) {
31484 if (combineBitcastForMaskedOp(LHS, DAG, DCI))
31485 return SDValue(N, 0);
31486 if (combineBitcastForMaskedOp(RHS, DAG, DCI))
31487 return SDValue(N, 0);
31488 }
31489
31490 // Custom action for SELECT MMX
31491 if (VT == MVT::x86mmx) {
31492 LHS = DAG.getBitcast(MVT::i64, LHS);
31493 RHS = DAG.getBitcast(MVT::i64, RHS);
31494 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
31495 return DAG.getBitcast(VT, newSelect);
31496 }
31497
31498 return SDValue();
31499}
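
For reference, a standalone sketch (not part of X86ISelLowering.cpp) of the per-byte identity behind the PSUBUS match inside combineSelect; subus is a hypothetical helper modelling one v16i8 lane of PSUBUSB:

#include <algorithm>
#include <cassert>
#include <cstdint>

static uint8_t subus(uint8_t X, uint8_t Y) {
  // Unsigned saturating subtract, as PSUBUSB computes it for each byte.
  return uint8_t(std::max(X, Y) - Y);
}

int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned Y = 0; Y < 256; ++Y) {
      uint8_t GE = (X >= Y) ? uint8_t(X - Y) : uint8_t(0); // x >= y ? x-y : 0
      uint8_t GT = (X > Y) ? uint8_t(X - Y) : uint8_t(0);  // x >  y ? x-y : 0
      assert(GE == subus(uint8_t(X), uint8_t(Y)));         // --> subus x, y
      assert(GT == subus(uint8_t(X), uint8_t(Y)));
    }
  return 0;
}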
31500
31501/// Combine:
31502/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
31503/// to:
31504/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
31505/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
31506/// Note that this is only legal for some op/cc combinations.
31507static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
31508 SelectionDAG &DAG,
31509 const X86Subtarget &Subtarget) {
31510 // This combine only operates on CMP-like nodes.
31511 if (!(Cmp.getOpcode() == X86ISD::CMP ||
31512 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
31513 return SDValue();
31514
31515 // Can't replace the cmp if it has more uses than the one we're looking at.
31516 // FIXME: We would like to be able to handle this, but would need to make sure
31517 // all uses were updated.
31518 if (!Cmp.hasOneUse())
31519 return SDValue();
31520
31521 // This only applies to variations of the common case:
31522 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
31523 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
31524 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
31525 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
31526 // Using the proper condcodes (see below), overflow is checked for.
31527
31528 // FIXME: We can generalize both constraints:
31529 // - XOR/OR/AND (if they were made to survive AtomicExpand)
31530 // - LHS != 1
31531 // if the result is compared.
31532
31533 SDValue CmpLHS = Cmp.getOperand(0);
31534 SDValue CmpRHS = Cmp.getOperand(1);
31535
31536 if (!CmpLHS.hasOneUse())
31537 return SDValue();
31538
31539 unsigned Opc = CmpLHS.getOpcode();
31540 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
31541 return SDValue();
31542
31543 SDValue OpRHS = CmpLHS.getOperand(2);
31544 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
31545 if (!OpRHSC)
31546 return SDValue();
31547
31548 APInt Addend = OpRHSC->getAPIntValue();
31549 if (Opc == ISD::ATOMIC_LOAD_SUB)
31550 Addend = -Addend;
31551
31552 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
31553 if (!CmpRHSC)
31554 return SDValue();
31555
31556 APInt Comparison = CmpRHSC->getAPIntValue();
31557
31558 // If the addend is the negation of the comparison value, then we can do
31559 // a full comparison by emitting the atomic arithmetic as a locked sub.
31560 if (Comparison == -Addend) {
31561 // The CC is fine, but we need to rewrite the LHS of the comparison as an
31562 // atomic sub.
31563 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
31564 auto AtomicSub = DAG.getAtomic(
31565 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
31566 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
31567 /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
31568 AN->getMemOperand());
31569 // If the comparison uses the CF flag we can't use INC/DEC instructions.
31570 bool NeedCF = false;
31571 switch (CC) {
31572 default: break;
31573 case X86::COND_A: case X86::COND_AE:
31574 case X86::COND_B: case X86::COND_BE:
31575 NeedCF = true;
31576 break;
31577 }
31578 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);
31579 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
31580 DAG.getUNDEF(CmpLHS.getValueType()));
31581 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
31582 return LockOp;
31583 }
31584
31585 // We can handle comparisons with zero in a number of cases by manipulating
31586 // the CC used.
31587 if (!Comparison.isNullValue())
31588 return SDValue();
31589
31590 if (CC == X86::COND_S && Addend == 1)
31591 CC = X86::COND_LE;
31592 else if (CC == X86::COND_NS && Addend == 1)
31593 CC = X86::COND_G;
31594 else if (CC == X86::COND_G && Addend == -1)
31595 CC = X86::COND_GE;
31596 else if (CC == X86::COND_LE && Addend == -1)
31597 CC = X86::COND_L;
31598 else
31599 return SDValue();
31600
31601 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
31602 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
31603 DAG.getUNDEF(CmpLHS.getValueType()));
31604 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
31605 return LockOp;
31606}
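
For reference, a standalone sketch (not part of X86ISelLowering.cpp) checking the four condition-code rewrites listed above on plain ints; the overflow corner cases that the comments mention lie outside the tested range:

#include <cassert>

int main() {
  for (int Old = -1000; Old <= 1000; ++Old) {
    assert((Old < 0) == (Old + 1 <= 0));   // COND_S  -> COND_LE (addend  1)
    assert((Old >= 0) == (Old + 1 > 0));   // COND_NS -> COND_G  (addend  1)
    assert((Old > 0) == (Old - 1 >= 0));   // COND_G  -> COND_GE (addend -1)
    assert((Old <= 0) == (Old - 1 < 0));   // COND_LE -> COND_L  (addend -1)
  }
  return 0;
}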
31607
31608// Check whether a boolean test is testing a boolean value generated by
31609// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
31610// code.
31611//
31612// Simplify the following patterns:
31613// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
31614// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
31615// to (Op EFLAGS Cond)
31616//
31617// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
31618// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
31619// to (Op EFLAGS !Cond)
31620//
31621// where Op could be BRCOND or CMOV.
31622//
31623static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
31624 // This combine only operates on CMP-like nodes.
31625 if (!(Cmp.getOpcode() == X86ISD::CMP ||
31626 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
31627 return SDValue();
31628
31629 // Quit if not used as a boolean value.
31630 if (CC != X86::COND_E && CC != X86::COND_NE)
31631 return SDValue();
31632
31633 // Check CMP operands. One of them should be 0 or 1 and the other should be
31634 // a SetCC or extended from it.
31635 SDValue Op1 = Cmp.getOperand(0);
31636 SDValue Op2 = Cmp.getOperand(1);
31637
31638 SDValue SetCC;
31639 const ConstantSDNode* C = nullptr;
31640 bool needOppositeCond = (CC == X86::COND_E);
31641 bool checkAgainstTrue = false; // Is it a comparison against 1?
31642
31643 if ((C = dyn_cast<ConstantSDNode>(Op1)))
31644 SetCC = Op2;
31645 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
31646 SetCC = Op1;
31647 else // Quit if neither operand is a constant.
31648 return SDValue();
31649
31650 if (C->getZExtValue() == 1) {
31651 needOppositeCond = !needOppositeCond;
31652 checkAgainstTrue = true;
31653 } else if (C->getZExtValue() != 0)
31654 // Quit if the constant is neither 0 nor 1.
31655 return SDValue();
31656
31657 bool truncatedToBoolWithAnd = false;
31658 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
31659 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
31660 SetCC.getOpcode() == ISD::TRUNCATE ||
31661 SetCC.getOpcode() == ISD::AND) {
31662 if (SetCC.getOpcode() == ISD::AND) {
31663 int OpIdx = -1;
31664 if (isOneConstant(SetCC.getOperand(0)))
31665 OpIdx = 1;
31666 if (isOneConstant(SetCC.getOperand(1)))
31667 OpIdx = 0;
31668 if (OpIdx < 0)
31669 break;
31670 SetCC = SetCC.getOperand(OpIdx);
31671 truncatedToBoolWithAnd = true;
31672 } else
31673 SetCC = SetCC.getOperand(0);
31674 }
31675
31676 switch (SetCC.getOpcode()) {
31677 case X86ISD::SETCC_CARRY:
31678 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
31679 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
31680 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
31681 // truncated to i1 using 'and'.
31682 if (checkAgainstTrue && !truncatedToBoolWithAnd)
31683 break;
31684 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
31685 "Invalid use of SETCC_CARRY!");
31686 LLVM_FALLTHROUGH;
31687 case X86ISD::SETCC:
31688 // Set the condition code or opposite one if necessary.
31689 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
31690 if (needOppositeCond)
31691 CC = X86::GetOppositeBranchCondition(CC);
31692 return SetCC.getOperand(1);
31693 case X86ISD::CMOV: {
31694 // Check whether false/true value has canonical one, i.e. 0 or 1.
31695 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
31696 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
31697 // Quit if true value is not a constant.
31698 if (!TVal)
31699 return SDValue();
31700 // Quit if false value is not a constant.
31701 if (!FVal) {
31702 SDValue Op = SetCC.getOperand(0);
31703 // Skip 'zext' or 'trunc' node.
31704 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
31705 Op.getOpcode() == ISD::TRUNCATE)
31706 Op = Op.getOperand(0);
31707 // A special case for rdrand/rdseed, where 0 is set if false cond is
31708 // found.
31709 if ((Op.getOpcode() != X86ISD::RDRAND &&
31710 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
31711 return SDValue();
31712 }
31713 // Quit if false value is not the constant 0 or 1.
31714 bool FValIsFalse = true;
31715 if (FVal && FVal->getZExtValue() != 0) {
31716 if (FVal->getZExtValue() != 1)
31717 return SDValue();
31718 // If FVal is 1, opposite cond is needed.
31719 needOppositeCond = !needOppositeCond;
31720 FValIsFalse = false;
31721 }
31722 // Quit if TVal is not the constant opposite of FVal.
31723 if (FValIsFalse && TVal->getZExtValue() != 1)
31724 return SDValue();
31725 if (!FValIsFalse && TVal->getZExtValue() != 0)
31726 return SDValue();
31727 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
31728 if (needOppositeCond)
31729 CC = X86::GetOppositeBranchCondition(CC);
31730 return SetCC.getOperand(3);
31731 }
31732 }
31733
31734 return SDValue();
31735}
31736
31737/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
31738/// Match:
31739/// (X86or (X86setcc) (X86setcc))
31740/// (X86cmp (and (X86setcc) (X86setcc)), 0)
31741static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
31742 X86::CondCode &CC1, SDValue &Flags,
31743 bool &isAnd) {
31744 if (Cond->getOpcode() == X86ISD::CMP) {
31745 if (!isNullConstant(Cond->getOperand(1)))
31746 return false;
31747
31748 Cond = Cond->getOperand(0);
31749 }
31750
31751 isAnd = false;
31752
31753 SDValue SetCC0, SetCC1;
31754 switch (Cond->getOpcode()) {
31755 default: return false;
31756 case ISD::AND:
31757 case X86ISD::AND:
31758 isAnd = true;
31759 LLVM_FALLTHROUGH;
31760 case ISD::OR:
31761 case X86ISD::OR:
31762 SetCC0 = Cond->getOperand(0);
31763 SetCC1 = Cond->getOperand(1);
31764 break;
31765 };
31766
31767 // Make sure we have SETCC nodes, using the same flags value.
31768 if (SetCC0.getOpcode() != X86ISD::SETCC ||
31769 SetCC1.getOpcode() != X86ISD::SETCC ||
31770 SetCC0->getOperand(1) != SetCC1->getOperand(1))
31771 return false;
31772
31773 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
31774 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
31775 Flags = SetCC0->getOperand(1);
31776 return true;
31777}
31778
31779// When legalizing carry, we create carries via add X, -1
31780// If that comes from an actual carry, via setcc, we use the
31781// carry directly.
31782static SDValue combineCarryThroughADD(SDValue EFLAGS) {
31783 if (EFLAGS.getOpcode() == X86ISD::ADD) {
31784 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
31785 SDValue Carry = EFLAGS.getOperand(0);
31786 while (Carry.getOpcode() == ISD::TRUNCATE ||
31787 Carry.getOpcode() == ISD::ZERO_EXTEND ||
31788 Carry.getOpcode() == ISD::SIGN_EXTEND ||
31789 Carry.getOpcode() == ISD::ANY_EXTEND ||
31790 (Carry.getOpcode() == ISD::AND &&
31791 isOneConstant(Carry.getOperand(1))))
31792 Carry = Carry.getOperand(0);
31793 if (Carry.getOpcode() == X86ISD::SETCC ||
31794 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
31795 if (Carry.getConstantOperandVal(0) == X86::COND_B)
31796 return Carry.getOperand(1);
31797 }
31798 }
31799 }
31800
31801 return SDValue();
31802}
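
For reference, a standalone sketch (not part of X86ISelLowering.cpp) of why adding all-ones to a 0/1 boolean reproduces the original carry, which is what lets combineCarryThroughADD reuse the setcc directly; carryOfAdd is a hypothetical helper:

#include <cassert>
#include <cstdint>

static bool carryOfAdd(uint32_t A, uint32_t B) {
  return uint64_t(A) + uint64_t(B) > UINT32_MAX;  // unsigned carry-out
}

int main() {
  // A setcc producing 0 or 1, added to -1, sets CF exactly when it is 1.
  assert(carryOfAdd(0u, 0xFFFFFFFFu) == false);
  assert(carryOfAdd(1u, 0xFFFFFFFFu) == true);
  return 0;
}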
31803
31804/// Optimize an EFLAGS definition used according to the condition code \p CC
31805/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
31806/// uses of chain values.
31807static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
31808 SelectionDAG &DAG,
31809 const X86Subtarget &Subtarget) {
31810 if (CC == X86::COND_B)
31811 if (SDValue Flags = combineCarryThroughADD(EFLAGS))
31812 return Flags;
31813
31814 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
31815 return R;
31816 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
31817}
31818
31819/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
31820static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
31821 TargetLowering::DAGCombinerInfo &DCI,
31822 const X86Subtarget &Subtarget) {
31823 SDLoc DL(N);
31824
31825 SDValue FalseOp = N->getOperand(0);
31826 SDValue TrueOp = N->getOperand(1);
31827 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
31828 SDValue Cond = N->getOperand(3);
31829
31830 if (CC == X86::COND_E || CC == X86::COND_NE) {
31831 switch (Cond.getOpcode()) {
31832 default: break;
31833 case X86ISD::BSR:
31834 case X86ISD::BSF:
31835 // If the operand of BSR / BSF is proven never zero, then ZF cannot be set.
31836 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
31837 return (CC == X86::COND_E) ? FalseOp : TrueOp;
31838 }
31839 }
31840
31841 // Try to simplify the EFLAGS and condition code operands.
31842 // We can't always do this as FCMOV only supports a subset of X86 cond.
31843 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
31844 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
31845 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
31846 Flags};
31847 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
31848 }
31849 }
31850
31851 // If this is a select between two integer constants, try to do some
31852 // optimizations. Note that the operands are ordered the opposite of SELECT
31853 // operands.
31854 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
31855 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
31856 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
31857 // larger than FalseC (the false value).
31858 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
31859 CC = X86::GetOppositeBranchCondition(CC);
31860 std::swap(TrueC, FalseC);
31861 std::swap(TrueOp, FalseOp);
31862 }
31863
31864 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
31865 // This is efficient for any integer data type (including i8/i16) and
31866 // shift amount.
31867 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
31868 Cond = getSETCC(CC, Cond, DL, DAG);
31869
31870 // Zero extend the condition if needed.
31871 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
31872
31873 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
31874 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
31875 DAG.getConstant(ShAmt, DL, MVT::i8));
31876 return Cond;
31877 }
31878
31879 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
31880 // for any integer data type, including i8/i16.
31881 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
31882 Cond = getSETCC(CC, Cond, DL, DAG);
31883
31884 // Zero extend the condition if needed.
31885 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
31886 FalseC->getValueType(0), Cond);
31887 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
31888 SDValue(FalseC, 0));
31889 return Cond;
31890 }
31891
31892 // Optimize cases that will turn into an LEA instruction. This requires
31893 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
31894 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
31895 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
31896 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
31897
31898 bool isFastMultiplier = false;
31899 if (Diff < 10) {
31900 switch ((unsigned char)Diff) {
31901 default: break;
31902 case 1: // result = add base, cond
31903 case 2: // result = lea base( , cond*2)
31904 case 3: // result = lea base(cond, cond*2)
31905 case 4: // result = lea base( , cond*4)
31906 case 5: // result = lea base(cond, cond*4)
31907 case 8: // result = lea base( , cond*8)
31908 case 9: // result = lea base(cond, cond*8)
31909 isFastMultiplier = true;
31910 break;
31911 }
31912 }
31913
31914 if (isFastMultiplier) {
31915 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
31916 Cond = getSETCC(CC, Cond, DL ,DAG);
31917 // Zero extend the condition if needed.
31918 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
31919 Cond);
31920 // Scale the condition by the difference.
31921 if (Diff != 1)
31922 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
31923 DAG.getConstant(Diff, DL, Cond.getValueType()));
31924
31925 // Add the base if non-zero.
31926 if (FalseC->getAPIntValue() != 0)
31927 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
31928 SDValue(FalseC, 0));
31929 return Cond;
31930 }
31931 }
31932 }
31933 }
31934
31935 // Handle these cases:
31936 // (select (x != c), e, c) -> select (x != c), e, x),
31937 // (select (x == c), c, e) -> select (x == c), x, e)
31938 // where the c is an integer constant, and the "select" is the combination
31939 // of CMOV and CMP.
31940 //
31941 // The rationale for this change is that the conditional-move from a constant
31942 // needs two instructions, however, conditional-move from a register needs
31943 // only one instruction.
31944 //
31945 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
31946 // some instruction-combining opportunities. This opt needs to be
31947 // postponed as late as possible.
31948 //
31949 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
31950 // the DCI.xxxx conditions are provided to postpone the optimization as
31951 // late as possible.
31952
31953 ConstantSDNode *CmpAgainst = nullptr;
31954 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
31955 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
31956 !isa<ConstantSDNode>(Cond.getOperand(0))) {
31957
31958 if (CC == X86::COND_NE &&
31959 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
31960 CC = X86::GetOppositeBranchCondition(CC);
31961 std::swap(TrueOp, FalseOp);
31962 }
31963
31964 if (CC == X86::COND_E &&
31965 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
31966 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
31967 DAG.getConstant(CC, DL, MVT::i8), Cond };
31968 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
31969 }
31970 }
31971 }
31972
31973 // Fold and/or of setcc's to double CMOV:
31974 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
31975 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
31976 //
31977 // This combine lets us generate:
31978 // cmovcc1 (jcc1 if we don't have CMOV)
31979 // cmovcc2 (same)
31980 // instead of:
31981 // setcc1
31982 // setcc2
31983 // and/or
31984 // cmovne (jne if we don't have CMOV)
31985 // When we can't use the CMOV instruction, it might increase branch
31986 // mispredicts.
31987 // When we can use CMOV, or when there is no mispredict, this improves
31988 // throughput and reduces register pressure.
31989 //
31990 if (CC == X86::COND_NE) {
31991 SDValue Flags;
31992 X86::CondCode CC0, CC1;
31993 bool isAndSetCC;
31994 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
31995 if (isAndSetCC) {
31996 std::swap(FalseOp, TrueOp);
31997 CC0 = X86::GetOppositeBranchCondition(CC0);
31998 CC1 = X86::GetOppositeBranchCondition(CC1);
31999 }
32000
32001 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
32002 Flags};
32003 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
32004 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
32005 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
32006 return CMOV;
32007 }
32008 }
32009
32010 return SDValue();
32011}
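
For reference, a standalone sketch (not part of X86ISelLowering.cpp) of the two scalar identities behind the CMOV-of-constants rewrites above (power-of-two difference and difference of one):

#include <cassert>
#include <cstdint>

int main() {
  for (int C = 0; C <= 1; ++C) {
    // C ? 8 : 0  ==  zext(C) << 3      (any power of two and matching shift)
    assert((C ? 8u : 0u) == (uint32_t(C) << 3));
    // C ? Cst+1 : Cst  ==  zext(C) + Cst   (here Cst == 41)
    assert((C ? 42u : 41u) == (uint32_t(C) + 41u));
  }
  return 0;
}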
32012
32013/// Different mul shrinking modes.
32014enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
32015
32016static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
32017 EVT VT = N->getOperand(0).getValueType();
32018 if (VT.getScalarSizeInBits() != 32)
32019 return false;
32020
32021 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
32022 unsigned SignBits[2] = {1, 1};
32023 bool IsPositive[2] = {false, false};
32024 for (unsigned i = 0; i < 2; i++) {
32025 SDValue Opd = N->getOperand(i);
32026
32027 // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
32028 // compute signbits for it separately.
32029 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
32030 // For anyextend, it is safe to assume an appropriate number of leading
32031 // sign/zero bits.
32032 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
32033 SignBits[i] = 25;
32034 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
32035 MVT::i16)
32036 SignBits[i] = 17;
32037 else
32038 return false;
32039 IsPositive[i] = true;
32040 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
32041 // All the operands of BUILD_VECTOR need to be integer constants.
32042 // Find the smallest value range which all the operands belong to.
32043 SignBits[i] = 32;
32044 IsPositive[i] = true;
32045 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
32046 if (SubOp.isUndef())
32047 continue;
32048 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
32049 if (!CN)
32050 return false;
32051 APInt IntVal = CN->getAPIntValue();
32052 if (IntVal.isNegative())
32053 IsPositive[i] = false;
32054 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
32055 }
32056 } else {
32057 SignBits[i] = DAG.ComputeNumSignBits(Opd);
32058 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
32059 IsPositive[i] = true;
32060 }
32061 }
32062
32063 bool AllPositive = IsPositive[0] && IsPositive[1];
32064 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
32065 // When ranges are from -128 ~ 127, use MULS8 mode.
32066 if (MinSignBits >= 25)
32067 Mode = MULS8;
32068 // When ranges are from 0 ~ 255, use MULU8 mode.
32069 else if (AllPositive && MinSignBits >= 24)
32070 Mode = MULU8;
32071 // When ranges are from -32768 ~ 32767, use MULS16 mode.
32072 else if (MinSignBits >= 17)
32073 Mode = MULS16;
32074 // When ranges are from 0 ~ 65535, use MULU16 mode.
32075 else if (AllPositive && MinSignBits >= 16)
32076 Mode = MULU16;
32077 else
32078 return false;
32079 return true;
32080}
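
For reference, a standalone sketch (not part of X86ISelLowering.cpp) of why the sign-bit thresholds above select the shrink mode; numSignBits is a hypothetical scalar stand-in for DAG.ComputeNumSignBits on an i32 lane:

#include <cassert>
#include <cstdint>

static unsigned numSignBits(int32_t V) {
  // Count how many copies of the sign bit sit at the top of an i32.
  uint32_t U = uint32_t(V);
  unsigned N = 1;
  while (N < 32 && ((U >> (31 - N)) & 1) == (U >> 31))
    ++N;
  return N;
}

int main() {
  // At least 25 sign bits: the value round-trips through i8 (MULS8/MULU8).
  assert(numSignBits(127) == 25 && int32_t(int8_t(127)) == 127);
  assert(numSignBits(-128) == 25 && int32_t(int8_t(-128)) == -128);
  // 17..24 sign bits: fits i16 but not i8 (MULS16/MULU16).
  assert(numSignBits(32767) == 17 && int32_t(int16_t(32767)) == 32767);
  assert(numSignBits(-32768) == 17);
  // Only 16 sign bits: too wide for any of the shrink modes.
  assert(numSignBits(32768) == 16);
  return 0;
}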
32081
32082/// When the operands of vector mul are extended from smaller size values,
32083/// like i8 and i16, the type of mul may be shrunk to generate more
32084/// efficient code. Two typical patterns are handled:
32085/// Pattern1:
32086/// %2 = sext/zext <N x i8> %1 to <N x i32>
32087/// %4 = sext/zext <N x i8> %3 to <N x i32>
32088/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
32089/// %5 = mul <N x i32> %2, %4
32090///
32091/// Pattern2:
32092/// %2 = zext/sext <N x i16> %1 to <N x i32>
32093/// %4 = zext/sext <N x i16> %3 to <N x i32>
32094/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
32095/// %5 = mul <N x i32> %2, %4
32096///
32097/// There are four mul shrinking modes:
32098/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
32099/// -128 to 128, and the scalar value range of %4 is also -128 to 128,
32100/// generate pmullw+sext32 for it (MULS8 mode).
32101/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
32102/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
32103/// generate pmullw+zext32 for it (MULU8 mode).
32104/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
32105/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
32106/// generate pmullw+pmulhw for it (MULS16 mode).
32107/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
32108/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
32109/// generate pmullw+pmulhuw for it (MULU16 mode).
32110static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
32111 const X86Subtarget &Subtarget) {
32112 // Check for legality
32113 // pmullw/pmulhw are not supported by SSE.
32114 if (!Subtarget.hasSSE2())
32115 return SDValue();
32116
32117 // Check for profitability
32118 // pmulld is supported since SSE41. It is better to use pmulld
32119 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
32120 // the expansion.
32121 bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
32122 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
32123 return SDValue();
32124
32125 ShrinkMode Mode;
32126 if (!canReduceVMulWidth(N, DAG, Mode))
32127 return SDValue();
32128
32129 SDLoc DL(N);
32130 SDValue N0 = N->getOperand(0);
32131 SDValue N1 = N->getOperand(1);
32132 EVT VT = N->getOperand(0).getValueType();
32133 unsigned NumElts = VT.getVectorNumElements();
32134 if ((NumElts % 2) != 0)
32135 return SDValue();
32136
32137 unsigned RegSize = 128;
32138 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
32139 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
32140
32141 // Shrink the operands of mul.
32142 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
32143 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
32144
32145 if (NumElts >= OpsVT.getVectorNumElements()) {
32146 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
32147 // lower part is needed.
32148 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
32149 if (Mode == MULU8 || Mode == MULS8) {
32150 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
32151 DL, VT, MulLo);
32152 } else {
32153 MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
32154 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
32155 // the higher part is also needed.
32156 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
32157 ReducedVT, NewN0, NewN1);
32158
32159 // Repack the lower part and higher part result of mul into a wider
32160 // result.
32161 // Generate shuffle functioning as punpcklwd.
32162 SmallVector<int, 16> ShuffleMask(NumElts);
32163 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
32164 ShuffleMask[2 * i] = i;
32165 ShuffleMask[2 * i + 1] = i + NumElts;
32166 }
32167 SDValue ResLo =
32168 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
32169 ResLo = DAG.getBitcast(ResVT, ResLo);
32170 // Generate shuffle functioning as punpckhwd.
32171 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
32172 ShuffleMask[2 * i] = i + NumElts / 2;
32173 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
32174 }
32175 SDValue ResHi =
32176 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
32177 ResHi = DAG.getBitcast(ResVT, ResHi);
32178 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
32179 }
32180 } else {
32181 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
32182 // to legalize the mul explicitly because implicit legalization for type
32183 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
32184 // instructions which will not exist when we explicitly legalize it by
32185 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
32186 // <4 x i16> undef).
32187 //
32188 // Legalize the operands of mul.
32189 // FIXME: We may be able to handle non-concatenated vectors by insertion.
32190 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
32191 if ((RegSize % ReducedSizeInBits) != 0)
32192 return SDValue();
32193
32194 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
32195 DAG.getUNDEF(ReducedVT));
32196 Ops[0] = NewN0;
32197 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
32198 Ops[0] = NewN1;
32199 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
32200
32201 if (Mode == MULU8 || Mode == MULS8) {
32202 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
32203 // part is needed.
32204 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
32205
32206 // convert the type of mul result to VT.
32207 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
32208 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
32209 : ISD::SIGN_EXTEND_VECTOR_INREG,
32210 DL, ResVT, Mul);
32211 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
32212 DAG.getIntPtrConstant(0, DL));
32213 } else {
32214 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
32215 // MULU16/MULS16, both parts are needed.
32216 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
32217 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
32218 OpsVT, NewN0, NewN1);
32219
32220 // Repack the lower part and higher part result of mul into a wider
32221 // result. Make sure the type of mul result is VT.
32222 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
32223 SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
32224 Res = DAG.getBitcast(ResVT, Res);
32225 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
32226 DAG.getIntPtrConstant(0, DL));
32227 }
32228 }
32229}
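
For reference, a standalone sketch (not part of X86ISelLowering.cpp) of the per-lane identity that the unpack shuffles above reassemble: a full 32-bit product is the pmullw low half glued to the pmulhuw high half (unsigned case shown; pmulhw covers the signed modes):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t A = 0; A < 65536; A += 257)
    for (uint32_t B = 0; B < 65536; B += 263) {
      uint32_t Full = A * B;                  // the i32 product we want
      uint16_t Lo = uint16_t(A * B);          // pmullw lane
      uint16_t Hi = uint16_t((A * B) >> 16);  // pmulhuw lane
      assert(Full == (uint32_t(Lo) | (uint32_t(Hi) << 16)));  // unpack repack
    }
  return 0;
}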
32230
32231static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
32232 EVT VT, SDLoc DL) {
32233
32234 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
32235 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
32236 DAG.getConstant(Mult, DL, VT));
32237 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
32238 DAG.getConstant(Shift, DL, MVT::i8));
32239 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
32240 N->getOperand(0));
32241 return Result;
32242 };
32243
32244 auto combineMulMulAddOrSub = [&](bool isAdd) {
32245 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
32246 DAG.getConstant(9, DL, VT));
32247 Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
32248 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
32249 N->getOperand(0));
32250 return Result;
32251 };
32252
32253 switch (MulAmt) {
32254 default:
32255 break;
32256 case 11:
32257 // mul x, 11 => add ((shl (mul x, 5), 1), x)
32258 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
32259 case 21:
32260 // mul x, 21 => add ((shl (mul x, 5), 2), x)
32261 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
32262 case 22:
32263 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
32264 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
32265 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
32266 case 19:
32267 // mul x, 19 => sub ((shl (mul x, 5), 2), x)
32268 return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
32269 case 13:
32270 // mul x, 13 => add ((shl (mul x, 3), 2), x)
32271 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
32272 case 23:
32273 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
32274 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
32275 case 14:
32276 // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
32277 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
32278 combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
32279 case 26:
32280 // mul x, 26 => sub ((mul (mul x, 9), 3), x)
32281 return combineMulMulAddOrSub(/*isAdd*/ false);
32282 case 28:
32283 // mul x, 28 => add ((mul (mul x, 9), 3), x)
32284 return combineMulMulAddOrSub(/*isAdd*/ true);
32285 case 29:
32286 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
32287 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
32288 combineMulMulAddOrSub(/*isAdd*/ true));
32289 case 30:
32290 // mul x, 30 => sub (sub ((shl x, 5), x), x)
32291 return DAG.getNode(
32292 ISD::SUB, DL, VT,
32293 DAG.getNode(ISD::SUB, DL, VT,
32294 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
32295 DAG.getConstant(5, DL, MVT::i8)),
32296 N->getOperand(0)),
32297 N->getOperand(0));
32298 }
32299 return SDValue();
32300}
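
For reference, a standalone sketch (not part of X86ISelLowering.cpp) spot-checking a few of the constant-multiply decompositions handled above:

#include <cassert>
#include <cstdint>

int main() {
  for (uint64_t X = 0; X < 1000; ++X) {
    assert(X * 11 == ((X * 5) << 1) + X);    // case 11
    assert(X * 19 == ((X * 5) << 2) - X);    // case 19
    assert(X * 23 == ((X * 3) << 3) - X);    // case 23
    assert(X * 29 == (X * 9) * 3 + X + X);   // case 29
    assert(X * 30 == ((X << 5) - X) - X);    // case 30
  }
  return 0;
}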
32301
32302/// Optimize a single multiply with constant into two operations in order to
32303/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
32304static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
32305 TargetLowering::DAGCombinerInfo &DCI,
32306 const X86Subtarget &Subtarget) {
32307 EVT VT = N->getValueType(0);
32308 if (DCI.isBeforeLegalize() && VT.isVector())
32309 return reduceVMULWidth(N, DAG, Subtarget);
32310
32311 if (!MulConstantOptimization)
32312 return SDValue();
32313 // An imul is usually smaller than the alternative sequence.
32314 if (DAG.getMachineFunction().getFunction()->optForMinSize())
32315 return SDValue();
32316
32317 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
32318 return SDValue();
32319
32320 if (VT != MVT::i64 && VT != MVT::i32)
32321 return SDValue();
32322
32323 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
32324 if (!C)
32325 return SDValue();
32326 uint64_t MulAmt = C->getZExtValue();
32327 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
32328 return SDValue();
32329
32330 uint64_t MulAmt1 = 0;
32331 uint64_t MulAmt2 = 0;
32332 if ((MulAmt % 9) == 0) {
32333 MulAmt1 = 9;
32334 MulAmt2 = MulAmt / 9;
32335 } else if ((MulAmt % 5) == 0) {
32336 MulAmt1 = 5;
32337 MulAmt2 = MulAmt / 5;
32338 } else if ((MulAmt % 3) == 0) {
32339 MulAmt1 = 3;
32340 MulAmt2 = MulAmt / 3;
32341 }
32342
32343 SDLoc DL(N);
32344 SDValue NewMul;
32345 if (MulAmt2 &&
32346 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
32347
32348 if (isPowerOf2_64(MulAmt2) &&
32349 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
32350 // If second multiplier is pow2, issue it first. We want the multiply by
32351 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
32352 // is an add.
32353 std::swap(MulAmt1, MulAmt2);
32354
32355 if (isPowerOf2_64(MulAmt1))
32356 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
32357 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
32358 else
32359 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
32360 DAG.getConstant(MulAmt1, DL, VT));
32361
32362 if (isPowerOf2_64(MulAmt2))
32363 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
32364 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
32365 else
32366 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
32367 DAG.getConstant(MulAmt2, DL, VT));
32368 } else if (!Subtarget.slowLEA())
32369 NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
32370
32371 if (!NewMul) {
32372 assert(MulAmt != 0 &&
32373 MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
32374 "Both cases that could cause potential overflows should have "
32375 "already been handled.");
32376 int64_t SignMulAmt = C->getSExtValue();
32377 if ((SignMulAmt != INT64_MIN(-9223372036854775807L -1)) && (SignMulAmt != INT64_MAX(9223372036854775807L)) &&
32378 (SignMulAmt != -INT64_MAX(9223372036854775807L))) {
32379 int NumSign = SignMulAmt > 0 ? 1 : -1;
32380 bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
32381 bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
32382 if (IsPowerOf2_64PlusOne) {
32383 // (mul x, 2^N + 1) => (add (shl x, N), x)
32384 NewMul = DAG.getNode(
32385 ISD::ADD, DL, VT, N->getOperand(0),
32386 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
32387 DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
32388 MVT::i8)));
32389 } else if (IsPowerOf2_64MinusOne) {
32390 // (mul x, 2^N - 1) => (sub (shl x, N), x)
32391 NewMul = DAG.getNode(
32392 ISD::SUB, DL, VT,
32393 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
32394 DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
32395 MVT::i8)),
32396 N->getOperand(0));
32397 }
32398 // To negate, subtract the number from zero
32399 if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
32400 NewMul =
32401 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
32402 }
32403 }
32404
32405 if (NewMul)
32406 // Do not add new nodes to DAG combiner worklist.
32407 DCI.CombineTo(N, NewMul, false);
32408
32409 return SDValue();
32410}
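// A rough standalone model (illustrative only; the helper and main() below are
// not part of this file) of the two-factor decomposition above: any MulAmt
// that splits into a factor from {3, 5, 9} and a second factor that is a power
// of two or again 3/5/9 can be lowered as an LEA/SHL pair instead of an IMUL,
// e.g. 45 == 9 * 5 and 40 == 5 * 8.
#include <cassert>
#include <cstdint>
static uint64_t mulViaTwoFactors(uint64_t X, uint64_t F1, uint64_t F2) {
  // Each 3/5/9 factor corresponds to one LEA (base + scaled index); a
  // power-of-two factor corresponds to one SHL.
  return (X * F1) * F2;
}
int main() {
  assert(mulViaTwoFactors(7, 9, 5) == 7 * 45);
  assert(mulViaTwoFactors(7, 5, 8) == 7 * 40);
  return 0;
}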
32411
32412static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
32413 SDValue N0 = N->getOperand(0);
32414 SDValue N1 = N->getOperand(1);
32415 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
32416 EVT VT = N0.getValueType();
32417
32418 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
32419 // since the result of setcc_c is all zero's or all ones.
32420 if (VT.isInteger() && !VT.isVector() &&
32421 N1C && N0.getOpcode() == ISD::AND &&
32422 N0.getOperand(1).getOpcode() == ISD::Constant) {
32423 SDValue N00 = N0.getOperand(0);
32424 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
32425 Mask <<= N1C->getAPIntValue();
32426 bool MaskOK = false;
32427 // We can handle cases concerning bit-widening nodes containing setcc_c if
32428 // we carefully interrogate the mask to make sure the transform is
32429 // semantics-preserving.
32430 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
32431 // of the underlying setcc_c operation if the setcc_c was zero extended.
32432 // Consider the following example:
32433 // zext(setcc_c) -> i32 0x0000FFFF
32434 // c1 -> i32 0x0000FFFF
32435 // c2 -> i32 0x00000001
32436 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
32437 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
32438 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
32439 MaskOK = true;
32440 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
32441 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
32442 MaskOK = true;
32443 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
32444 N00.getOpcode() == ISD::ANY_EXTEND) &&
32445 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
32446 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
32447 }
32448 if (MaskOK && Mask != 0) {
32449 SDLoc DL(N);
32450 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
32451 }
32452 }
32453
32454 // Hardware support for vector shifts is sparse which makes us scalarize the
32455 // vector operations in many cases. Also, on sandybridge ADD is faster than
32456 // shl.
32457 // (shl V, 1) -> add V,V
32458 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
32459 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
32460 assert(N0.getValueType().isVector() && "Invalid vector shift type");
32461 // We shift all of the values by one. In many cases we do not have
32462 // hardware support for this operation. This is better expressed as an ADD
32463 // of two values.
32464 if (N1SplatC->getAPIntValue() == 1)
32465 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
32466 }
32467
32468 return SDValue();
32469}
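// Worked numeric check (standalone sketch, not part of this file) of the
// zero-extension hazard described in the comment above: with the setcc_c value
// zero-extended to 0x0000FFFF, c1 = 0x0000FFFF and c2 = 1, the two forms
// disagree in bit 16, so the fold must be rejected unless the shifted mask
// still fits in the original width.
#include <cassert>
#include <cstdint>
int main() {
  uint32_t ZextSetCC = 0x0000FFFFu, C1 = 0x0000FFFFu, C2 = 1;
  assert(((ZextSetCC & C1) << C2) == 0x0001FFFEu); // (shl (and setcc_c, c1), c2)
  assert((ZextSetCC & (C1 << C2)) == 0x0000FFFEu); // (and setcc_c, (c1 << c2))
  return 0;
}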
32470
32471static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
32472 SDValue N0 = N->getOperand(0);
32473 SDValue N1 = N->getOperand(1);
32474 EVT VT = N0.getValueType();
32475 unsigned Size = VT.getSizeInBits();
32476
32477 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
32478 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
32479 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
32480 // depending on sign of (SarConst - [56,48,32,24,16])
32481
32482 // sexts on X86 are MOVs. The MOVs have the same code size
32483 // as the SHIFTs above (only a SHIFT by 1 has smaller code size).
32484 // However, the MOVs have two advantages over a SHIFT:
32485 // 1. MOVs can write to a register that differs from the source.
32486 // 2. MOVs accept memory operands.
32487
32488 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
32489 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
32490 N0.getOperand(1).getOpcode() != ISD::Constant)
32491 return SDValue();
32492
32493 SDValue N00 = N0.getOperand(0);
32494 SDValue N01 = N0.getOperand(1);
32495 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
32496 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
32497 EVT CVT = N1.getValueType();
32498
32499 if (SarConst.isNegative())
32500 return SDValue();
32501
32502 for (MVT SVT : MVT::integer_valuetypes()) {
32503 unsigned ShiftSize = SVT.getSizeInBits();
32504 // Skip types without a corresponding sext/zext and
32505 // ShlConst values that are not one of [56,48,32,24,16].
32506 if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
32507 continue;
32508 SDLoc DL(N);
32509 SDValue NN =
32510 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
32511 SarConst = SarConst - (Size - ShiftSize);
32512 if (SarConst == 0)
32513 return NN;
32514 else if (SarConst.isNegative())
32515 return DAG.getNode(ISD::SHL, DL, VT, NN,
32516 DAG.getConstant(-SarConst, DL, CVT));
32517 else
32518 return DAG.getNode(ISD::SRA, DL, VT, NN,
32519 DAG.getConstant(SarConst, DL, CVT));
32520 }
32521 return SDValue();
32522}
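// Numeric illustration (standalone sketch, not part of this file) of the fold
// above for a 32-bit value and ShlConst == 24: shifting left by 24 and
// arithmetically right by 26 equals sign-extending the low byte and shifting
// right by 26 - 24 = 2. This assumes the usual two's-complement, arithmetic
// behaviour of >> on signed values (implementation-defined in ISO C++).
#include <cassert>
#include <cstdint>
int main() {
  for (int32_t A : {0x135, 0x1B5}) { // low bytes 0x35 (positive), 0xB5 (negative)
    int32_t Original = (int32_t)((uint32_t)A << 24) >> 26;
    int32_t Folded = (int32_t)(int8_t)(A & 0xFF) >> 2;
    assert(Original == Folded);
  }
  return 0;
}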
32523
32524static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
32525 SDValue N0 = N->getOperand(0);
32526 SDValue N1 = N->getOperand(1);
32527 EVT VT = N0.getValueType();
32528
32529 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
32530 // TODO: This is a generic DAG combine that became an x86-only combine to
32531 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
32532 // and-not ('andn').
32533 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
32534 return SDValue();
32535
32536 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
32537 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
32538 if (!ShiftC || !AndC)
32539 return SDValue();
32540
32541 // If we can shrink the constant mask below 8-bits or 32-bits, then this
32542 // transform should reduce code size. It may also enable secondary transforms
32543 // from improved known-bits analysis or instruction selection.
32544 APInt MaskVal = AndC->getAPIntValue();
32545 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
32546 unsigned OldMaskSize = MaskVal.getMinSignedBits();
32547 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
32548 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
32549 (OldMaskSize > 32 && NewMaskSize <= 32)) {
32550 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
32551 SDLoc DL(N);
32552 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
32553 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
32554 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
32555 }
32556 return SDValue();
32557}
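// Numeric check (standalone sketch, not part of this file) of the reordering
// above: srl (and X, 0xFF0), 4 equals and (srl X, 4), 0xFF, and the shifted
// mask 0xFF now fits in 8 bits, so it can be encoded as a smaller immediate.
#include <cassert>
#include <cstdint>
int main() {
  for (uint32_t X : {0x00000000u, 0x00001234u, 0xFFFFFFFFu})
    assert(((X & 0xFF0u) >> 4) == ((X >> 4) & 0xFFu));
  return 0;
}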
32558
32559/// \brief Returns a vector of 0s if the input node is a vector logical
32560/// shift by a constant amount which is known to be bigger than or equal
32561/// to the vector element size in bits.
32562static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
32563 const X86Subtarget &Subtarget) {
32564 EVT VT = N->getValueType(0);
32565
32566 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
32567 (!Subtarget.hasInt256() ||
32568 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
32569 return SDValue();
32570
32571 SDValue Amt = N->getOperand(1);
32572 SDLoc DL(N);
32573 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
32574 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
32575 const APInt &ShiftAmt = AmtSplat->getAPIntValue();
32576 unsigned MaxAmount =
32577 VT.getSimpleVT().getScalarSizeInBits();
32578
32579 // SSE2/AVX2 logical shifts always return a vector of 0s
32580 // if the shift amount is bigger than or equal to
32581 // the element size. The constant shift amount will be
32582 // encoded as an 8-bit immediate.
32583 if (ShiftAmt.trunc(8).uge(MaxAmount))
32584 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
32585 }
32586
32587 return SDValue();
32588}
32589
32590static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
32591 TargetLowering::DAGCombinerInfo &DCI,
32592 const X86Subtarget &Subtarget) {
32593 if (N->getOpcode() == ISD::SHL)
32594 if (SDValue V = combineShiftLeft(N, DAG))
32595 return V;
32596
32597 if (N->getOpcode() == ISD::SRA)
32598 if (SDValue V = combineShiftRightArithmetic(N, DAG))
32599 return V;
32600
32601 if (N->getOpcode() == ISD::SRL)
32602 if (SDValue V = combineShiftRightLogical(N, DAG))
32603 return V;
32604
32605 // Try to fold this logical shift into a zero vector.
32606 if (N->getOpcode() != ISD::SRA)
32607 if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
32608 return V;
32609
32610 return SDValue();
32611}
32612
32613static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
32614 TargetLowering::DAGCombinerInfo &DCI,
32615 const X86Subtarget &Subtarget) {
32616 unsigned Opcode = N->getOpcode();
32617 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
32618 "Unexpected shift opcode");
32619
32620 EVT VT = N->getValueType(0);
32621 SDValue N0 = N->getOperand(0);
32622 SDValue N1 = N->getOperand(1);
32623 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
32624 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
32625 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
32626 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
32627 "Unexpected PACKSS/PACKUS input type");
32628
32629 // Constant Folding.
32630 APInt UndefElts0, UndefElts1;
32631 SmallVector<APInt, 32> EltBits0, EltBits1;
32632 if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
32633 (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
32634 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
32635 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
32636 unsigned NumLanes = VT.getSizeInBits() / 128;
32637 unsigned NumDstElts = VT.getVectorNumElements();
32638 unsigned NumSrcElts = NumDstElts / 2;
32639 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
32640 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
32641 bool IsSigned = (X86ISD::PACKSS == Opcode);
32642
32643 APInt Undefs(NumDstElts, 0);
32644 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
32645 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
32646 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
32647 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
32648 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
32649 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
32650
32651 if (UndefElts[SrcIdx]) {
32652 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
32653 continue;
32654 }
32655
32656 APInt &Val = EltBits[SrcIdx];
32657 if (IsSigned) {
32658 // PACKSS: Truncate signed value with signed saturation.
32659 // Source values less than dst minint are saturated to minint.
32660 // Source values greater than dst maxint are saturated to maxint.
32661 if (Val.isSignedIntN(DstBitsPerElt))
32662 Val = Val.trunc(DstBitsPerElt);
32663 else if (Val.isNegative())
32664 Val = APInt::getSignedMinValue(DstBitsPerElt);
32665 else
32666 Val = APInt::getSignedMaxValue(DstBitsPerElt);
32667 } else {
32668 // PACKUS: Truncate signed value with unsigned saturation.
32669 // Source values less than zero are saturated to zero.
32670 // Source values greater than dst maxuint are saturated to maxuint.
32671 if (Val.isIntN(DstBitsPerElt))
32672 Val = Val.trunc(DstBitsPerElt);
32673 else if (Val.isNegative())
32674 Val = APInt::getNullValue(DstBitsPerElt);
32675 else
32676 Val = APInt::getAllOnesValue(DstBitsPerElt);
32677 }
32678 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
32679 }
32680 }
32681
32682 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
32683 }
32684
32685 // Attempt to combine as shuffle.
32686 SDValue Op(N, 0);
32687 if (SDValue Res = combineX86ShufflesRecursively(
32688 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
32689 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
32690 DCI.CombineTo(N, Res);
32691 return SDValue();
32692 }
32693
32694 return SDValue();
32695}
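// Scalar model (standalone sketch, not part of this file; the helper names are
// made up) of the per-element constant folding above for i16 -> i8 packs:
// PACKSS saturates into the signed 8-bit range, PACKUS into the unsigned
// 8-bit range.
#include <algorithm>
#include <cassert>
#include <cstdint>
static int8_t packss16to8(int16_t V) {
  return (int8_t)std::clamp<int16_t>(V, -128, 127);
}
static uint8_t packus16to8(int16_t V) {
  return (uint8_t)std::clamp<int16_t>(V, 0, 255);
}
int main() {
  assert(packss16to8(300) == 127 && packss16to8(-300) == -128 && packss16to8(5) == 5);
  assert(packus16to8(300) == 255 && packus16to8(-300) == 0 && packus16to8(5) == 5);
  return 0;
}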
32696
32697static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
32698 TargetLowering::DAGCombinerInfo &DCI,
32699 const X86Subtarget &Subtarget) {
32700 unsigned Opcode = N->getOpcode();
32701 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
32702 X86ISD::VSRLI == Opcode) &&
32703 "Unexpected shift opcode");
32704 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
32705 EVT VT = N->getValueType(0);
32706 SDValue N0 = N->getOperand(0);
32707 SDValue N1 = N->getOperand(1);
32708 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
32709 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
32710 "Unexpected value type");
32711
32712 // Out of range logical bit shifts are guaranteed to be zero.
32713 // Out of range arithmetic bit shifts splat the sign bit.
32714 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
32715 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
32716 if (LogicalShift)
32717 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
32718 else
32719 ShiftVal = NumBitsPerElt - 1;
32720 }
32721
32722 // Shift N0 by zero -> N0.
32723 if (!ShiftVal)
32724 return N0;
32725
32726 // Shift zero -> zero.
32727 if (ISD::isBuildVectorAllZeros(N0.getNode()))
32728 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
32729
32730 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
32731 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
32732 // TODO - support other sra opcodes as needed.
32733 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
32734 N0.getOpcode() == X86ISD::VSRAI)
32735 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
32736
32737 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
32738 if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
32739 N1 == N0.getOperand(1)) {
32740 SDValue N00 = N0.getOperand(0);
32741 unsigned NumSignBits = DAG.ComputeNumSignBits(N00);
32742 if (ShiftVal.ult(NumSignBits))
32743 return N00;
32744 }
32745
32746 // We can decode 'whole byte' logical bit shifts as shuffles.
32747 if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
32748 SDValue Op(N, 0);
32749 if (SDValue Res = combineX86ShufflesRecursively(
32750 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
32751 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
32752 DCI.CombineTo(N, Res);
32753 return SDValue();
32754 }
32755 }
32756
32757 // Constant Folding.
32758 APInt UndefElts;
32759 SmallVector<APInt, 32> EltBits;
32760 if (N->isOnlyUserOf(N0.getNode()) &&
32761 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
32762 assert(EltBits.size() == VT.getVectorNumElements() &&
32763 "Unexpected shift value type");
32764 unsigned ShiftImm = ShiftVal.getZExtValue();
32765 for (APInt &Elt : EltBits) {
32766 if (X86ISD::VSHLI == Opcode)
32767 Elt <<= ShiftImm;
32768 else if (X86ISD::VSRAI == Opcode)
32769 Elt.ashrInPlace(ShiftImm);
32770 else
32771 Elt.lshrInPlace(ShiftImm);
32772 }
32773 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
32774 }
32775
32776 return SDValue();
32777}
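// Scalar model (standalone sketch, not part of this file; the helper names are
// made up) of the out-of-range immediate handling above: logical shifts by an
// amount >= the element width fold to zero, while arithmetic shifts clamp to
// width - 1 and splat the sign bit. Assumes arithmetic >> on signed values
// (implementation-defined in ISO C++).
#include <cassert>
#include <cstdint>
static uint32_t vsrliModel(uint32_t V, unsigned Imm) {
  return Imm >= 32 ? 0u : V >> Imm;
}
static int32_t vsraiModel(int32_t V, unsigned Imm) {
  return V >> (Imm >= 32 ? 31u : Imm);
}
int main() {
  assert(vsrliModel(0xFFFFFFFFu, 40) == 0u);
  assert(vsraiModel(-8, 40) == -1 && vsraiModel(8, 40) == 0);
  return 0;
}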
32778
32779static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
32780 TargetLowering::DAGCombinerInfo &DCI,
32781 const X86Subtarget &Subtarget) {
32782 assert(
32783 ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
32784 (N->getOpcode() == X86ISD::PINSRW &&
32785 N->getValueType(0) == MVT::v8i16)) &&
32786 "Unexpected vector insertion");
32787
32788 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
32789 SDValue Op(N, 0);
32790 if (SDValue Res = combineX86ShufflesRecursively(
32791 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
32792 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
32793 DCI.CombineTo(N, Res);
32794 return SDValue();
32795 }
32796
32797 return SDValue();
32798}
32799
32800/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
32801/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
32802/// OR -> CMPNEQSS.
32803static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
32804 TargetLowering::DAGCombinerInfo &DCI,
32805 const X86Subtarget &Subtarget) {
32806 unsigned opcode;
32807
32808 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
32809 // we're requiring SSE2 for both.
32810 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
32811 SDValue N0 = N->getOperand(0);
32812 SDValue N1 = N->getOperand(1);
32813 SDValue CMP0 = N0->getOperand(1);
32814 SDValue CMP1 = N1->getOperand(1);
32815 SDLoc DL(N);
32816
32817 // The SETCCs should both refer to the same CMP.
32818 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
32819 return SDValue();
32820
32821 SDValue CMP00 = CMP0->getOperand(0);
32822 SDValue CMP01 = CMP0->getOperand(1);
32823 EVT VT = CMP00.getValueType();
32824
32825 if (VT == MVT::f32 || VT == MVT::f64) {
32826 bool ExpectingFlags = false;
32827 // Check for any users that want flags:
32828 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
32829 !ExpectingFlags && UI != UE; ++UI)
32830 switch (UI->getOpcode()) {
32831 default:
32832 case ISD::BR_CC:
32833 case ISD::BRCOND:
32834 case ISD::SELECT:
32835 ExpectingFlags = true;
32836 break;
32837 case ISD::CopyToReg:
32838 case ISD::SIGN_EXTEND:
32839 case ISD::ZERO_EXTEND:
32840 case ISD::ANY_EXTEND:
32841 break;
32842 }
32843
32844 if (!ExpectingFlags) {
32845 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
32846 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
32847
32848 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
32849 X86::CondCode tmp = cc0;
32850 cc0 = cc1;
32851 cc1 = tmp;
32852 }
32853
32854 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
32855 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
32856 // FIXME: need symbolic constants for these magic numbers.
32857 // See X86ATTInstPrinter.cpp:printSSECC().
32858 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
32859 if (Subtarget.hasAVX512()) {
32860 SDValue FSetCC =
32861 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
32862 DAG.getConstant(x86cc, DL, MVT::i8));
32863 return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
32864 FSetCC, DAG.getIntPtrConstant(0, DL));
32865 }
32866 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
32867 CMP00.getValueType(), CMP00, CMP01,
32868 DAG.getConstant(x86cc, DL,
32869 MVT::i8));
32870
32871 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
32872 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
32873
32874 if (is64BitFP && !Subtarget.is64Bit()) {
32875 // On a 32-bit target, we cannot bitcast the 64-bit float to a
32876 // 64-bit integer, since that's not a legal type. Since
32877 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
32878 // bits, but can do this little dance to extract the lowest 32 bits
32879 // and work with those going forward.
32880 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
32881 OnesOrZeroesF);
32882 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
32883 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
32884 Vector32, DAG.getIntPtrConstant(0, DL));
32885 IntVT = MVT::i32;
32886 }
32887
32888 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
32889 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
32890 DAG.getConstant(1, DL, IntVT));
32891 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
32892 ANDed);
32893 return OneBitOfTruth;
32894 }
32895 }
32896 }
32897 }
32898 return SDValue();
32899}
32900
32901/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
32902static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
32903 assert(N->getOpcode() == ISD::AND);
32904
32905 EVT VT = N->getValueType(0);
32906 SDValue N0 = N->getOperand(0);
32907 SDValue N1 = N->getOperand(1);
32908 SDLoc DL(N);
32909
32910 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
32911 return SDValue();
32912
32913 if (N0.getOpcode() == ISD::XOR &&
32914 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
32915 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
32916
32917 if (N1.getOpcode() == ISD::XOR &&
32918 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
32919 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
32920
32921 return SDValue();
32922}
32923
32924// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
32925// register. In most cases we actually compare or select YMM-sized registers
32926// and mixing the two types creates horrible code. This method optimizes
32927// some of the transition sequences.
32928static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
32929 TargetLowering::DAGCombinerInfo &DCI,
32930 const X86Subtarget &Subtarget) {
32931 EVT VT = N->getValueType(0);
32932 if (!VT.is256BitVector())
32933 return SDValue();
32934
32935 assert((N->getOpcode() == ISD::ANY_EXTEND ||
32936 N->getOpcode() == ISD::ZERO_EXTEND ||
32937 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
32938
32939 SDValue Narrow = N->getOperand(0);
32940 EVT NarrowVT = Narrow->getValueType(0);
32941 if (!NarrowVT.is128BitVector())
32942 return SDValue();
32943
32944 if (Narrow->getOpcode() != ISD::XOR &&
32945 Narrow->getOpcode() != ISD::AND &&
32946 Narrow->getOpcode() != ISD::OR)
32947 return SDValue();
32948
32949 SDValue N0 = Narrow->getOperand(0);
32950 SDValue N1 = Narrow->getOperand(1);
32951 SDLoc DL(Narrow);
32952
32953 // The Left side has to be a trunc.
32954 if (N0.getOpcode() != ISD::TRUNCATE)
32955 return SDValue();
32956
32957 // The type of the truncated inputs.
32958 EVT WideVT = N0->getOperand(0)->getValueType(0);
32959 if (WideVT != VT)
32960 return SDValue();
32961
32962 // The right side has to be a 'trunc' or a constant vector.
32963 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
32964 ConstantSDNode *RHSConstSplat = nullptr;
32965 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
32966 RHSConstSplat = RHSBV->getConstantSplatNode();
32967 if (!RHSTrunc && !RHSConstSplat)
32968 return SDValue();
32969
32970 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32971
32972 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
32973 return SDValue();
32974
32975 // Set N0 and N1 to hold the inputs to the new wide operation.
32976 N0 = N0->getOperand(0);
32977 if (RHSConstSplat) {
32978 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
32979 SDValue(RHSConstSplat, 0));
32980 N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
32981 } else if (RHSTrunc) {
32982 N1 = N1->getOperand(0);
32983 }
32984
32985 // Generate the wide operation.
32986 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
32987 unsigned Opcode = N->getOpcode();
32988 switch (Opcode) {
32989 case ISD::ANY_EXTEND:
32990 return Op;
32991 case ISD::ZERO_EXTEND: {
32992 unsigned InBits = NarrowVT.getScalarSizeInBits();
32993 APInt Mask = APInt::getAllOnesValue(InBits);
32994 Mask = Mask.zext(VT.getScalarSizeInBits());
32995 return DAG.getNode(ISD::AND, DL, VT,
32996 Op, DAG.getConstant(Mask, DL, VT));
32997 }
32998 case ISD::SIGN_EXTEND:
32999 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
33000 Op, DAG.getValueType(NarrowVT));
33001 default:
33002 llvm_unreachable("Unexpected opcode");
33003 }
33004}
33005
33006/// If both input operands of a logic op are being cast from floating point
33007/// types, try to convert this into a floating point logic node to avoid
33008/// unnecessary moves from SSE to integer registers.
33009static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
33010 const X86Subtarget &Subtarget) {
33011 unsigned FPOpcode = ISD::DELETED_NODE;
33012 if (N->getOpcode() == ISD::AND)
33013 FPOpcode = X86ISD::FAND;
33014 else if (N->getOpcode() == ISD::OR)
33015 FPOpcode = X86ISD::FOR;
33016 else if (N->getOpcode() == ISD::XOR)
33017 FPOpcode = X86ISD::FXOR;
33018
33019 assert(FPOpcode != ISD::DELETED_NODE &&
33020 "Unexpected input node for FP logic conversion");
33021
33022 EVT VT = N->getValueType(0);
33023 SDValue N0 = N->getOperand(0);
33024 SDValue N1 = N->getOperand(1);
33025 SDLoc DL(N);
33026 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
33027 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
33028 (Subtarget.hasSSE2() && VT == MVT::i64))) {
33029 SDValue N00 = N0.getOperand(0);
33030 SDValue N10 = N1.getOperand(0);
33031 EVT N00Type = N00.getValueType();
33032 EVT N10Type = N10.getValueType();
33033 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
33034 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
33035 return DAG.getBitcast(VT, FPLogic);
33036 }
33037 }
33038 return SDValue();
33039}
33040
33041/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
33042/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
33043/// with a shift-right to eliminate loading the vector constant mask value.
33044static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
33045 const X86Subtarget &Subtarget) {
33046 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
33047 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
33048 EVT VT0 = Op0.getValueType();
33049 EVT VT1 = Op1.getValueType();
33050
33051 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
33052 return SDValue();
33053
33054 APInt SplatVal;
33055 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
33056 !SplatVal.isMask())
33057 return SDValue();
33058
33059 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
33060 return SDValue();
33061
33062 unsigned EltBitWidth = VT0.getScalarSizeInBits();
33063 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
33064 return SDValue();
33065
33066 SDLoc DL(N);
33067 unsigned ShiftVal = SplatVal.countTrailingOnes();
33068 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
33069 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
33070 return DAG.getBitcast(N->getValueType(0), Shift);
33071}
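// Scalar sketch (standalone, not part of this file) of the identity used
// above: when Op0 is known per element to be all-zeros or all-ones, ANDing
// with a mask of K low ones equals a logical shift right by EltBitWidth - K.
// The SETCC + ZEXT lowering is the K == 1 case.
#include <cassert>
#include <cstdint>
int main() {
  const unsigned EltBitWidth = 32, K = 1;
  for (uint32_t V : {0x00000000u, 0xFFFFFFFFu})
    assert((V & ((1u << K) - 1)) == (V >> (EltBitWidth - K)));
  return 0;
}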
33072
33073// Get the index node from the lowered DAG of a GEP IR instruction with one
33074// indexing dimension.
33075static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
33076 if (Ld->isIndexed())
33077 return SDValue();
33078
33079 SDValue Base = Ld->getBasePtr();
33080
33081 if (Base.getOpcode() != ISD::ADD)
33082 return SDValue();
33083
33084 SDValue ShiftedIndex = Base.getOperand(0);
33085
33086 if (ShiftedIndex.getOpcode() != ISD::SHL)
33087 return SDValue();
33088
33089 return ShiftedIndex.getOperand(0);
33090
33091}
33092
33093static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
33094 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
33095 switch (VT.getSizeInBits()) {
33096 default: return false;
33097 case 64: return Subtarget.is64Bit() ? true : false;
33098 case 32: return true;
33099 }
33100 }
33101 return false;
33102}
33103
33104 // This function recognizes cases where the X86 BZHI instruction can replace
33105 // an 'and-load' sequence.
33106 // When an integer value is loaded from an array of constants defined
33107 // as follows:
33108 //
33109 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
33110 //
33111 // and a bitwise AND is then applied to the loaded value and another input,
33112 // the result is equivalent to performing BZHI (zero high bits) on that input
33113 // with the same index as the load.
33114static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
33115 const X86Subtarget &Subtarget) {
33116 MVT VT = Node->getSimpleValueType(0);
33117 SDLoc dl(Node);
33118
33119 // Check if subtarget has BZHI instruction for the node's type
33120 if (!hasBZHI(Subtarget, VT))
33121 return SDValue();
33122
33123 // Try matching the pattern for both operands.
33124 for (unsigned i = 0; i < 2; i++) {
33125 SDValue N = Node->getOperand(i);
33126 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
33127
33128 // Bail out if the operand is not a load instruction.
33129 if (!Ld)
33130 return SDValue();
33131
33132 const Value *MemOp = Ld->getMemOperand()->getValue();
33133
33134 if (!MemOp)
33135 return SDValue();
33136
33137 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
33138 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
33139 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
33140
33141 Constant *Init = GV->getInitializer();
33142 Type *Ty = Init->getType();
33143 if (!isa<ConstantDataArray>(Init) ||
33144 !Ty->getArrayElementType()->isIntegerTy() ||
33145 Ty->getArrayElementType()->getScalarSizeInBits() !=
33146 VT.getSizeInBits() ||
33147 Ty->getArrayNumElements() >
33148 Ty->getArrayElementType()->getScalarSizeInBits())
33149 continue;
33150
33151 // Check if the array's constant elements are suitable to our case.
33152 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
33153 bool ConstantsMatch = true;
33154 for (uint64_t j = 0; j < ArrayElementCount; j++) {
33155 ConstantInt *Elem =
33156 dyn_cast<ConstantInt>(Init->getAggregateElement(j));
33157 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
33158 ConstantsMatch = false;
33159 break;
33160 }
33161 }
33162 if (!ConstantsMatch)
33163 continue;
33164
33165 // Do the transformation (For 32-bit type):
33166 // -> (and (load arr[idx]), inp)
33167 // <- (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
33168 // that will be replaced with one bzhi instruction.
33169 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
33170 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);
33171
33172 // Get the Node which indexes into the array.
33173 SDValue Index = getIndexFromUnindexedLoad(Ld);
33174 if (!Index)
33175 return SDValue();
33176 Index = DAG.getZExtOrTrunc(Index, dl, VT);
33177
33178 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);
33179
33180 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
33181 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
33182
33183 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
33184 }
33185 }
33186 }
33187 }
33188 return SDValue();
33189}
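// Plain-integer model (standalone sketch, not part of this file) of the
// equivalence used above for 32-bit elements: masking with
// array[idx] == (1 << idx) - 1 clears all bits from position idx upward, which
// is exactly what BZHI computes and matches the emitted
// and(inp, srl(-1, 32 - idx)) form. idx == 0 is skipped here because a C++
// shift by 32 would be undefined, whereas the DAG node is well defined.
#include <cassert>
#include <cstdint>
int main() {
  const uint32_t X = 0xDEADBEEFu;
  for (unsigned Idx = 1; Idx < 32; ++Idx) {
    uint32_t MaskFromArray = (1u << Idx) - 1;           // array[Idx]
    uint32_t MaskFromShift = 0xFFFFFFFFu >> (32 - Idx); // srl(allones, 32 - Idx)
    assert((X & MaskFromArray) == (X & MaskFromShift));
  }
  return 0;
}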
33190
33191static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
33192 TargetLowering::DAGCombinerInfo &DCI,
33193 const X86Subtarget &Subtarget) {
33194 EVT VT = N->getValueType(0);
33195
33196 // If this is SSE1 only convert to FAND to avoid scalarization.
33197 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
33198 return DAG.getBitcast(
33199 MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
33200 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
33201 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
33202 }
33203
33204 if (DCI.isBeforeLegalizeOps())
33205 return SDValue();
33206
33207 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
33208 return R;
33209
33210 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
33211 return FPLogic;
33212
33213 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
33214 return R;
33215
33216 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
33217 return ShiftRight;
33218
33219 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
33220 return R;
33221
33222 // Attempt to recursively combine a bitmask AND with shuffles.
33223 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
33224 SDValue Op(N, 0);
33225 if (SDValue Res = combineX86ShufflesRecursively(
33226 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
33227 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
33228 DCI.CombineTo(N, Res);
33229 return SDValue();
33230 }
33231 }
33232
33233 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
33234 if ((VT.getScalarSizeInBits() % 8) == 0 &&
33235 N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
33236 isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
33237 SDValue BitMask = N->getOperand(1);
33238 SDValue SrcVec = N->getOperand(0).getOperand(0);
33239 EVT SrcVecVT = SrcVec.getValueType();
33240
33241 // Check that the constant bitmask masks whole bytes.
33242 APInt UndefElts;
33243 SmallVector<APInt, 64> EltBits;
33244 if (VT == SrcVecVT.getScalarType() &&
33245 N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
33246 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
33247 llvm::all_of(EltBits, [](APInt M) {
33248 return M.isNullValue() || M.isAllOnesValue();
33249 })) {
33250 unsigned NumElts = SrcVecVT.getVectorNumElements();
33251 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
33252 unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
33253
33254 // Create a root shuffle mask from the byte mask and the extracted index.
33255 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
33256 for (unsigned i = 0; i != Scale; ++i) {
33257 if (UndefElts[i])
33258 continue;
33259 int VecIdx = Scale * Idx + i;
33260 ShuffleMask[VecIdx] =
33261 EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
33262 }
33263
33264 if (SDValue Shuffle = combineX86ShufflesRecursively(
33265 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
33266 /*HasVarMask*/ false, DAG, DCI, Subtarget))
33267 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
33268 N->getOperand(0).getOperand(1));
33269 }
33270 }
33271
33272 return SDValue();
33273}
33274
33275// Try to fold:
33276// (or (and (m, y), (pandn m, x)))
33277// into:
33278// (vselect m, x, y)
33279// As a special case, try to fold:
33280// (or (and (m, (sub 0, x)), (pandn m, x)))
33281// into:
33282// (sub (xor X, M), M)
33283static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
33284 const X86Subtarget &Subtarget) {
33285 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
33286
33287 SDValue N0 = N->getOperand(0);
33288 SDValue N1 = N->getOperand(1);
33289 EVT VT = N->getValueType(0);
33290
33291 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
33292 (VT.is256BitVector() && Subtarget.hasInt256())))
33293 return SDValue();
33294
33295 // Canonicalize AND to LHS.
33296 if (N1.getOpcode() == ISD::AND)
33297 std::swap(N0, N1);
33298
33299 // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
33300 // ANDNP combine allows other combines to happen that prevent matching.
33301 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
33302 return SDValue();
33303
33304 SDValue Mask = N1.getOperand(0);
33305 SDValue X = N1.getOperand(1);
33306 SDValue Y;
33307 if (N0.getOperand(0) == Mask)
33308 Y = N0.getOperand(1);
33309 if (N0.getOperand(1) == Mask)
33310 Y = N0.getOperand(0);
33311
33312 // Check to see if the mask appeared in both the AND and ANDNP.
33313 if (!Y.getNode())
33314 return SDValue();
33315
33316 // Validate that X, Y, and Mask are bitcasts, and see through them.
33317 Mask = peekThroughBitcasts(Mask);
33318 X = peekThroughBitcasts(X);
33319 Y = peekThroughBitcasts(Y);
33320
33321 EVT MaskVT = Mask.getValueType();
33322 unsigned EltBits = MaskVT.getScalarSizeInBits();
33323
33324 // TODO: Attempt to handle floating point cases as well?
33325 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
33326 return SDValue();
33327
33328 SDLoc DL(N);
33329
33330 // Try to match:
33331 // (or (and (M, (sub 0, X)), (pandn M, X)))
33332 // which is a special case of vselect:
33333 // (vselect M, (sub 0, X), X)
33334 // Per:
33335 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
33336 // We know that, if fNegate is 0 or 1:
33337 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
33338 //
33339 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
33340 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
33341 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
33342 // This lets us transform our vselect to:
33343 // (add (xor X, M), (and M, 1))
33344 // And further to:
33345 // (sub (xor X, M), M)
33346 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
33347 DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
33348 auto IsNegV = [](SDNode *N, SDValue V) {
33349 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
33350 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
33351 };
33352 SDValue V;
33353 if (IsNegV(Y.getNode(), X))
33354 V = X;
33355 else if (IsNegV(X.getNode(), Y))
33356 V = Y;
33357
33358 if (V) {
33359 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
33360 SDValue SubOp2 = Mask;
33361
33362 // If the negate was on the false side of the select, then
33363 // the operands of the SUB need to be swapped. PR 27251.
33364 // This is because the pattern being matched above is
33365 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
33366 // but if the pattern matched was
33367 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
33368 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
33369 // pattern also needs to be a negation of the replacement pattern above.
33370 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
33371 // sub accomplishes the negation of the replacement pattern.
33372 if (V == Y)
33373 std::swap(SubOp1, SubOp2);
33374
33375 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
33376 return DAG.getBitcast(VT, Res);
33377 }
33378 }
33379
33380 // PBLENDVB is only available on SSE 4.1.
33381 if (!Subtarget.hasSSE41())
33382 return SDValue();
33383
33384 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
33385
33386 X = DAG.getBitcast(BlendVT, X);
33387 Y = DAG.getBitcast(BlendVT, Y);
33388 Mask = DAG.getBitcast(BlendVT, Mask);
33389 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
33390 return DAG.getBitcast(VT, Mask);
33391}
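// Scalar check (standalone sketch, not part of this file) of the conditional
// negate identity cited above: with a mask M that is either 0 or all-ones,
// (M ? -X : X) == (X ^ M) - M, which is the (sub (xor X, M), M) form emitted.
#include <cassert>
#include <cstdint>
int main() {
  for (int32_t X : {0, 1, -7, 12345})
    for (int32_t M : {0, -1})
      assert((M ? -X : X) == (int32_t)((X ^ M) - M));
  return 0;
}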
33392
33393// Helper function for combineOrCmpEqZeroToCtlzSrl
33394// Transforms:
33395// seteq(cmp x, 0)
33396// into:
33397// srl(ctlz x), log2(bitsize(x))
33398// Input pattern is checked by caller.
33399static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
33400 SelectionDAG &DAG) {
33401 SDValue Cmp = Op.getOperand(1);
33402 EVT VT = Cmp.getOperand(0).getValueType();
33403 unsigned Log2b = Log2_32(VT.getSizeInBits());
33404 SDLoc dl(Op);
33405 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
33406 // The result of the shift is true or false, and on X86, the 32-bit
33407 // encoding of shr and lzcnt is more desirable.
33408 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
33409 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
33410 DAG.getConstant(Log2b, dl, VT));
33411 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
33412}
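// Scalar model (standalone sketch, not part of this file) of the transform
// above for 32-bit values: (x == 0) equals ctlz(x) >> 5, because ctlz(0) == 32
// is the only count with bit 5 set. std::countl_zero (C++20) is used instead
// of __builtin_clz since it is defined for a zero input.
#include <bit>
#include <cassert>
#include <cstdint>
int main() {
  for (uint32_t X : {0u, 1u, 12345u, 0x80000000u})
    assert(((unsigned)std::countl_zero(X) >> 5) == (X == 0 ? 1u : 0u));
  return 0;
}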
33413
33414// Try to transform:
33415// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
33416// into:
33417// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
33418// Will also attempt to match more generic cases, eg:
33419// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
33420// Only applies if the target supports the FastLZCNT feature.
33421static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
33422 TargetLowering::DAGCombinerInfo &DCI,
33423 const X86Subtarget &Subtarget) {
33424 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
33425 return SDValue();
33426
33427 auto isORCandidate = [](SDValue N) {
33428 return (N->getOpcode() == ISD::OR && N->hasOneUse());
33429 };
33430
33431 // Check the zero extend is extending to 32-bit or more. The code generated by
33432 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
33433 // instructions to clear the upper bits.
33434 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
33435 !isORCandidate(N->getOperand(0)))
33436 return SDValue();
33437
33438 // Check the node matches: setcc(eq, cmp 0)
33439 auto isSetCCCandidate = [](SDValue N) {
33440 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
33441 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
33442 N->getOperand(1).getOpcode() == X86ISD::CMP &&
33443 isNullConstant(N->getOperand(1).getOperand(1)) &&
33444 N->getOperand(1).getValueType().bitsGE(MVT::i32);
33445 };
33446
33447 SDNode *OR = N->getOperand(0).getNode();
33448 SDValue LHS = OR->getOperand(0);
33449 SDValue RHS = OR->getOperand(1);
33450
33451 // Save nodes matching or(or, setcc(eq, cmp 0)).
33452 SmallVector<SDNode *, 2> ORNodes;
33453 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
33454 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
33455 ORNodes.push_back(OR);
33456 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
33457 LHS = OR->getOperand(0);
33458 RHS = OR->getOperand(1);
33459 }
33460
33461 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
33462 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
33463 !isORCandidate(SDValue(OR, 0)))
33464 return SDValue();
33465
33466 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
33467 // to
33468 // or(srl(ctlz),srl(ctlz)).
33469 // The dag combiner can then fold it into:
33470 // srl(or(ctlz, ctlz)).
33471 EVT VT = OR->getValueType(0);
33472 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
33473 SDValue Ret, NewRHS;
33474 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
33475 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
33476
33477 if (!Ret)
33478 return SDValue();
33479
33480 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
33481 while (ORNodes.size() > 0) {
33482 OR = ORNodes.pop_back_val();
33483 LHS = OR->getOperand(0);
33484 RHS = OR->getOperand(1);
33485 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
33486 if (RHS->getOpcode() == ISD::OR)
33487 std::swap(LHS, RHS);
33488 EVT VT = OR->getValueType(0);
33489 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
33490 if (!NewRHS)
33491 return SDValue();
33492 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
33493 }
33494
33495 if (Ret)
33496 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
33497
33498 return Ret;
33499}
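// Scalar model (standalone sketch, not part of this file) of the combined
// pattern handled above: zext(or(x == 0, y == 0)) equals
// (ctlz(x) | ctlz(y)) >> 5 for 32-bit inputs, since only a zero input produces
// a count with bit 5 set.
#include <bit>
#include <cassert>
#include <cstdint>
int main() {
  for (uint32_t X : {0u, 7u})
    for (uint32_t Y : {0u, 9u}) {
      unsigned Expected = (X == 0 || Y == 0) ? 1u : 0u;
      unsigned Lowered =
          ((unsigned)std::countl_zero(X) | (unsigned)std::countl_zero(Y)) >> 5;
      assert(Lowered == Expected);
    }
  return 0;
}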
33500
33501static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
33502 TargetLowering::DAGCombinerInfo &DCI,
33503 const X86Subtarget &Subtarget) {
33504 SDValue N0 = N->getOperand(0);
33505 SDValue N1 = N->getOperand(1);
33506 EVT VT = N->getValueType(0);
33507
33508 // If this is SSE1 only convert to FOR to avoid scalarization.
33509 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
33510 return DAG.getBitcast(MVT::v4i32,
33511 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
33512 DAG.getBitcast(MVT::v4f32, N0),
33513 DAG.getBitcast(MVT::v4f32, N1)));
33514 }
33515
33516 if (DCI.isBeforeLegalizeOps())
33517 return SDValue();
33518
33519 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
33520 return R;
33521
33522 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
33523 return FPLogic;
33524
33525 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
33526 return R;
33527
33528 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
33529 return SDValue();
33530
33531 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
33532 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
33533
33534 // SHLD/SHRD instructions have lower register pressure, but on some
33535 // platforms they have higher latency than the equivalent
33536 // series of shifts/or that would otherwise be generated.
33537 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
33538 // have higher latencies and we are not optimizing for size.
33539 if (!OptForSize && Subtarget.isSHLDSlow())
33540 return SDValue();
33541
33542 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
33543 std::swap(N0, N1);
33544 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
33545 return SDValue();
33546 if (!N0.hasOneUse() || !N1.hasOneUse())
33547 return SDValue();
33548
33549 SDValue ShAmt0 = N0.getOperand(1);
33550 if (ShAmt0.getValueType() != MVT::i8)
33551 return SDValue();
33552 SDValue ShAmt1 = N1.getOperand(1);
33553 if (ShAmt1.getValueType() != MVT::i8)
33554 return SDValue();
33555 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
33556 ShAmt0 = ShAmt0.getOperand(0);
33557 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
33558 ShAmt1 = ShAmt1.getOperand(0);
33559
33560 SDLoc DL(N);
33561 unsigned Opc = X86ISD::SHLD;
33562 SDValue Op0 = N0.getOperand(0);
33563 SDValue Op1 = N1.getOperand(0);
33564 if (ShAmt0.getOpcode() == ISD::SUB ||
33565 ShAmt0.getOpcode() == ISD::XOR) {
33566 Opc = X86ISD::SHRD;
33567 std::swap(Op0, Op1);
33568 std::swap(ShAmt0, ShAmt1);
33569 }
33570
33571 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
33572 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
33573 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
33574 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
33575 unsigned Bits = VT.getSizeInBits();
33576 if (ShAmt1.getOpcode() == ISD::SUB) {
33577 SDValue Sum = ShAmt1.getOperand(0);
33578 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
33579 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
33580 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
33581 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
33582 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
33583 return DAG.getNode(Opc, DL, VT,
33584 Op0, Op1,
33585 DAG.getNode(ISD::TRUNCATE, DL,
33586 MVT::i8, ShAmt0));
33587 }
33588 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
33589 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
33590 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
33591 return DAG.getNode(Opc, DL, VT,
33592 N0.getOperand(0), N1.getOperand(0),
33593 DAG.getNode(ISD::TRUNCATE, DL,
33594 MVT::i8, ShAmt0));
33595 } else if (ShAmt1.getOpcode() == ISD::XOR) {
33596 SDValue Mask = ShAmt1.getOperand(1);
33597 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
33598 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
33599 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
33600 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
33601 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
33602 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
33603 if (Op1.getOpcode() == InnerShift &&
33604 isa<ConstantSDNode>(Op1.getOperand(1)) &&
33605 Op1.getConstantOperandVal(1) == 1) {
33606 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
33607 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
33608 }
33609 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
33610 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
33611 Op1.getOperand(0) == Op1.getOperand(1)) {
33612 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
33613 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
33614 }
33615 }
33616 }
33617 }
33618
33619 return SDValue();
33620}
33621
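The xor(C, 31) form matched near the end of combineOr is how the DAG expresses a shift by 32 - C without an undefined shift-by-32 when C == 0: srl(srl(Y, 1), xor(C, 31)) equals srl(Y, 32 - C) for C in [1, 31] and is 0 for C == 0. Below is a standalone C++ check of that equivalence; ShldRef is an illustrative reference that assumes the usual 32-bit double-shift semantics, where a zero count leaves the high operand unchanged.

#include <cassert>
#include <cstdint>

// Reference result of a 32-bit double shift left for counts 0..31.
static uint32_t ShldRef(uint32_t Hi, uint32_t Lo, unsigned C) {
  return C == 0 ? Hi : (Hi << C) | (Lo >> (32 - C));
}

int main() {
  const uint32_t X = 0x12345678u, Y = 0x9abcdef0u;
  for (unsigned C = 0; C < 32; ++C) {
    // The matched form: or(shl(X, C), srl(srl(Y, 1), xor(C, 31))).
    uint32_t XorForm = (X << C) | ((Y >> 1) >> (C ^ 31));
    assert(XorForm == ShldRef(X, Y, C));
  }
  return 0;
}
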
33622/// Try to turn tests against the signbit in the form of:
33623/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
33624/// into:
33625/// SETGT(X, -1)
33626static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
33627 // This is only worth doing if the output type is i8 or i1.
33628 EVT ResultType = N->getValueType(0);
33629 if (ResultType != MVT::i8 && ResultType != MVT::i1)
33630 return SDValue();
33631
33632 SDValue N0 = N->getOperand(0);
33633 SDValue N1 = N->getOperand(1);
33634
33635 // We should be performing an xor against a truncated shift.
33636 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
33637 return SDValue();
33638
33639 // Make sure we are performing an xor against one.
33640 if (!isOneConstant(N1))
33641 return SDValue();
33642
33643 // SetCC on x86 zero extends, so only act on this if it's a logical shift.
33644 SDValue Shift = N0.getOperand(0);
33645 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
33646 return SDValue();
33647
33648 // Make sure we are truncating from one of i16, i32 or i64.
33649 EVT ShiftTy = Shift.getValueType();
33650 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
33651 return SDValue();
33652
33653 // Make sure the shift amount extracts the sign bit.
33654 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
33655 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
33656 return SDValue();
33657
33658 // Create a greater-than comparison against -1.
33659 // N.B. Using SETGE against 0 works but we want a canonical-looking
33660 // comparison, and using SETGT matches up with what TranslateX86CC produces.
33661 SDLoc DL(N);
33662 SDValue ShiftOp = Shift.getOperand(0);
33663 EVT ShiftOpTy = ShiftOp.getValueType();
33664 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33665 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
33666 *DAG.getContext(), ResultType);
33667 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
33668 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
33669 if (SetCCResultType != ResultType)
33670 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
33671 return Cond;
33672}
33673
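As a scalar model of this fold: inverting the truncated sign bit of X gives the same value as testing X > -1. A minimal standalone C++ check follows (illustrative only, not part of the lowering code).

#include <cassert>
#include <cstdint>

int main() {
  const int32_t Tests[] = {INT32_MIN, -2, -1, 0, 1, INT32_MAX};
  for (int32_t X : Tests) {
    // xor(trunc(srl(X, 31)), 1): extract the sign bit and invert it.
    uint8_t XorForm = (uint8_t)((uint32_t)X >> 31) ^ 1;
    // setgt(X, -1): true exactly when the sign bit is clear.
    uint8_t SetGT = (uint8_t)(X > -1);
    assert(XorForm == SetGT);
  }
  return 0;
}
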
33674/// Turn vector tests of the signbit in the form of:
33675/// xor (sra X, elt_size(X)-1), -1
33676/// into:
33677/// pcmpgt X, -1
33678///
33679/// This should be called before type legalization because the pattern may not
33680/// persist after that.
33681static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
33682 const X86Subtarget &Subtarget) {
33683 EVT VT = N->getValueType(0);
33684 if (!VT.isSimple())
33685 return SDValue();
33686
33687 switch (VT.getSimpleVT().SimpleTy) {
33688 default: return SDValue();
33689 case MVT::v16i8:
33690 case MVT::v8i16:
33691 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
33692 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
33693 case MVT::v32i8:
33694 case MVT::v16i16:
33695 case MVT::v8i32:
33696 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
33697 }
33698
33699 // There must be an arithmetic shift right (sra) before the xor, and the xor
33700 // must be a 'not' operation.
33701 SDValue Shift = N->getOperand(0);
33702 SDValue Ones = N->getOperand(1);
33703 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
33704 !ISD::isBuildVectorAllOnes(Ones.getNode()))
33705 return SDValue();
33706
33707 // The shift should be smearing the sign bit across each vector element.
33708 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
33709 if (!ShiftBV)
33710 return SDValue();
33711
33712 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
33713 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
33714 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
33715 return SDValue();
33716
33717 // Create a greater-than comparison against -1. We don't use the more obvious
33718 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
33719 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
33720}
33721
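Per element, the fold says that a 'not' of a sign-smearing arithmetic shift is the same as a compare against -1. A standalone scalar sketch is below; it relies on '>>' of a signed value being an arithmetic shift, which C++20 guarantees and which x86 compilers implement.

#include <cassert>
#include <cstdint>

int main() {
  const int32_t Tests[] = {INT32_MIN, -7, -1, 0, 1, INT32_MAX};
  for (int32_t X : Tests) {
    // xor(sra(X, 31), -1): X >> 31 smears the sign bit across the element,
    // and xor with all-ones is a bitwise not.
    int32_t XorForm = ~(X >> 31);
    // pcmpgt(X, -1): all-ones when X > -1, all-zeros otherwise.
    int32_t Pcmpgt = (X > -1) ? -1 : 0;
    assert(XorForm == Pcmpgt);
  }
  return 0;
}
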
33722 /// Check if truncation with saturation from type \p SrcVT to \p DstVT
33723/// is valid for the given \p Subtarget.
33724static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
33725 const X86Subtarget &Subtarget) {
33726 if (!Subtarget.hasAVX512())
33727 return false;
33728
33729 // FIXME: Scalar type may be supported if we move it to vector register.
33730 if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
33731 return false;
33732
33733 EVT SrcElVT = SrcVT.getScalarType();
33734 EVT DstElVT = DstVT.getScalarType();
33735 if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
33736 return false;
33737 if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
33738 return false;
33739 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
33740 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
33741 return false;
33742}
33743
33744/// Detect a pattern of truncation with saturation:
33745/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
33746/// Return the source value to be truncated or SDValue() if the pattern was not
33747/// matched.
33748static SDValue detectUSatPattern(SDValue In, EVT VT) {
33749 if (In.getOpcode() != ISD::UMIN)
33750 return SDValue();
33751
33752 // Saturation with truncation. We truncate from InVT to VT.
33753 assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
33754        "Unexpected types for truncate operation");
33755
33756 APInt C;
33757 if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
33758 // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
33759 // the element size of the destination type.
33760 return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
33761 SDValue();
33762 }
33763 return SDValue();
33764}
33765
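The matched pattern is the scalar identity that clamping with umin against the destination's unsigned maximum and then truncating is an unsigned-saturating truncation. A standalone C++ check for a 16-bit source and an 8-bit destination (illustrative only):

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X <= 0xffff; ++X) {
    // truncate(umin(x, UINT8_MAX)) ...
    uint8_t Pattern = (uint8_t)std::min<uint32_t>(X, 0xff);
    // ... equals an unsigned-saturating truncation to i8.
    uint8_t Saturated = X > 0xff ? (uint8_t)0xff : (uint8_t)X;
    assert(Pattern == Saturated);
  }
  return 0;
}
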
33766/// Detect a pattern of truncation with saturation:
33767/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
33768 /// The types should allow using the VPMOVUS* instructions on AVX512.
33769/// Return the source value to be truncated or SDValue() if the pattern was not
33770/// matched.
33771static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
33772 const X86Subtarget &Subtarget) {
33773 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
33774 return SDValue();
33775 return detectUSatPattern(In, VT);
33776}
33777
33778static SDValue
33779combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
33780 const X86Subtarget &Subtarget) {
33781 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33782 if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
33783 return SDValue();
33784 if (auto USatVal = detectUSatPattern(In, VT))
33785 if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
33786 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
33787 return SDValue();
33788}
33789
33790/// This function detects the AVG pattern between vectors of unsigned i8/i16,
33791 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
33792/// X86ISD::AVG instruction.
33793static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
33794 const X86Subtarget &Subtarget,
33795 const SDLoc &DL) {
33796 if (!VT.isVector() || !VT.isSimple())
33797 return SDValue();
33798 EVT InVT = In.getValueType();
33799 unsigned NumElems = VT.getVectorNumElements();
33800
33801 EVT ScalarVT = VT.getVectorElementType();
33802 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
33803 isPowerOf2_32(NumElems)))
33804 return SDValue();
33805
33806 // InScalarVT is the intermediate type in the AVG pattern and it should be wider
33807 // than the original input type (i8/i16).
33808 EVT InScalarVT = InVT.getVectorElementType();
33809 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
33810 return SDValue();
33811
33812 if (!Subtarget.hasSSE2())
33813 return SDValue();
33814 if (Subtarget.hasBWI()) {
33815 if (VT.getSizeInBits() > 512)
33816 return SDValue();
33817 } else if (Subtarget.hasAVX2()) {
33818 if (VT.getSizeInBits() > 256)
33819 return SDValue();
33820 } else {
33821 if (VT.getSizeInBits() > 128)
33822 return SDValue();
33823 }
33824
33825 // Detect the following pattern:
33826 //
33827 // %1 = zext <N x i8> %a to <N x i32>
33828 // %2 = zext <N x i8> %b to <N x i32>
33829 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
33830 // %4 = add nuw nsw <N x i32> %3, %2
33831 // %5 = lshr <N x i32> %4, <i32 1 x N>
33832 // %6 = trunc <N x i32> %5 to <N x i8>
33833 //
33834 // In AVX512, the last instruction can also be a trunc store.
33835
33836 if (In.getOpcode() != ISD::SRL)
33837 return SDValue();
33838
33839 // A lambda checking that the given SDValue is a constant vector and each element
33840 // is in the range [Min, Max].
33841 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
33842 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
33843 if (!BV || !BV->isConstant())
33844 return false;
33845 for (SDValue Op : V->ops()) {
33846 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
33847 if (!C)
33848 return false;
33849 uint64_t Val = C->getZExtValue();
33850 if (Val < Min || Val > Max)
33851 return false;
33852 }
33853 return true;
33854 };
33855
33856 // Check if each element of the vector is logically shifted right by one.
33857 auto LHS = In.getOperand(0);
33858 auto RHS = In.getOperand(1);
33859 if (!IsConstVectorInRange(RHS, 1, 1))
33860 return SDValue();
33861 if (LHS.getOpcode() != ISD::ADD)
33862 return SDValue();
33863
33864 // Detect a pattern of a + b + 1 where the order doesn't matter.
33865 SDValue Operands[3];
33866 Operands[0] = LHS.getOperand(0);
33867 Operands[1] = LHS.getOperand(1);
33868
33869 // Take care of the case when one of the operands is a constant vector whose
33870 // element is in the range [1, 256].
33871 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
33872 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
33873 Operands[0].getOperand(0).getValueType() == VT) {
33874 // The pattern is detected. Subtract one from the constant vector, then
33875 // demote it and emit X86ISD::AVG instruction.
33876 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
33877 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
33878 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
33879 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
33880 Operands[1]);
33881 }
33882
33883 if (Operands[0].getOpcode() == ISD::ADD)
33884 std::swap(Operands[0], Operands[1]);
33885 else if (Operands[1].getOpcode() != ISD::ADD)
33886 return SDValue();
33887 Operands[2] = Operands[1].getOperand(0);
33888 Operands[1] = Operands[1].getOperand(1);
33889
33890 // Now we have three operands of two additions. Check that one of them is a
33891 // constant vector with ones, and the other two are promoted from i8/i16.
33892 for (int i = 0; i < 3; ++i) {
33893 if (!IsConstVectorInRange(Operands[i], 1, 1))
33894 continue;
33895 std::swap(Operands[i], Operands[2]);
33896
33897 // Check if Operands[0] and Operands[1] are results of type promotion.
33898 for (int j = 0; j < 2; ++j)
33899 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
33900 Operands[j].getOperand(0).getValueType() != VT)
33901 return SDValue();
33902
33903 // The pattern is detected, emit X86ISD::AVG instruction.
33904 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
33905 Operands[1].getOperand(0));
33906 }
33907
33908 return SDValue();
33909}
33910
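The wider intermediate type in the pattern exists only so that a + b + 1 cannot overflow; the rounded-up average itself fits in the original element type, which is why X86ISD::AVG can compute it directly. A standalone C++ check for the i8 case, comparing the matched pattern against an overflow-free formulation (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t A = 0; A <= 255; ++A)
    for (uint32_t B = 0; B <= 255; ++B) {
      // The matched pattern: zext to i32, add, add 1, lshr 1, truncate.
      uint8_t Pattern = (uint8_t)((A + B + 1) >> 1);
      // The same rounded-up average computed without widening.
      uint8_t Avg = (uint8_t)(A / 2 + B / 2 + ((A | B) & 1));
      assert(Pattern == Avg);
    }
  return 0;
}
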
33911static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
33912 TargetLowering::DAGCombinerInfo &DCI,
33913 const X86Subtarget &Subtarget) {
33914 LoadSDNode *Ld = cast<LoadSDNode>(N);
33915 EVT RegVT = Ld->getValueType(0);
33916 EVT MemVT = Ld->getMemoryVT();
33917 SDLoc dl(Ld);
33918 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33919
33920 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
33921 // into two 16-byte operations. Also split non-temporal aligned loads on
33922 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
33923 ISD::LoadExtType Ext = Ld->getExtensionType();
33924 bool Fast;
33925 unsigned AddressSpace = Ld->getAddressSpace();
33926 unsigned Alignment = Ld->getAlignment();
33927 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
33928 Ext == ISD::NON_EXTLOAD &&
33929 ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
33930 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
33931 AddressSpace, Alignment, &Fast) && !Fast))) {
33932 unsigned NumElems = RegVT.getVectorNumElements();
33933 if (NumElems < 2)
33934 return SDValue();
33935
33936 SDValue Ptr = Ld->getBasePtr();
33937
33938 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
33939 NumElems/2);
33940 SDValue Load1 =
33941 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
33942 Alignment, Ld->getMemOperand()->getFlags());
33943
33944 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
33945 SDValue Load2 =
33946 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
33947 std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
33948 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33949 Load1.getValue(1),
33950 Load2.getValue(1));
33951
33952 SDValue NewVec = DAG.getUNDEF(RegVT);
33953 NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
33954 NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
33955 return DCI.CombineTo(N, NewVec, TF, true);
33956 }
33957
33958 return SDValue();
33959}
33960
33961/// If V is a build vector of boolean constants and exactly one of those
33962/// constants is true, return the operand index of that true element.
33963/// Otherwise, return -1.
33964static int getOneTrueElt(SDValue V) {
33965 // This needs to be a build vector of booleans.
33966 // TODO: Checking for the i1 type matches the IR definition for the mask,
33967 // but the mask check could be loosened to i8 or other types. That might
33968 // also require checking more than 'allOnesValue'; e.g., the x86 HW
33969 // instructions only require that the MSB is set for each mask element.
33970 // The ISD::MSTORE comments/definition do not specify how the mask operand
33971 // is formatted.
33972 auto *BV = dyn_cast<BuildVectorSDNode>(V);
33973 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
33974 return -1;
33975
33976 int TrueIndex = -1;
33977 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
33978 for (unsigned i = 0; i < NumElts; ++i) {
33979 const SDValue &Op = BV->getOperand(i);
33980 if (Op.isUndef())
33981 continue;
33982 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
33983 if (!ConstNode)
33984 return -1;
33985 if (ConstNode->getAPIntValue().isAllOnesValue()) {
33986 // If we already found a one, this is too many.
33987 if (TrueIndex >= 0)
33988 return -1;
33989 TrueIndex = i;
33990 }
33991 }
33992 return TrueIndex;
33993}
33994
33995 /// Given a masked memory load/store operation, return true if it has exactly
33996 /// one mask bit set. If so, also return the memory address of
33997/// the scalar element to load/store, the vector index to insert/extract that
33998/// scalar element, and the alignment for the scalar memory access.
33999static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
34000 SelectionDAG &DAG, SDValue &Addr,
34001 SDValue &Index, unsigned &Alignment) {
34002 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
34003 if (TrueMaskElt < 0)
34004 return false;
34005
34006 // Get the address of the one scalar element that is specified by the mask
34007 // using the appropriate offset from the base pointer.
34008 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
34009 Addr = MaskedOp->getBasePtr();
34010 if (TrueMaskElt != 0) {
34011 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
34012 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
34013 }
34014
34015 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
34016 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
34017 return true;
34018}
34019
34020/// If exactly one element of the mask is set for a non-extending masked load,
34021 /// it can be reduced to a scalar load and a vector insert.
34022/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
34023/// mask have already been optimized in IR, so we don't bother with those here.
34024static SDValue
34025reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
34026 TargetLowering::DAGCombinerInfo &DCI) {
34027 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
34028 // However, some target hooks may need to be added to know when the transform
34029 // is profitable. Endianness would also have to be considered.
34030
34031 SDValue Addr, VecIndex;
34032 unsigned Alignment;
34033 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
34034 return SDValue();
34035
34036 // Load the one scalar element that is specified by the mask using the
34037 // appropriate offset from the base pointer.
34038 SDLoc DL(ML);
34039 EVT VT = ML->getValueType(0);
34040 EVT EltVT = VT.getVectorElementType();
34041 SDValue Load =
34042 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
34043 Alignment, ML->getMemOperand()->getFlags());
34044
34045 // Insert the loaded element into the appropriate place in the vector.
34046 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
34047 Load, VecIndex);
34048 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
34049}
34050
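A small standalone model of why the reduction is sound: with exactly one mask bit set, the masked-load result is the pass-through vector with that single element replaced by a scalar load. Plain arrays stand in for vectors and memory here; none of this is DAG code.

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  const std::array<int32_t, 4> Mem = {10, 20, 30, 40};
  const std::array<int32_t, 4> Src0 = {-1, -2, -3, -4};
  const std::array<bool, 4> Mask = {false, false, true, false}; // one bit set

  // Masked-load semantics: take Mem where the mask is set, Src0 elsewhere.
  std::array<int32_t, 4> MaskedLoad;
  for (unsigned I = 0; I < 4; ++I)
    MaskedLoad[I] = Mask[I] ? Mem[I] : Src0[I];

  // The reduced form: one scalar load plus an insert into the pass-through.
  std::array<int32_t, 4> ScalarThenInsert = Src0;
  ScalarThenInsert[2] = Mem[2];

  assert(MaskedLoad == ScalarThenInsert);
  return 0;
}
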
34051static SDValue
34052combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
34053 TargetLowering::DAGCombinerInfo &DCI) {
34054 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
34055 return SDValue();
34056
34057 SDLoc DL(ML);
34058 EVT VT = ML->getValueType(0);
34059
34060 // If we are loading the first and last elements of a vector, it is safe and
34061 // always faster to load the whole vector. Replace the masked load with a
34062 // vector load and select.
34063 unsigned NumElts = VT.getVectorNumElements();
34064 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
34065 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
34066 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
34067 if (LoadFirstElt && LoadLastElt) {
34068 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
34069 ML->getMemOperand());
34070 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
34071 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
34072 }
34073
34074 // Convert a masked load with a constant mask into a masked load and a select.
34075 // This allows the select operation to use a faster kind of select instruction
34076 // (for example, vblendvps -> vblendps).
34077
34078 // Don't try this if the pass-through operand is already undefined. That would
34079 // cause an infinite loop because that's what we're about to create.
34080 if (ML->getSrc0().isUndef())
34081 return SDValue();
34082
34083 // The new masked load has an undef pass-through operand. The select uses the
34084 // original pass-through operand.
34085 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
34086 ML->getMask(), DAG.getUNDEF(VT),
34087 ML->getMemoryVT(), ML->getMemOperand(),
34088 ML->getExtensionType());
34089 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
34090
34091 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
34092}
34093
34094static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
34095 TargetLowering::DAGCombinerInfo &DCI,
34096 const X86Subtarget &Subtarget) {
34097 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
34098
34099 // TODO: Expanding load with constant mask may be optimized as well.
34100 if (Mld->isExpandingLoad())
34101 return SDValue();
34102
34103 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
34104 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
34105 return ScalarLoad;
34106 // TODO: Do some AVX512 subsets benefit from this transform?
34107 if (!Subtarget.hasAVX512())
34108 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
34109 return Blend;
34110 }
34111
34112 if (Mld->getExtensionType() != ISD::SEXTLOAD)
34113 return SDValue();
34114
34115 // Resolve extending loads.
34116 EVT VT = Mld->getValueType(0);
34117 unsigned NumElems = VT.getVectorNumElements();
34118 EVT LdVT = Mld->getMemoryVT();
34119 SDLoc dl(Mld);
34120
34121 assert(LdVT != VT && "Cannot extend to the same type");
34122 unsigned ToSz = VT.getScalarSizeInBits();
34123 unsigned FromSz = LdVT.getScalarSizeInBits();
34124 // From/To sizes and ElemCount must be pow of two.
34125 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
34126        "Unexpected size for extending masked load");
34127
34128 unsigned SizeRatio = ToSz / FromSz;
34129 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
34130
34131 // Create a type on which we perform the shuffle.
34132 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
34133 LdVT.getScalarType(), NumElems*SizeRatio);
34134 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
34135
34136 // Convert Src0 value.
34137 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
34138 if (!Mld->getSrc0().isUndef()) {
34139 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
34140 for (unsigned i = 0; i != NumElems; ++i)
34141 ShuffleVec[i] = i * SizeRatio;
34142
34143 // Can't shuffle using an illegal type.
34144 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
34145        "WideVecVT should be legal");
34146 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
34147 DAG.getUNDEF(WideVecVT), ShuffleVec);
34148 }
34149
34150 // Prepare the new mask.
34151 SDValue NewMask;
34152 SDValue Mask = Mld->getMask();
34153 if (Mask.getValueType() == VT) {
34154 // Mask and original value have the same type.
34155 NewMask = DAG.getBitcast(WideVecVT, Mask);
34156 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
34157 for (unsigned i = 0; i != NumElems; ++i)
34158 ShuffleVec[i] = i * SizeRatio;
34159 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
34160 ShuffleVec[i] = NumElems * SizeRatio;
34161 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
34162 DAG.getConstant(0, dl, WideVecVT),
34163 ShuffleVec);
34164 } else {
34165 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
34166 unsigned WidenNumElts = NumElems*SizeRatio;
34167 unsigned MaskNumElts = VT.getVectorNumElements();
34168 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
34169 WidenNumElts);
34170
34171 unsigned NumConcat = WidenNumElts / MaskNumElts;
34172 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
34173 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
34174 Ops[0] = Mask;
34175 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
34176 }
34177
34178 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
34179 Mld->getBasePtr(), NewMask, WideSrc0,
34180 Mld->getMemoryVT(), Mld->getMemOperand(),
34181 ISD::NON_EXTLOAD);
34182 SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
34183 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
34184}
34185
34186/// If exactly one element of the mask is set for a non-truncating masked store,
34187 /// it can be reduced to a vector extract and a scalar store.
34188/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
34189/// mask have already been optimized in IR, so we don't bother with those here.
34190static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
34191 SelectionDAG &DAG) {
34192 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
34193 // However, some target hooks may need to be added to know when the transform
34194 // is profitable. Endianness would also have to be considered.
34195
34196 SDValue Addr, VecIndex;
34197 unsigned Alignment;
34198 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
34199 return SDValue();
34200
34201 // Extract the one scalar element that is actually being stored.
34202 SDLoc DL(MS);
34203 EVT VT = MS->getValue().getValueType();
34204 EVT EltVT = VT.getVectorElementType();
34205 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
34206 MS->getValue(), VecIndex);
34207
34208 // Store that element at the appropriate offset from the base pointer.
34209 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
34210 Alignment, MS->getMemOperand()->getFlags());
34211}
34212
34213static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
34214 const X86Subtarget &Subtarget) {
34215 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
34216
34217 if (Mst->isCompressingStore())
34218 return SDValue();
34219
34220 if (!Mst->isTruncatingStore()) {
34221 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
34222 return ScalarStore;
34223
34224 // If the mask is checking (0 > X), we're creating a vector with all-zeros
34225 // or all-ones elements based on the sign bits of X. AVX1 masked store only
34226 // cares about the sign bit of each mask element, so eliminate the compare:
34227 // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
34228 // Note that by waiting to match an x86-specific PCMPGT node, we're
34229 // eliminating potentially more complex matching of a setcc node which has
34230 // a full range of predicates.
34231 SDValue Mask = Mst->getMask();
34232 if (Mask.getOpcode() == X86ISD::PCMPGT &&
34233 ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
34234 assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
34235        "Unexpected type for PCMPGT");
34236 return DAG.getMaskedStore(
34237 Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
34238 Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
34239 }
34240
34241 // TODO: AVX512 targets should also be able to simplify something like the
34242 // pattern above, but that pattern will be different. It will either need to
34243 // match setcc more generally or match PCMPGTM later (in tablegen?).
34244
34245 return SDValue();
34246 }
34247
34248 // Resolve truncating stores.
34249 EVT VT = Mst->getValue().getValueType();
34250 unsigned NumElems = VT.getVectorNumElements();
34251 EVT StVT = Mst->getMemoryVT();
34252 SDLoc dl(Mst);
34253
34254 assert(StVT != VT && "Cannot truncate to the same type");
34255 unsigned FromSz = VT.getScalarSizeInBits();
34256 unsigned ToSz = StVT.getScalarSizeInBits();
34257
34258 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34259
34260 // The truncating store is legal in some cases. For example
34261 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
34262 // are designed for truncating stores.
34263 // In that case we don't need any further transformations.
34264 if (TLI.isTruncStoreLegal(VT, StVT))
34265 return SDValue();
34266
34267 // From/To sizes and ElemCount must be pow of two.
34268 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
34269        "Unexpected size for truncating masked store");
34270 // We are going to use the original vector elt for storing.
34271 // Accumulated smaller vector elements must be a multiple of the store size.
34272 assert (((NumElems * FromSz) % ToSz) == 0 &&
34273        "Unexpected ratio for truncating masked store");
34274
34275 unsigned SizeRatio = FromSz / ToSz;
34276 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
34277
34278 // Create a type on which we perform the shuffle.
34279 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
34280 StVT.getScalarType(), NumElems*SizeRatio);
34281
34282 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
34283
34284 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
34285 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
34286 for (unsigned i = 0; i != NumElems; ++i)
34287 ShuffleVec[i] = i * SizeRatio;
34288
34289 // Can't shuffle using an illegal type.
34290 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
34291        "WideVecVT should be legal");
34292
34293 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
34294 DAG.getUNDEF(WideVecVT),
34295 ShuffleVec);
34296
34297 SDValue NewMask;
34298 SDValue Mask = Mst->getMask();
34299 if (Mask.getValueType() == VT) {
34300 // Mask and original value have the same type.
34301 NewMask = DAG.getBitcast(WideVecVT, Mask);
34302 for (unsigned i = 0; i != NumElems; ++i)
34303 ShuffleVec[i] = i * SizeRatio;
34304 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
34305 ShuffleVec[i] = NumElems*SizeRatio;
34306 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
34307 DAG.getConstant(0, dl, WideVecVT),
34308 ShuffleVec);
34309 } else {
34310 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
34311 unsigned WidenNumElts = NumElems*SizeRatio;
34312 unsigned MaskNumElts = VT.getVectorNumElements();
34313 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
34314 WidenNumElts);
34315
34316 unsigned NumConcat = WidenNumElts / MaskNumElts;
34317 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
34318 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
34319 Ops[0] = Mask;
34320 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
34321 }
34322
34323 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
34324 Mst->getBasePtr(), NewMask, StVT,
34325 Mst->getMemOperand(), false);
34326}
34327
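The widening shuffle built for the truncating masked store picks element i * SizeRatio of the bitcast value, which on a little-endian target such as x86 is exactly the low (truncated) part of each original element. A standalone C++ sketch of the i32 to i8 case (SizeRatio == 4), using memcpy as the bitcast; this is illustrative code, not part of the lowering.

#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // A <4 x i32> value that is to be truncated to <4 x i8>.
  const std::array<uint32_t, 4> Wide = {0x11223344u, 0x55667788u, 0x99aabbccu,
                                        0xddeeff01u};

  // Bitcast to <16 x i8>; on a little-endian target, byte 4 * i is the low
  // byte of element i.
  std::array<uint8_t, 16> Bytes;
  std::memcpy(Bytes.data(), Wide.data(), sizeof(Wide));

  // The shuffle picks indices i * SizeRatio into the low part of the vector.
  std::array<uint8_t, 4> Packed;
  for (unsigned I = 0; I < 4; ++I)
    Packed[I] = Bytes[I * 4];

  // That matches the element-wise truncation the store needs to perform.
  for (unsigned I = 0; I < 4; ++I)
    assert(Packed[I] == (uint8_t)Wide[I]);
  return 0;
}
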
34328static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
34329 const X86Subtarget &Subtarget) {
34330 StoreSDNode *St = cast<StoreSDNode>(N);
34331 EVT VT = St->getValue().getValueType();
34332 EVT StVT = St->getMemoryVT();
34333 SDLoc dl(St);
34334 SDValue StoredVal = St->getOperand(1);
34335 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34336
34337 // If we are saving a concatenation of two XMM registers and 32-byte stores
34338 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
34339 bool Fast;
34340 unsigned AddressSpace = St->getAddressSpace();
34341 unsigned Alignment = St->getAlignment();
34342 if (VT.is256BitVector() && StVT == VT &&
34343 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
34344 AddressSpace, Alignment, &Fast) &&
34345 !Fast) {
34346 unsigned NumElems = VT.getVectorNumElements();
34347 if (NumElems < 2)
34348 return SDValue();
34349
34350 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
34351 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
34352
34353 SDValue Ptr0 = St->getBasePtr();
34354 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
34355
34356 SDValue Ch0 =
34357 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
34358 Alignment, St->getMemOperand()->getFlags());
34359 SDValue Ch1 =
34360 DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
34361 std::min(16U, Alignment), St->getMemOperand()->getFlags());
34362 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
34363 }
34364
34365 // Optimize trunc store (of multiple scalars) to shuffle and store.
34366 // First, pack all of the elements in one place. Next, store to memory
34367 // in fewer chunks.
34368 if (St->isTruncatingStore() && VT.isVector()) {
34369 // Check if we can detect an AVG pattern from the truncation. If yes,
34370 // replace the trunc store with a normal store of the result of the
34371 // X86ISD::AVG instruction.
34372 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
34373 Subtarget, dl))
34374 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
34375 St->getPointerInfo(), St->getAlignment(),
34376 St->getMemOperand()->getFlags());
34377
34378 if (SDValue Val =
34379 detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
34380 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
34381 dl, Val, St->getBasePtr(),
34382 St->getMemoryVT(), St->getMemOperand(), DAG);
34383
34384 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34385 unsigned NumElems = VT.getVectorNumElements();
34386 assert(StVT != VT && "Cannot truncate to the same type");
34387 unsigned FromSz = VT.getScalarSizeInBits();
34388 unsigned ToSz = StVT.getScalarSizeInBits();
34389
34390 // The truncating store is legal in some cases. For example
34391 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
34392 // are designed for truncating stores.
34393 // In that case we don't need any further transformations.
34394 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
34395 return SDValue();
34396
34397 // From, To sizes and ElemCount must be pow of two
34398 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
34399 // We are going to use the original vector elt for storing.
34400 // Accumulated smaller vector elements must be a multiple of the store size.
34401 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
34402
34403 unsigned SizeRatio = FromSz / ToSz;
34404
34405 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
34406
34407 // Create a type on which we perform the shuffle
34408 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
34409 StVT.getScalarType(), NumElems*SizeRatio);
34410
34411 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
34412
34413 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
34414 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
34415 for (unsigned i = 0; i != NumElems; ++i)
34416 ShuffleVec[i] = i * SizeRatio;
34417
34418 // Can't shuffle using an illegal type.
34419 if (!TLI.isTypeLegal(WideVecVT))
34420 return SDValue();
34421
34422 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
34423 DAG.getUNDEF(WideVecVT),
34424 ShuffleVec);
34425 // At this point all of the data is stored at the bottom of the
34426 // register. We now need to save it to memory.
34427
34428 // Find the largest store unit
34429 MVT StoreType = MVT::i8;
34430 for (MVT Tp : MVT::integer_valuetypes()) {
34431 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
34432 StoreType = Tp;
34433 }
34434
34435 // On 32-bit systems, we can't store 64-bit integers. Try bitcasting to f64.
34436 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
34437 (64 <= NumElems * ToSz))
34438 StoreType = MVT::f64;
34439
34440 // Bitcast the original vector into a vector of store-size units
34441 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
34442 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
34443 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
34444 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
34445 SmallVector<SDValue, 8> Chains;
34446 SDValue Ptr = St->getBasePtr();
34447
34448 // Perform one or more big stores into memory.
34449 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
34450 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
34451 StoreType, ShuffWide,
34452 DAG.getIntPtrConstant(i, dl));
34453 SDValue Ch =
34454 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
34455 St->getAlignment(), St->getMemOperand()->getFlags());
34456 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
34457 Chains.push_back(Ch);
34458 }
34459
34460 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
34461 }
34462
34463 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
34464 // the FP state in cases where an emms may be missing.
34465 // A preferable solution to the general problem is to figure out the right
34466 // places to insert EMMS. This qualifies as a quick hack.
34467
34468 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
34469 if (VT.getSizeInBits() != 64)
34470 return SDValue();
34471
34472 const Function *F = DAG.getMachineFunction().getFunction();
34473 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
34474 bool F64IsLegal =
34475 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
34476 if ((VT.isVector() ||
34477 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
34478 isa<LoadSDNode>(St->getValue()) &&
34479 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
34480 St->getChain().hasOneUse() && !St->isVolatile()) {
34481 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
34482 SmallVector<SDValue, 8> Ops;
34483
34484 if (!ISD::isNormalLoad(Ld))
34485 return SDValue();
34486
34487 // If this is not the MMX case, i.e. we are just turning i64 load/store
34488 // into f64 load/store, avoid the transformation if there are multiple
34489 // uses of the loaded value.
34490 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
34491 return SDValue();
34492
34493 SDLoc LdDL(Ld);
34494 SDLoc StDL(N);
34495 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
34496 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
34497 // pair instead.
34498 if (Subtarget.is64Bit() || F64IsLegal) {
34499 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
34500 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
34501 Ld->getMemOperand());
34502
34503 // Make sure new load is placed in same chain order.
34504 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
34505 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
34506 St->getMemOperand());
34507 }
34508
34509 // Otherwise, lower to two pairs of 32-bit loads / stores.
34510 SDValue LoAddr = Ld->getBasePtr();
34511 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
34512
34513 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
34514 Ld->getPointerInfo(), Ld->getAlignment(),
34515 Ld->getMemOperand()->getFlags());
34516 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
34517 Ld->getPointerInfo().getWithOffset(4),
34518 MinAlign(Ld->getAlignment(), 4),
34519 Ld->getMemOperand()->getFlags());
34520 // Make sure new loads are placed in same chain order.
34521 DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
34522 DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
34523
34524 LoAddr = St->getBasePtr();
34525 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
34526
34527 SDValue LoSt =
34528 DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
34529 St->getAlignment(), St->getMemOperand()->getFlags());
34530 SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
34531 St->getPointerInfo().getWithOffset(4),
34532 MinAlign(St->getAlignment(), 4),
34533 St->getMemOperand()->getFlags());
34534 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
34535 }
34536
34537 // This is similar to the above case, but here we handle a scalar 64-bit
34538 // integer store that is extracted from a vector on a 32-bit target.
34539 // If we have SSE2, then we can treat it like a floating-point double
34540 // to get past legalization. The execution dependencies fixup pass will
34541 // choose the optimal machine instruction for the store if this really is
34542 // an integer or v2f32 rather than an f64.
34543 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
34544 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
34545 SDValue OldExtract = St->getOperand(1);
34546 SDValue ExtOp0 = OldExtract.getOperand(0);
34547 unsigned VecSize = ExtOp0.getValueSizeInBits();
34548 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
34549 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
34550 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
34551 BitCast, OldExtract.getOperand(1));
34552 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
34553 St->getPointerInfo(), St->getAlignment(),
34554 St->getMemOperand()->getFlags());
34555 }
34556
34557 return SDValue();
34558}
34559
34560/// Return 'true' if this vector operation is "horizontal"
34561/// and return the operands for the horizontal operation in LHS and RHS. A
34562/// horizontal operation performs the binary operation on successive elements
34563/// of its first operand, then on successive elements of its second operand,
34564/// returning the resulting values in a vector. For example, if
34565/// A = < float a0, float a1, float a2, float a3 >
34566/// and
34567/// B = < float b0, float b1, float b2, float b3 >
34568/// then the result of doing a horizontal operation on A and B is
34569/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
34570/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
34571/// A horizontal-op B, for some already available A and B, and if so then LHS is
34572/// set to A, RHS to B, and the routine returns 'true'.
34573/// Note that the binary operation should have the property that if one of the
34574/// operands is UNDEF then the result is UNDEF.
34575static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
34576 // Look for the following pattern: if
34577 // A = < float a0, float a1, float a2, float a3 >
34578 // B = < float b0, float b1, float b2, float b3 >
34579 // and
34580 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
34581 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
34582 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
34583 // which is A horizontal-op B.
34584
34585 // At least one of the operands should be a vector shuffle.
34586 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
34587 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
34588 return false;
34589
34590 MVT VT = LHS.getSimpleValueType();
34591
34592 assert((VT.is128BitVector() || VT.is256BitVector()) &&
34593        "Unsupported vector type for horizontal add/sub");
34594
34595 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
34596 // operate independently on 128-bit lanes.
34597 unsigned NumElts = VT.getVectorNumElements();
34598 unsigned NumLanes = VT.getSizeInBits()/128;
34599 unsigned NumLaneElts = NumElts / NumLanes;
34600 assert((NumLaneElts % 2 == 0) &&
34601        "Vector type should have an even number of elements in each lane");
34602 unsigned HalfLaneElts = NumLaneElts/2;
34603
34604 // View LHS in the form
34605 // LHS = VECTOR_SHUFFLE A, B, LMask
34606 // If LHS is not a shuffle then pretend it is the shuffle
34607 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
34608 // NOTE: in what follows a default initialized SDValue represents an UNDEF of
34609 // type VT.
34610 SDValue A, B;
34611 SmallVector<int, 16> LMask(NumElts);
34612 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
34613 if (!LHS.getOperand(0).isUndef())
34614 A = LHS.getOperand(0);
34615 if (!LHS.getOperand(1).isUndef())
34616 B = LHS.getOperand(1);
34617 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
34618 std::copy(Mask.begin(), Mask.end(), LMask.begin());
34619 } else {
34620 if (!LHS.isUndef())
34621 A = LHS;
34622 for (unsigned i = 0; i != NumElts; ++i)
34623 LMask[i] = i;
34624 }
34625
34626 // Likewise, view RHS in the form
34627 // RHS = VECTOR_SHUFFLE C, D, RMask
34628 SDValue C, D;
34629 SmallVector<int, 16> RMask(NumElts);
34630 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
34631 if (!RHS.getOperand(0).isUndef())
34632 C = RHS.getOperand(0);
34633 if (!RHS.getOperand(1).isUndef())
34634 D = RHS.getOperand(1);
34635 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
34636 std::copy(Mask.begin(), Mask.end(), RMask.begin());
34637 } else {
34638 if (!RHS.isUndef())
34639 C = RHS;
34640 for (unsigned i = 0; i != NumElts; ++i)
34641 RMask[i] = i;
34642 }
34643
34644 // Check that the shuffles are both shuffling the same vectors.
34645 if (!(A == C && B == D) && !(A == D && B == C))
34646 return false;
34647
34648 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
34649 if (!A.getNode() && !B.getNode())
34650 return false;
34651
34652 // If A and B occur in reverse order in RHS, then "swap" them (which means
34653 // rewriting the mask).
34654 if (A != C)
34655 ShuffleVectorSDNode::commuteMask(RMask);
34656
34657 // At this point LHS and RHS are equivalent to
34658 // LHS = VECTOR_SHUFFLE A, B, LMask
34659 // RHS = VECTOR_SHUFFLE A, B, RMask
34660 // Check that the masks correspond to performing a horizontal operation.
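 // Illustrative worked example (added for clarity): for v8f32 we have
 // NumElts = 8, NumLanes = 2, NumLaneElts = 4 and HalfLaneElts = 2, so the
 // loop below accepts the (LIdx, RIdx) pairs (0,1),(2,3),(8,9),(10,11) in
 // lane 0 and (4,5),(6,7),(12,13),(14,15) in lane 1 (indices >= NumElts
 // select from B), matching the per-128-bit-lane behavior of VHADDPS/VHSUBPS.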
34661 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
34662 for (unsigned i = 0; i != NumLaneElts; ++i) {
34663 int LIdx = LMask[i+l], RIdx = RMask[i+l];
34664
34665 // Ignore any UNDEF components.
34666 if (LIdx < 0 || RIdx < 0 ||
34667 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
34668 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
34669 continue;
34670
34671 // Check that successive elements are being operated on. If not, this is
34672 // not a horizontal operation.
34673 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
34674 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
34675 if (!(LIdx == Index && RIdx == Index + 1) &&
34676 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
34677 return false;
34678 }
34679 }
34680
34681 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
34682 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
34683 return true;
34684}
34685
34686/// Do target-specific dag combines on floating-point adds/subs.
34687static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
34688 const X86Subtarget &Subtarget) {
34689 EVT VT = N->getValueType(0);
34690 SDValue LHS = N->getOperand(0);
34691 SDValue RHS = N->getOperand(1);
34692 bool IsFadd = N->getOpcode() == ISD::FADD;
34693 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
34694
34695 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
34696 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
34697 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
34698 isHorizontalBinOp(LHS, RHS, IsFadd)) {
34699 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
34700 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
34701 }
34702 return SDValue();
34703}
34704
34705/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
34706/// the codegen.
34707/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
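/// Illustrative example (sketch): with a v4i64 X and a constant splat C,
///   trunc.v4i32 (mul.v4i64 X, C) --> mul.v4i32 (trunc.v4i32 X), trunc(C)
/// trades a 64-bit vector multiply (slow before AVX512DQ) for a 32-bit one;
/// the truncation of C constant-folds away, so only one real truncation remains.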
34708static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
34709 const X86Subtarget &Subtarget,
34710 SDLoc &DL) {
34711 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
34712 SDValue Src = N->getOperand(0);
34713 unsigned Opcode = Src.getOpcode();
34714 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34715
34716 EVT VT = N->getValueType(0);
34717 EVT SrcVT = Src.getValueType();
34718
34719 auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
34720 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
34721
34722 // Repeated operand, so we are only trading one output truncation for
34723 // one input truncation.
34724 if (Op0 == Op1)
34725 return true;
34726
34727 // See if either operand has been extended from a smaller/equal size to
34728 // the truncation size, allowing a truncation to combine with the extend.
34729 unsigned Opcode0 = Op0.getOpcode();
34730 if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
34731 Opcode0 == ISD::ZERO_EXTEND) &&
34732 Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
34733 return true;
34734
34735 unsigned Opcode1 = Op1.getOpcode();
34736 if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
34737 Opcode1 == ISD::ZERO_EXTEND) &&
34738 Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
34739 return true;
34740
34741 // See if either operand is a single use constant which can be constant
34742 // folded.
34743 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
34744 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
34745 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
34746 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
34747 };
34748
34749 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
34750 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
34751 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
34752 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
34753 };
34754
34755 // Don't combine if the operation has other uses.
34756 if (!N->isOnlyUserOf(Src.getNode()))
34757 return SDValue();
34758
34759 // Only support vector truncation for now.
34760 // TODO: i64 scalar math would benefit as well.
34761 if (!VT.isVector())
34762 return SDValue();
34763
34764 // In most cases it's only worth pre-truncating if we're only facing the cost
34765 // of one truncation.
34766 // i.e. if one of the inputs will constant fold or the input is repeated.
34767 switch (Opcode) {
34768 case ISD::AND:
34769 case ISD::XOR:
34770 case ISD::OR: {
34771 SDValue Op0 = Src.getOperand(0);
34772 SDValue Op1 = Src.getOperand(1);
34773 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
34774 IsRepeatedOpOrFreeTruncation(Op0, Op1))
34775 return TruncateArithmetic(Op0, Op1);
34776 break;
34777 }
34778
34779 case ISD::MUL:
34780 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
34781 // better to truncate if we have the chance.
34782 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
34783 !TLI.isOperationLegal(Opcode, SrcVT))
34784 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
34785 LLVM_FALLTHROUGH;
34786 case ISD::ADD: {
34787 // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
34788 SDValue Op0 = Src.getOperand(0);
34789 SDValue Op1 = Src.getOperand(1);
34790 if (TLI.isOperationLegal(Opcode, VT) &&
34791 IsRepeatedOpOrFreeTruncation(Op0, Op1))
34792 return TruncateArithmetic(Op0, Op1);
34793 break;
34794 }
34795 }
34796
34797 return SDValue();
34798}
34799
34800/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
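/// Illustrative example (sketch): truncating v8i32 to v8i16 arrives here as
/// two v4i32 halves; each half is ANDed with 0xFFFF and a single
/// X86ISD::PACKUS (PACKUSDW on SSE4.1) merges the pair into one v8i16 with no
/// saturation, since the masked values already fit in 16 bits.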
34801static SDValue
34802combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
34803 SmallVector<SDValue, 8> &Regs) {
34804 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
34805 Regs[0].getValueType() == MVT::v2i64));
34806 EVT OutVT = N->getValueType(0);
34807 EVT OutSVT = OutVT.getVectorElementType();
34808 EVT InVT = Regs[0].getValueType();
34809 EVT InSVT = InVT.getVectorElementType();
34810 SDLoc DL(N);
34811
34812 // First, use mask to unset all bits that won't appear in the result.
34813 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
34814 "OutSVT can only be either i8 or i16.");
34815 APInt Mask =
34816 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
34817 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
34818 for (auto &Reg : Regs)
34819 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
34820
34821 MVT UnpackedVT, PackedVT;
34822 if (OutSVT == MVT::i8) {
34823 UnpackedVT = MVT::v8i16;
34824 PackedVT = MVT::v16i8;
34825 } else {
34826 UnpackedVT = MVT::v4i32;
34827 PackedVT = MVT::v8i16;
34828 }
34829
34830 // In each iteration, halve the element size of the type.
34831 auto RegNum = Regs.size();
34832 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
34833 j < e; j *= 2, RegNum /= 2) {
34834 for (unsigned i = 0; i < RegNum; i++)
34835 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
34836 for (unsigned i = 0; i < RegNum / 2; i++)
34837 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
34838 Regs[i * 2 + 1]);
34839 }
34840
34841 // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS, and
34842 // then extract a subvector as the result since v8i8 is not a legal type.
34843 if (OutVT == MVT::v8i8) {
34844 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
34845 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
34846 DAG.getIntPtrConstant(0, DL));
34847 return Regs[0];
34848 } else if (RegNum > 1) {
34849 Regs.resize(RegNum);
34850 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
34851 } else
34852 return Regs[0];
34853}
34854
34855/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
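/// Illustrative example (sketch): a v4i32 element 0x0001ABCD becomes
/// 0xABCD0000 after the VSHLI by 16 and 0xFFFFABCD after the VSRAI by 16,
/// i.e. the low 16 bits sign-extended; PACKSSDW's signed saturation then
/// reproduces exactly 0xABCD in the packed v8i16 result.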
34856static SDValue
34857combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
34858 SelectionDAG &DAG,
34859 SmallVector<SDValue, 8> &Regs) {
34860 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
34861 EVT OutVT = N->getValueType(0);
34862 SDLoc DL(N);
34863
34864 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
34865 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
34866 for (auto &Reg : Regs) {
34867 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
34868 Subtarget, DAG);
34869 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
34870 Subtarget, DAG);
34871 }
34872
34873 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
34874 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
34875 Regs[i * 2 + 1]);
34876
34877 if (Regs.size() > 2) {
34878 Regs.resize(Regs.size() / 2);
34879 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
34880 } else
34881 return Regs[0];
34882}
34883
34884/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
34885/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
34886/// legalization the truncation will be translated into a BUILD_VECTOR with each
34887/// element that is extracted from a vector and then truncated, and it is
34888/// difficult to do this optimization based on them.
34889static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
34890 const X86Subtarget &Subtarget) {
34891 EVT OutVT = N->getValueType(0);
34892 if (!OutVT.isVector())
34893 return SDValue();
34894
34895 SDValue In = N->getOperand(0);
34896 if (!In.getValueType().isSimple())
34897 return SDValue();
34898
34899 EVT InVT = In.getValueType();
34900 unsigned NumElems = OutVT.getVectorNumElements();
34901
34902 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
34903 // SSE2, and we need to take care of it specially.
34904 // AVX512 provides vpmovdb.
34905 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
34906 return SDValue();
34907
34908 EVT OutSVT = OutVT.getVectorElementType();
34909 EVT InSVT = InVT.getVectorElementType();
34910 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
34911 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
34912 NumElems >= 8))
34913 return SDValue();
34914
34915 // SSSE3's pshufb results in fewer instructions in the cases below.
34916 if (Subtarget.hasSSSE3() && NumElems == 8 &&
34917 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
34918 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
34919 return SDValue();
34920
34921 SDLoc DL(N);
34922
34923 // Split a long vector into vectors of legal type.
34924 unsigned RegNum = InVT.getSizeInBits() / 128;
34925 SmallVector<SDValue, 8> SubVec(RegNum);
34926 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
34927 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
34928
34929 for (unsigned i = 0; i < RegNum; i++)
34930 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
34931 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
34932
34933 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
34934 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
34935 // truncate 2 x v4i32 to v8i16.
34936 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
34937 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
34938 else if (InSVT == MVT::i32)
34939 return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
34940 else
34941 return SDValue();
34942}
34943
34944/// This function transforms vector truncation of 'extended sign-bits' or
34945/// 'extended zero-bits' values.
34946/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
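/// Illustrative example (sketch): truncating a v8i32 comparison result to
/// v8i16 only needs NumSignBits > 32 - 16 = 16; since setcc lanes are all-zeros
/// or all-ones (32 sign bits), a single PACKSS-based pack performs the
/// truncation.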
34947static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
34948 SelectionDAG &DAG,
34949 const X86Subtarget &Subtarget) {
34950 // Requires SSE2 but AVX512 has fast truncate.
34951 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
34952 return SDValue();
34953
34954 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
34955 return SDValue();
34956
34957 SDValue In = N->getOperand(0);
34958 if (!In.getValueType().isSimple())
34959 return SDValue();
34960
34961 MVT VT = N->getValueType(0).getSimpleVT();
34962 MVT SVT = VT.getScalarType();
34963
34964 MVT InVT = In.getValueType().getSimpleVT();
34965 MVT InSVT = InVT.getScalarType();
34966
34967 // Check we have a truncation suited for PACKSS.
34968 if (!VT.is128BitVector() && !VT.is256BitVector())
34969 return SDValue();
34970 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
34971 return SDValue();
34972 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
34973 return SDValue();
34974
34975 // Use PACKSS if the input has sign-bits that extend all the way to the
34976 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
34977 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
34978 unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
34979 if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
34980 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
34981
34982 // Use PACKUS if the input has zero-bits that extend all the way to the
34983 // packed/truncated value. e.g. masks, zext_in_reg, etc.
34984 KnownBits Known;
34985 DAG.computeKnownBits(In, Known);
34986 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
34987 NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
34988 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
34989 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
34990
34991 return SDValue();
34992}
34993
34994static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
34995 const X86Subtarget &Subtarget) {
34996 EVT VT = N->getValueType(0);
34997 SDValue Src = N->getOperand(0);
34998 SDLoc DL(N);
34999
35000 // Attempt to pre-truncate inputs to arithmetic ops instead.
35001 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
35002 return V;
35003
35004 // Try to detect AVG pattern first.
35005 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
35006 return Avg;
35007
35008 // Try to combine truncation with unsigned saturation.
35009 if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
35010 return Val;
35011
35012 // The bitcast source is a direct mmx result.
35013 // Detect truncation to i32 of a bitcast from x86mmx.
35014 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
35015 SDValue BCSrc = Src.getOperand(0);
35016 if (BCSrc.getValueType() == MVT::x86mmx)
35017 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
35018 }
35019
35020 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
35021 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
35022 return V;
35023
35024 return combineVectorTruncation(N, DAG, Subtarget);
35025}
35026
35027/// Returns the negated value if the node \p N flips sign of FP value.
35028///
35029/// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
35030/// AVX512F does not have FXOR, so FNEG is lowered as
35031/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
35032/// In this case we go through all bitcasts.
35033static SDValue isFNEG(SDNode *N) {
35034 if (N->getOpcode() == ISD::FNEG)
35035 return N->getOperand(0);
35036
35037 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
35038 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
35039 return SDValue();
35040
35041 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
35042 if (!Op1.getValueType().isFloatingPoint())
35043 return SDValue();
35044
35045 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
35046
35047 unsigned EltBits = Op1.getScalarValueSizeInBits();
35048 auto isSignMask = [&](const ConstantFP *C) {
35049 return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
35050 };
35051
35052 // There is more than one way to represent the same constant on
35053 // the different X86 targets. The type of the node may also depend on size.
35054 // - load scalar value and broadcast
35055 // - BUILD_VECTOR node
35056 // - load from a constant pool.
35057 // We check all variants here.
35058 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
35059 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
35060 if (isSignMask(cast<ConstantFP>(C)))
35061 return Op0;
35062
35063 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
35064 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
35065 if (isSignMask(CN->getConstantFPValue()))
35066 return Op0;
35067
35068 } else if (auto *C = getTargetConstantFromNode(Op1)) {
35069 if (C->getType()->isVectorTy()) {
35070 if (auto *SplatV = C->getSplatValue())
35071 if (isSignMask(cast<ConstantFP>(SplatV)))
35072 return Op0;
35073 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
35074 if (isSignMask(FPConst))
35075 return Op0;
35076 }
35077 return SDValue();
35078}
35079
35080/// Do target-specific dag combines on floating point negations.
35081static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
35082 const X86Subtarget &Subtarget) {
35083 EVT OrigVT = N->getValueType(0);
35084 SDValue Arg = isFNEG(N);
35085 assert(Arg.getNode() && "N is expected to be an FNEG node");
35086
35087 EVT VT = Arg.getValueType();
35088 EVT SVT = VT.getScalarType();
35089 SDLoc DL(N);
35090
35091 // Let legalize expand this if it isn't a legal type yet.
35092 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
35093 return SDValue();
35094
35095 // If we're negating a FMUL node on a target with FMA, then we can avoid the
35096 // use of a constant by performing (-0 - A*B) instead.
35097 // FIXME: Check rounding control flags as well once it becomes available.
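 // Concretely, X86ISD::FNMSUB computes -(A*B) - C, so with C == +0.0 the new
 // node evaluates -(A*B) without materializing a sign-mask constant in memory.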
35098 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
35099 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
35100 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
35101 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
35102 Arg.getOperand(1), Zero);
35103 return DAG.getBitcast(OrigVT, NewNode);
35104 }
35105
35106 // If we're negating an FMA node, then we can adjust the
35107 // instruction to include the extra negation.
35108 unsigned NewOpcode = 0;
35109 if (Arg.hasOneUse()) {
35110 switch (Arg.getOpcode()) {
35111 case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break;
35112 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
35113 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
35114 case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break;
35115 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
35116 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
35117 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
35118 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
35119 // We can't handle scalar intrinsic node here because it would only
35120 // invert one element and not the whole vector. But we could try to handle
35121 // a negation of the lower element only.
35122 }
35123 }
35124 if (NewOpcode)
35125 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
35126 Arg.getNode()->ops()));
35127
35128 return SDValue();
35129}
35130
35131static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
35132 const X86Subtarget &Subtarget) {
35133 MVT VT = N->getSimpleValueType(0);
35134 // If we have integer vector types available, use the integer opcodes.
35135 if (VT.isVector() && Subtarget.hasSSE2()) {
35136 SDLoc dl(N);
35137
35138 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
35139
35140 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
35141 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
35142 unsigned IntOpcode;
35143 switch (N->getOpcode()) {
35144 default: llvm_unreachable("Unexpected FP logic op");
35145 case X86ISD::FOR: IntOpcode = ISD::OR; break;
35146 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
35147 case X86ISD::FAND: IntOpcode = ISD::AND; break;
35148 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
35149 }
35150 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
35151 return DAG.getBitcast(VT, IntOp);
35152 }
35153 return SDValue();
35154}
35155
35156
35157/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
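/// Illustrative example: (xor (X86ISD::SETCC COND_E, EFLAGS), 1) becomes
/// (X86ISD::SETCC COND_NE, EFLAGS), removing the XOR entirely.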
35158static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
35159 if (N->getOpcode() != ISD::XOR)
35160 return SDValue();
35161
35162 SDValue LHS = N->getOperand(0);
35163 auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
35164 if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
35165 return SDValue();
35166
35167 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
35168 X86::CondCode(LHS->getConstantOperandVal(0)));
35169 SDLoc DL(N);
35170 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
35171}
35172
35173static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
35174 TargetLowering::DAGCombinerInfo &DCI,
35175 const X86Subtarget &Subtarget) {
35176 // If this is SSE1 only convert to FXOR to avoid scalarization.
35177 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
35178 N->getValueType(0) == MVT::v4i32) {
35179 return DAG.getBitcast(
35180 MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
35181 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
35182 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
35183 }
35184
35185 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
35186 return Cmp;
35187
35188 if (DCI.isBeforeLegalizeOps())
35189 return SDValue();
35190
35191 if (SDValue SetCC = foldXor1SetCC(N, DAG))
35192 return SetCC;
35193
35194 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
35195 return RV;
35196
35197 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
35198 return FPLogic;
35199
35200 if (isFNEG(N))
35201 return combineFneg(N, DAG, Subtarget);
35202 return SDValue();
35203}
35204
35205
35206static bool isNullFPScalarOrVectorConst(SDValue V) {
35207 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
35208}
35209
35210/// If a value is a scalar FP zero or a vector FP zero (potentially including
35211/// undefined elements), return a zero constant that may be used to fold away
35212/// that value. In the case of a vector, the returned constant will not contain
35213/// undefined elements even if the input parameter does. This makes it suitable
35214/// to be used as a replacement operand with operations (e.g., bitwise-and) where
35215/// an undef should not propagate.
35216static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
35217 const X86Subtarget &Subtarget) {
35218 if (!isNullFPScalarOrVectorConst(V))
35219 return SDValue();
35220
35221 if (V.getValueType().isVector())
35222 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
35223
35224 return V;
35225}
35226
35227static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
35228 const X86Subtarget &Subtarget) {
35229 SDValue N0 = N->getOperand(0);
35230 SDValue N1 = N->getOperand(1);
35231 EVT VT = N->getValueType(0);
35232 SDLoc DL(N);
35233
35234 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
35235 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
35236 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
35237 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
35238 return SDValue();
35239
35240 auto isAllOnesConstantFP = [](SDValue V) {
35241 if (V.getSimpleValueType().isVector())
35242 return ISD::isBuildVectorAllOnes(V.getNode());
35243 auto *C = dyn_cast<ConstantFPSDNode>(V);
35244 return C && C->getConstantFPValue()->isAllOnesValue();
35245 };
35246
35247 // fand (fxor X, -1), Y --> fandn X, Y
35248 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
35249 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
35250
35251 // fand X, (fxor Y, -1) --> fandn Y, X
35252 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
35253 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
35254
35255 return SDValue();
35256}
35257
35258/// Do target-specific dag combines on X86ISD::FAND nodes.
35259static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
35260 const X86Subtarget &Subtarget) {
35261 // FAND(0.0, x) -> 0.0
35262 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
35263 return V;
35264
35265 // FAND(x, 0.0) -> 0.0
35266 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
35267 return V;
35268
35269 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
35270 return V;
35271
35272 return lowerX86FPLogicOp(N, DAG, Subtarget);
35273}
35274
35275/// Do target-specific dag combines on X86ISD::FANDN nodes.
35276static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
35277 const X86Subtarget &Subtarget) {
35278 // FANDN(0.0, x) -> x
35279 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
35280 return N->getOperand(1);
35281
35282 // FANDN(x, 0.0) -> 0.0
35283 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
35284 return V;
35285
35286 return lowerX86FPLogicOp(N, DAG, Subtarget);
35287}
35288
35289/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
35290static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
35291 const X86Subtarget &Subtarget) {
35292 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
35293
35294 // F[X]OR(0.0, x) -> x
35295 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
35296 return N->getOperand(1);
35297
35298 // F[X]OR(x, 0.0) -> x
35299 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
35300 return N->getOperand(0);
35301
35302 if (isFNEG(N))
35303 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
35304 return NewVal;
35305
35306 return lowerX86FPLogicOp(N, DAG, Subtarget);
35307}
35308
35309/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
35310static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
35311 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
35312
35313 // Only perform optimizations if UnsafeMath is used.
35314 if (!DAG.getTarget().Options.UnsafeFPMath)
35315 return SDValue();
35316
35317 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
35318 // into FMINC and FMAXC, which are Commutative operations.
35319 unsigned NewOp = 0;
35320 switch (N->getOpcode()) {
35321 default: llvm_unreachable("unknown opcode");
35322 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
35323 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
35324 }
35325
35326 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
35327 N->getOperand(0), N->getOperand(1));
35328}
35329
35330static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
35331 const X86Subtarget &Subtarget) {
35332 if (Subtarget.useSoftFloat())
35333 return SDValue();
35334
35335 // TODO: Check for global or instruction-level "nnan". In that case, we
35336 // should be able to lower to FMAX/FMIN alone.
35337 // TODO: If an operand is already known to be a NaN or not a NaN, this
35338 // should be an optional swap and FMAX/FMIN.
35339
35340 EVT VT = N->getValueType(0);
35341 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
35342 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
35343 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
35344 return SDValue();
35345
35346 // This takes at least 3 instructions, so favor a library call when operating
35347 // on a scalar and minimizing code size.
35348 if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
35349 return SDValue();
35350
35351 SDValue Op0 = N->getOperand(0);
35352 SDValue Op1 = N->getOperand(1);
35353 SDLoc DL(N);
35354 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
35355 DAG.getDataLayout(), *DAG.getContext(), VT);
35356
35357 // There are 4 possibilities involving NaN inputs, and these are the required
35358 // outputs:
35359 // Op1
35360 // Num NaN
35361 // ----------------
35362 // Num | Max | Op0 |
35363 // Op0 ----------------
35364 // NaN | Op1 | NaN |
35365 // ----------------
35366 //
35367 // The SSE FP max/min instructions were not designed for this case, but rather
35368 // to implement:
35369 // Min = Op1 < Op0 ? Op1 : Op0
35370 // Max = Op1 > Op0 ? Op1 : Op0
35371 //
35372 // So they always return Op0 if either input is a NaN. However, we can still
35373 // use those instructions for fmaxnum by selecting away a NaN input.
35374
35375 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
35376 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
35377 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
35378 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
35379
35380 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
35381 // are NaN, the NaN value of Op1 is the result.
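 // Illustrative example: for fmaxnum(NaN, 2.0), MinOrMax = FMAX(2.0, NaN)
 // passes through its second source (NaN), but IsOp0Nan is true, so the
 // select returns Op1 = 2.0, as fmaxnum requires.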
35382 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
35383}
35384
35385/// Do target-specific dag combines on X86ISD::ANDNP nodes.
35386static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
35387 TargetLowering::DAGCombinerInfo &DCI,
35388 const X86Subtarget &Subtarget) {
35389 // ANDNP(0, x) -> x
35390 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
35391 return N->getOperand(1);
35392
35393 // ANDNP(x, 0) -> 0
35394 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
35395 return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
35396
35397 EVT VT = N->getValueType(0);
35398
35399 // Attempt to recursively combine a bitmask ANDNP with shuffles.
35400 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
35401 SDValue Op(N, 0);
35402 if (SDValue Res = combineX86ShufflesRecursively(
35403 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
35404 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
35405 DCI.CombineTo(N, Res);
35406 return SDValue();
35407 }
35408 }
35409
35410 return SDValue();
35411}
35412
35413static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
35414 TargetLowering::DAGCombinerInfo &DCI) {
35415 SDValue N0 = N->getOperand(0);
35416 SDValue N1 = N->getOperand(1);
35417
35418 // BT ignores high bits in the bit index operand.
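 // Illustrative example: for a 32-bit BT only the low Log2(32) = 5 bits of
 // the index matter, so an (and N1, 31) feeding the bit index can be dropped.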
35419 unsigned BitWidth = N1.getValueSizeInBits();
35420 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
35421 if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
35422 return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);
35423
35424 return SDValue();
35425}
35426
35427static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
35428 const X86Subtarget &Subtarget) {
35429 EVT VT = N->getValueType(0);
35430 if (!VT.isVector())
35431 return SDValue();
35432
35433 SDValue N0 = N->getOperand(0);
35434 SDValue N1 = N->getOperand(1);
35435 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
35436 SDLoc dl(N);
35437
35438 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
35439 // SSE and AVX2 since there is no sign-extended shift right
35440 // operation on a vector with 64-bit elements.
35441 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
35442 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
35443 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
35444 N0.getOpcode() == ISD::SIGN_EXTEND)) {
35445 SDValue N00 = N0.getOperand(0);
35446
35447 // EXTLOAD has a better solution on AVX2,
35448 // it may be replaced with X86ISD::VSEXT node.
35449 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
35450 if (!ISD::isNormalLoad(N00.getNode()))
35451 return SDValue();
35452
35453 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
35454 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
35455 N00, N1);
35456 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
35457 }
35458 }
35459 return SDValue();
35460}
35461
35462/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
35463/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
35464/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
35465/// opportunities to combine math ops, use an LEA, or use a complex addressing
35466/// mode. This can eliminate extend, add, and shift instructions.
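/// Illustrative example: (i64 sext (i32 add nsw %x, 5)) becomes
/// (i64 add nsw (i64 sext %x), 5); the 5 can then fold into the displacement
/// of an LEA or combine with a following shl/add user.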
35467static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
35468 const X86Subtarget &Subtarget) {
35469 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
35470 Ext->getOpcode() != ISD::ZERO_EXTEND)
35471 return SDValue();
35472
35473 // TODO: This should be valid for other integer types.
35474 EVT VT = Ext->getValueType(0);
35475 if (VT != MVT::i64)
35476 return SDValue();
35477
35478 SDValue Add = Ext->getOperand(0);
35479 if (Add.getOpcode() != ISD::ADD)
35480 return SDValue();
35481
35482 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
35483 bool NSW = Add->getFlags().hasNoSignedWrap();
35484 bool NUW = Add->getFlags().hasNoUnsignedWrap();
35485
35486 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
35487 // into the 'zext'
35488 if ((Sext && !NSW) || (!Sext && !NUW))
35489 return SDValue();
35490
35491 // Having a constant operand to the 'add' ensures that we are not increasing
35492 // the instruction count because the constant is extended for free below.
35493 // A constant operand can also become the displacement field of an LEA.
35494 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
35495 if (!AddOp1)
35496 return SDValue();
35497
35498 // Don't make the 'add' bigger if there's no hope of combining it with some
35499 // other 'add' or 'shl' instruction.
35500 // TODO: It may be profitable to generate simpler LEA instructions in place
35501 // of single 'add' instructions, but the cost model for selecting an LEA
35502 // currently has a high threshold.
35503 bool HasLEAPotential = false;
35504 for (auto *User : Ext->uses()) {
35505 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
35506 HasLEAPotential = true;
35507 break;
35508 }
35509 }
35510 if (!HasLEAPotential)
35511 return SDValue();
35512
35513 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
35514 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
35515 SDValue AddOp0 = Add.getOperand(0);
35516 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
35517 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
35518
35519 // The wider add is guaranteed to not wrap because both operands are
35520 // sign-extended.
35521 SDNodeFlags Flags;
35522 Flags.setNoSignedWrap(NSW);
35523 Flags.setNoUnsignedWrap(NUW);
35524 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
35525}
35526
35527/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
35528/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
35529/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
35530/// extends from AH (which we otherwise need to do contortions to access).
35531static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
35532 SDValue N0 = N->getOperand(0);
35533 auto OpcodeN = N->getOpcode();
35534 auto OpcodeN0 = N0.getOpcode();
35535 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
35536 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
35537 return SDValue();
35538
35539 EVT VT = N->getValueType(0);
35540 EVT InVT = N0.getValueType();
35541 if (N0.getResNo() != 1 || InVT != MVT::i8 ||
35542 !(VT == MVT::i32 || VT == MVT::i64))
35543 return SDValue();
35544
35545 SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
35546 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
35547 : X86ISD::UDIVREM8_ZEXT_HREG;
35548 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
35549 N0.getOperand(1));
35550 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
35551 // If this was a 64-bit extend, complete it.
35552 if (VT == MVT::i64)
35553 return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));
35554 return R.getValue(1);
35555}
35556
35557// If an {ANY,SIGN,ZERO}_EXTEND is applied to a CMOV with constant
35558// operands and the result of the CMOV is not used anywhere else, promote the
35559// CMOV itself instead of promoting its result. This could be beneficial, because:
35560// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
35561// (or more) pseudo-CMOVs only when they go one-after-another and
35562// getting rid of result extension code after CMOV will help that.
35563// 2) Promotion of constant CMOV arguments is free, hence the
35564// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
35565// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
35566// promotion is also good in terms of code-size.
35567// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
35568// promotion).
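// Illustrative example: (i32 zext (i16 X86ISD::CMOV C1, C2, cc, EFLAGS))
// becomes (i32 X86ISD::CMOV zext(C1), zext(C2), cc, EFLAGS); widening the
// constants is free and the separate zero-extension disappears.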
35569static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
35570 SDValue CMovN = Extend->getOperand(0);
35571 if (CMovN.getOpcode() != X86ISD::CMOV)
35572 return SDValue();
35573
35574 EVT TargetVT = Extend->getValueType(0);
35575 unsigned ExtendOpcode = Extend->getOpcode();
35576 SDLoc DL(Extend);
35577
35578 EVT VT = CMovN.getValueType();
35579 SDValue CMovOp0 = CMovN.getOperand(0);
35580 SDValue CMovOp1 = CMovN.getOperand(1);
35581
35582 bool DoPromoteCMOV =
35583 (VT == MVT::i16 && (TargetVT == MVT::i32 || TargetVT == MVT::i64)) &&
35584 CMovN.hasOneUse() &&
35585 (isa<ConstantSDNode>(CMovOp0.getNode()) &&
35586 isa<ConstantSDNode>(CMovOp1.getNode()));
35587
35588 if (!DoPromoteCMOV)
35589 return SDValue();
35590
35591 CMovOp0 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp0);
35592 CMovOp1 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp1);
35593
35594 return DAG.getNode(X86ISD::CMOV, DL, TargetVT, CMovOp0, CMovOp1,
35595 CMovN.getOperand(2), CMovN.getOperand(3));
35596}
35597
35598// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
35599// This is more or less the reverse of combineBitcastvxi1.
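// Illustrative example (sketch): (v8i16 sext (v8i1 bitcast (i8 X))) any-extends
// X to i16, broadcasts it to all 8 lanes, ANDs lane i with (1 << i), compares
// for equality against that same bit mask and sign-extends the setcc, yielding
// 0 or -1 per lane.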
35600static SDValue
35601combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
35602 TargetLowering::DAGCombinerInfo &DCI,
35603 const X86Subtarget &Subtarget) {
35604 unsigned Opcode = N->getOpcode();
35605 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
35606 Opcode != ISD::ANY_EXTEND)
35607 return SDValue();
35608 if (!DCI.isBeforeLegalizeOps())
35609 return SDValue();
35610 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
35611 return SDValue();
35612
35613 SDValue N0 = N->getOperand(0);
35614 EVT VT = N->getValueType(0);
35615 EVT SVT = VT.getScalarType();
35616 EVT InSVT = N0.getValueType().getScalarType();
35617 unsigned EltSizeInBits = SVT.getSizeInBits();
35618
35619 // We must be extending a bool vector (bit-cast from a scalar
35620 // integer) to legal integer types.
35621 if (!VT.isVector())
35622 return SDValue();
35623 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
35624 return SDValue();
35625 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
35626 return SDValue();
35627
35628 SDValue N00 = N0.getOperand(0);
35629 EVT SclVT = N0.getOperand(0).getValueType();
35630 if (!SclVT.isScalarInteger())
35631 return SDValue();
35632
35633 SDLoc DL(N);
35634 SDValue Vec;
35635 SmallVector<int, 32> ShuffleMask;
35636 unsigned NumElts = VT.getVectorNumElements();
35637 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
35638
35639 // Broadcast the scalar integer to the vector elements.
35640 if (NumElts > EltSizeInBits) {
35641 // If the scalar integer is greater than the vector element size, then we
35642 // must split it down into sub-sections for broadcasting. For example:
35643 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
35644 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
35645 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
35646 unsigned Scale = NumElts / EltSizeInBits;
35647 EVT BroadcastVT =
35648 EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
35649 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
35650 Vec = DAG.getBitcast(VT, Vec);
35651
35652 for (unsigned i = 0; i != Scale; ++i)
35653 ShuffleMask.append(EltSizeInBits, i);
35654 } else {
35655 // For a smaller scalar integer, we can simply any-extend it to the vector
35656 // element size (we don't care about the upper bits) and broadcast it to all
35657 // elements.
35658 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
35659 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
35660 ShuffleMask.append(NumElts, 0);
35661 }
35662 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
35663
35664 // Now, mask the relevant bit in each element.
35665 SmallVector<SDValue, 32> Bits;
35666 for (unsigned i = 0; i != NumElts; ++i) {
35667 int BitIdx = (i % EltSizeInBits);
35668 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
35669 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
35670 }
35671 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
35672 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
35673
35674 // Compare against the bitmask and extend the result.
35675 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
35676 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
35677 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
35678
35679 // For SEXT, this is now done, otherwise shift the result down for
35680 // zero-extension.
35681 if (Opcode == ISD::SIGN_EXTEND)
35682 return Vec;
35683 return DAG.getNode(ISD::SRL, DL, VT, Vec,
35684 DAG.getConstant(EltSizeInBits - 1, DL, VT));
35685}
35686
35687/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
35688/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
35689/// with UNDEFs) of the input to vectors of the same size as the target type
35690/// which then extends the lowest elements.
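/// Illustrative example (sketch): on SSE4.1 without AVX2, (v8i32 zext (v8i16 X))
/// is split into two v4i16 halves, each widened with UNDEFs to v8i16 and lowered
/// as ZERO_EXTEND_VECTOR_INREG to v4i32 (PMOVZXWD), then concatenated back into
/// the v8i32 result.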
35691static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
35692 TargetLowering::DAGCombinerInfo &DCI,
35693 const X86Subtarget &Subtarget) {
35694 unsigned Opcode = N->getOpcode();
35695 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
35696 return SDValue();
35697 if (!DCI.isBeforeLegalizeOps())
35698 return SDValue();
35699 if (!Subtarget.hasSSE2())
35700 return SDValue();
35701
35702 SDValue N0 = N->getOperand(0);
35703 EVT VT = N->getValueType(0);
35704 EVT SVT = VT.getScalarType();
35705 EVT InVT = N0.getValueType();
35706 EVT InSVT = InVT.getScalarType();
35707
35708 // Input type must be a vector and we must be extending legal integer types.
35709 if (!VT.isVector())
35710 return SDValue();
35711 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
35712 return SDValue();
35713 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
35714 return SDValue();
35715
35716 // On AVX2+ targets, if the input/output types are both legal then we will be
35717 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
35718 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
35719 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
35720 return SDValue();
35721
35722 SDLoc DL(N);
35723
35724 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
35725 EVT InVT = N.getValueType();
35726 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
35727 Size / InVT.getScalarSizeInBits());
35728 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
35729 DAG.getUNDEF(InVT));
35730 Opnds[0] = N;
35731 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
35732 };
35733
35734 // If target-size is less than 128-bits, extend to a type that would extend
35735 // to 128 bits, extend that and extract the original target vector.
35736 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
35737 unsigned Scale = 128 / VT.getSizeInBits();
35738 EVT ExVT =
35739 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
35740 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
35741 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
35742 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
35743 DAG.getIntPtrConstant(0, DL));
35744 }
35745
35746 // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
35747 // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
35748 // Also use this if we don't have SSE41, to allow the legalizer to do its job.
35749 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
35750 (VT.is256BitVector() && Subtarget.hasInt256()) ||
35751 (VT.is512BitVector() && Subtarget.hasAVX512())) {
35752 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
35753 return Opcode == ISD::SIGN_EXTEND
35754 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
35755 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
35756 }
35757
35758 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
35759 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
35760 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
35761 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
35762 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
35763
35764 SmallVector<SDValue, 8> Opnds;
35765 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
35766 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
35767 DAG.getIntPtrConstant(Offset, DL));
35768 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
35769 SrcVec = Opcode == ISD::SIGN_EXTEND
35770 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
35771 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
35772 Opnds.push_back(SrcVec);
35773 }
35774 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
35775 };
35776
35777 // On pre-AVX2 targets, split into 128-bit nodes of
35778 // ISD::*_EXTEND_VECTOR_INREG.
35779 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
35780 return SplitAndExtendInReg(128);
35781
35782 // On pre-AVX512 targets, split into 256-bit nodes of
35783 // ISD::*_EXTEND_VECTOR_INREG.
35784 if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
35785 return SplitAndExtendInReg(256);
35786
35787 return SDValue();
35788}
35789
35790static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
35791 TargetLowering::DAGCombinerInfo &DCI,
35792 const X86Subtarget &Subtarget) {
35793 SDValue N0 = N->getOperand(0);
35794 EVT VT = N->getValueType(0);
35795 EVT InVT = N0.getValueType();
35796 SDLoc DL(N);
35797
35798 if (SDValue DivRem8 = getDivRem8(N, DAG))
35799 return DivRem8;
35800
35801 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
35802 return NewCMov;
35803
35804 if (!DCI.isBeforeLegalizeOps()) {
35805 if (InVT == MVT::i1) {
35806 SDValue Zero = DAG.getConstant(0, DL, VT);
35807 SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
35808 return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
35809 }
35810 return SDValue();
35811 }
35812
35813 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
35814 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
35815 // Inverting and sign-extending a boolean is the same as zero-extending and
35816 // subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
35817 // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
35818 // sext (xor Bool, -1) --> sub (zext Bool), 1
35819 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
35820 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
35821 }
35822
35823 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
35824 return V;
35825
35826 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
35827 return V;
35828
35829 if (Subtarget.hasAVX() && VT.is256BitVector())
35830 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
35831 return R;
35832
35833 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
35834 return NewAdd;
35835
35836 return SDValue();
35837}
35838
35839static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
35840 const X86Subtarget &Subtarget) {
35841 // TODO: Handle FMSUB/FNMADD/FNMSUB as the starting opcode.
35842 SDLoc dl(N);
35843 EVT VT = N->getValueType(0);
35844
35845 // Let legalize expand this if it isn't a legal type yet.
35846 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
35847 return SDValue();
35848
35849 EVT ScalarVT = VT.getScalarType();
35850 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
35851 return SDValue();
35852
35853 SDValue A = N->getOperand(0);
35854 SDValue B = N->getOperand(1);
35855 SDValue C = N->getOperand(2);
35856
35857 auto invertIfNegative = [](SDValue &V) {
35858 if (SDValue NegVal = isFNEG(V.getNode())) {
35859 V = NegVal;
35860 return true;
35861 }
35862 return false;
35863 };
35864
35865 // Do not convert the passthru input of scalar intrinsics.
35866 // FIXME: We could allow negations of the lower element only.
35867 bool NegA = N->getOpcode() != X86ISD::FMADDS1 &&
35868 N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
35869 bool NegB = invertIfNegative(B);
35870 bool NegC = N->getOpcode() != X86ISD::FMADDS3 &&
35871 N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
35872
35873 // Negative multiplication when NegA xor NegB
35874 bool NegMul = (NegA != NegB);
35875 bool HasNeg = NegA || NegB || NegC;
35876
35877 unsigned NewOpcode;
35878 if (!NegMul)
35879 NewOpcode = (!NegC) ? unsigned(ISD::FMA) : unsigned(X86ISD::FMSUB);
35880 else
35881 NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
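 // Illustrative example: fma(-a, b, c) has NegMul set and NegC clear, so it
 // becomes X86ISD::FNMADD(a, b, c) = -(a*b) + c; fma(-a, b, -c) becomes
 // X86ISD::FNMSUB(a, b, c) = -(a*b) - c.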
35882
35883 // For FMA, we risk reconstructing the node we started with.
35884 // In order to avoid this, we check for negation or opcode change. If
35885 // one of the two happened, then it is a new node and we return it.
35886 if (N->getOpcode() == ISD::FMA) {
35887 if (HasNeg || NewOpcode != N->getOpcode())
35888 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
35889 return SDValue();
35890 }
35891
35892 if (N->getOpcode() == X86ISD::FMADD_RND) {
35893 switch (NewOpcode) {
35894 case ISD::FMA: NewOpcode = X86ISD::FMADD_RND; break;
35895 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
35896 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
35897 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
35898 }
35899 } else if (N->getOpcode() == X86ISD::FMADDS1) {
35900 switch (NewOpcode) {
35901 case ISD::FMA: NewOpcode = X86ISD::FMADDS1; break;
35902 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1; break;
35903 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break;
35904 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break;
35905 }
35906 } else if (N->getOpcode() == X86ISD::FMADDS3) {
35907 switch (NewOpcode) {
35908 case ISD::FMA: NewOpcode = X86ISD::FMADDS3; break;
35909 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3; break;
35910 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break;
35911 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break;
35912 }
35913 } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
35914 switch (NewOpcode) {
35915 case ISD::FMA: NewOpcode = X86ISD::FMADDS1_RND; break;
35916 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
35917 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
35918 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
35919 }
35920 } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
35921 switch (NewOpcode) {
35922 case ISD::FMA: NewOpcode = X86ISD::FMADDS3_RND; break;
35923 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
35924 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
35925 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
35926 }
35927 } else if (N->getOpcode() == X86ISD::FMADD4S) {
35928 switch (NewOpcode) {
35929 case ISD::FMA: NewOpcode = X86ISD::FMADD4S; break;
35930 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB4S; break;
35931 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD4S; break;
35932 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB4S; break;
35933 }
35934 } else {
35935     llvm_unreachable("Unexpected opcode!");
35936 }
35937
35938   // Only return the node if the opcode was changed or one of the
35939   // operands was negated. If not, we'll just recreate the same node.
35940 if (HasNeg || NewOpcode != N->getOpcode()) {
35941 if (N->getNumOperands() == 4)
35942 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
35943 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
35944 }
35945
35946 return SDValue();
35947}
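The negation folds above rest on the usual FMA sign identities: fma(-a, b, c) is FNMADD, fma(a, b, -c) is FMSUB, and so on. A minimal standalone sketch (not part of this file) that checks those identities with std::fma, using values chosen so every product and sum is exactly representable:

#include <cassert>
#include <cmath>

int main() {
  const double A = 1.5, B = -2.25, C = 0.75; // all values and products exact
  assert(std::fma(-A, B, C) == -(A * B) + C);  // FNMADD
  assert(std::fma(A, B, -C) == (A * B) - C);   // FMSUB
  assert(std::fma(-A, B, -C) == -(A * B) - C); // FNMSUB
  return 0;
}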
35948
35949// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
35950static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
35951 const X86Subtarget &Subtarget) {
35952 SDLoc dl(N);
35953 EVT VT = N->getValueType(0);
35954
35955 SDValue NegVal = isFNEG(N->getOperand(2).getNode());
35956 if (!NegVal)
35957 return SDValue();
35958
35959 unsigned NewOpcode;
35960 switch (N->getOpcode()) {
35961   default: llvm_unreachable("Unexpected opcode!");
35962 case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break;
35963 case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
35964 case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break;
35965 case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
35966 }
35967
35968 if (N->getNumOperands() == 4)
35969 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
35970 NegVal, N->getOperand(3));
35971 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
35972 NegVal);
35973}
35974
35975static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
35976 TargetLowering::DAGCombinerInfo &DCI,
35977 const X86Subtarget &Subtarget) {
35978 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
35979 // (and (i32 x86isd::setcc_carry), 1)
35980 // This eliminates the zext. This transformation is necessary because
35981 // ISD::SETCC is always legalized to i8.
35982 SDLoc dl(N);
35983 SDValue N0 = N->getOperand(0);
35984 EVT VT = N->getValueType(0);
35985
35986 if (N0.getOpcode() == ISD::AND &&
35987 N0.hasOneUse() &&
35988 N0.getOperand(0).hasOneUse()) {
35989 SDValue N00 = N0.getOperand(0);
35990 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
35991 if (!isOneConstant(N0.getOperand(1)))
35992 return SDValue();
35993 return DAG.getNode(ISD::AND, dl, VT,
35994 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
35995 N00.getOperand(0), N00.getOperand(1)),
35996 DAG.getConstant(1, dl, VT));
35997 }
35998 }
35999
36000 if (N0.getOpcode() == ISD::TRUNCATE &&
36001 N0.hasOneUse() &&
36002 N0.getOperand(0).hasOneUse()) {
36003 SDValue N00 = N0.getOperand(0);
36004 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
36005 return DAG.getNode(ISD::AND, dl, VT,
36006 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
36007 N00.getOperand(0), N00.getOperand(1)),
36008 DAG.getConstant(1, dl, VT));
36009 }
36010 }
36011
36012 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
36013 return NewCMov;
36014
36015 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
36016 return V;
36017
36018 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
36019 return V;
36020
36021 if (VT.is256BitVector())
36022 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
36023 return R;
36024
36025 if (SDValue DivRem8 = getDivRem8(N, DAG))
36026 return DivRem8;
36027
36028 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
36029 return NewAdd;
36030
36031 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
36032 return R;
36033
36034 return SDValue();
36035}
36036
36037/// Try to map a 128-bit or larger integer comparison to vector instructions
36038/// before type legalization splits it up into chunks.
36039static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
36040 const X86Subtarget &Subtarget) {
36041 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
36042   assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
36043
36044 // We're looking for an oversized integer equality comparison, but ignore a
36045 // comparison with zero because that gets special treatment in EmitTest().
36046 SDValue X = SetCC->getOperand(0);
36047 SDValue Y = SetCC->getOperand(1);
36048 EVT OpVT = X.getValueType();
36049 unsigned OpSize = OpVT.getSizeInBits();
36050 if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
36051 return SDValue();
36052
36053 // Bail out if we know that this is not really just an oversized integer.
36054 if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
36055 peekThroughBitcasts(Y).getValueType() == MVT::f128)
36056 return SDValue();
36057
36058 // TODO: Use PXOR + PTEST for SSE4.1 or later?
36059 // TODO: Add support for AVX-512.
36060 EVT VT = SetCC->getValueType(0);
36061 SDLoc DL(SetCC);
36062 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
36063 (OpSize == 256 && Subtarget.hasAVX2())) {
36064 EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
36065 SDValue VecX = DAG.getBitcast(VecVT, X);
36066 SDValue VecY = DAG.getBitcast(VecVT, Y);
36067
36068 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
36069 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
36070 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
36071 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
36072 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
36073 SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
36074 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
36075 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
36076 MVT::i32);
36077 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
36078 }
36079
36080 return SDValue();
36081}
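For reference, the lowering described in the comments above corresponds to the following standalone snippet, a sketch assuming an x86 host with SSE2 and <emmintrin.h>: a 16-byte equality test becomes PCMPEQB + PMOVMSKB + a compare against 0xFFFF.

#include <cstdint>
#include <emmintrin.h> // SSE2

// Returns true iff the two 16-byte blocks are identical, using the same
// PCMPEQB + PMOVMSKB idiom the combine above emits for i128 equality.
static bool equal16(const void *A, const void *B) {
  __m128i VA = _mm_loadu_si128(static_cast<const __m128i *>(A));
  __m128i VB = _mm_loadu_si128(static_cast<const __m128i *>(B));
  __m128i Eq = _mm_cmpeq_epi8(VA, VB);    // 0xFF per matching byte
  return _mm_movemask_epi8(Eq) == 0xFFFF; // all 16 bytes matched
}

int main() {
  uint8_t X[16] = {0}, Y[16] = {0};
  bool Same = equal16(X, Y);
  Y[7] = 1;
  bool Diff = equal16(X, Y);
  return (Same && !Diff) ? 0 : 1;
}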
36082
36083static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
36084 const X86Subtarget &Subtarget) {
36085 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
36086 SDValue LHS = N->getOperand(0);
36087 SDValue RHS = N->getOperand(1);
36088 EVT VT = N->getValueType(0);
36089 SDLoc DL(N);
36090
36091 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
36092 EVT OpVT = LHS.getValueType();
36093 // 0-x == y --> x+y == 0
36094 // 0-x != y --> x+y != 0
36095 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
36096 LHS.hasOneUse()) {
36097 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
36098 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
36099 }
36100 // x == 0-y --> x+y == 0
36101 // x != 0-y --> x+y != 0
36102 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
36103 RHS.hasOneUse()) {
36104 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
36105 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
36106 }
36107
36108 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
36109 return V;
36110 }
36111
36112 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
36113 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
36114 // Put build_vectors on the right.
36115 if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
36116 std::swap(LHS, RHS);
36117 CC = ISD::getSetCCSwappedOperands(CC);
36118 }
36119
36120 bool IsSEXT0 =
36121 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
36122 (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
36123 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
36124
36125 if (IsSEXT0 && IsVZero1) {
36126       assert(VT == LHS.getOperand(0).getValueType() &&
36127              "Unexpected operand type");
36128 if (CC == ISD::SETGT)
36129 return DAG.getConstant(0, DL, VT);
36130 if (CC == ISD::SETLE)
36131 return DAG.getConstant(1, DL, VT);
36132 if (CC == ISD::SETEQ || CC == ISD::SETGE)
36133 return DAG.getNOT(DL, LHS.getOperand(0), VT);
36134
36135       assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
36136              "Unexpected condition code!");
36137 return LHS.getOperand(0);
36138 }
36139 }
36140
36141 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
36142 // to avoid scalarization via legalization because v4i32 is not a legal type.
36143 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
36144 LHS.getValueType() == MVT::v4f32)
36145 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
36146
36147 return SDValue();
36148}
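The 0-x == y folds above rely on wrap-around arithmetic: (0 - x) == y exactly when (x + y) == 0 modulo 2^n. A minimal sketch, using unsigned types so the wrap is well defined:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Xs[] = {5u, 0x80000001u, 0u};
  for (uint32_t X : Xs) {
    const uint32_t Y = 0u - X; // the value that makes both sides true
    assert(((0u - X) == Y) == ((X + Y) == 0u));
    assert(((0u - X) == (Y + 1u)) == ((X + Y + 1u) == 0u)); // and both false
  }
  return 0;
}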
36149
36150static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
36151 TargetLowering::DAGCombinerInfo &DCI) {
36152 SDValue Src = N->getOperand(0);
36153 MVT SrcVT = Src.getSimpleValueType();
36154
36155 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36156 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
36157 !DCI.isBeforeLegalizeOps());
36158
36159 // MOVMSK only uses the MSB from each vector element.
36160 KnownBits Known;
36161 APInt DemandedMask(APInt::getSignMask(SrcVT.getScalarSizeInBits()));
36162 if (TLI.SimplifyDemandedBits(Src, DemandedMask, Known, TLO)) {
36163 DCI.AddToWorklist(Src.getNode());
36164 DCI.CommitTargetLoweringOpt(TLO);
36165 return SDValue(N, 0);
36166 }
36167
36168 return SDValue();
36169}
36170
36171static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
36172 TargetLowering::DAGCombinerInfo &DCI,
36173 const X86Subtarget &Subtarget) {
36174 SDLoc DL(N);
36175
36176 // Pre-shrink oversized index elements to avoid triggering scalarization.
36177 if (DCI.isBeforeLegalize()) {
36178 SDValue Index = N->getOperand(4);
36179 if (Index.getScalarValueSizeInBits() > 64) {
36180 EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), MVT::i64,
36181 Index.getValueType().getVectorNumElements());
36182 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index);
36183 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
36184 NewOps[4] = Trunc;
36185 DAG.UpdateNodeOperands(N, NewOps);
36186 DCI.AddToWorklist(N);
36187 return SDValue(N, 0);
36188 }
36189 }
36190
36191 // Try to remove sign extends from i32 to i64 on the index.
36192 // Only do this before legalize in case we are relying on it for
36193 // legalization.
36194 // TODO: We should maybe remove any sign extend once we learn how to sign
36195 // extend narrow index during lowering.
36196 if (DCI.isBeforeLegalizeOps()) {
36197 SDValue Index = N->getOperand(4);
36198 if (Index.getScalarValueSizeInBits() == 64 &&
36199 Index.getOpcode() == ISD::SIGN_EXTEND &&
36200 Index.getOperand(0).getScalarValueSizeInBits() == 32) {
36201 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
36202 NewOps[4] = Index.getOperand(0);
36203 DAG.UpdateNodeOperands(N, NewOps);
36204       // The original sign extend now has fewer uses; add it back to the
36205       // worklist in case it needs to be removed.
36206 DCI.AddToWorklist(Index.getNode());
36207 DCI.AddToWorklist(N);
36208 return SDValue(N, 0);
36209 }
36210 }
36211
36212 // Gather and Scatter instructions use k-registers for masks. The type of
36213 // the masks is v*i1. So the mask will be truncated anyway.
36214   // The SIGN_EXTEND_INREG may be dropped.
36215 SDValue Mask = N->getOperand(2);
36216 if (Subtarget.hasAVX512() && Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
36217 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
36218 NewOps[2] = Mask.getOperand(0);
36219 DAG.UpdateNodeOperands(N, NewOps);
36220 }
36221
36222 // With AVX2 we only demand the upper bit of the mask.
36223 if (!Subtarget.hasAVX512()) {
36224 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36225 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
36226 !DCI.isBeforeLegalizeOps());
36227 KnownBits Known;
36228 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
36229 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) {
36230 DCI.AddToWorklist(Mask.getNode());
36231 DCI.CommitTargetLoweringOpt(TLO);
36232 return SDValue(N, 0);
36233 }
36234 }
36235
36236 return SDValue();
36237}
36238
36239// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
36240static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
36241 const X86Subtarget &Subtarget) {
36242 SDLoc DL(N);
36243 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
36244 SDValue EFLAGS = N->getOperand(1);
36245
36246 // Try to simplify the EFLAGS and condition code operands.
36247 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
36248 return getSETCC(CC, Flags, DL, DAG);
36249
36250 return SDValue();
36251}
36252
36253/// Optimize branch condition evaluation.
36254static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
36255 const X86Subtarget &Subtarget) {
36256 SDLoc DL(N);
36257 SDValue EFLAGS = N->getOperand(3);
36258 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
36259
36260 // Try to simplify the EFLAGS and condition code operands.
36261 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
36262 // RAUW them under us.
36263 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
36264 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
36265 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
36266 N->getOperand(1), Cond, Flags);
36267 }
36268
36269 return SDValue();
36270}
36271
36272static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
36273 SelectionDAG &DAG) {
36274 // Take advantage of vector comparisons producing 0 or -1 in each lane to
36275 // optimize away operation when it's from a constant.
36276 //
36277 // The general transformation is:
36278 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
36279 // AND(VECTOR_CMP(x,y), constant2)
36280 // constant2 = UNARYOP(constant)
36281
36282 // Early exit if this isn't a vector operation, the operand of the
36283 // unary operation isn't a bitwise AND, or if the sizes of the operations
36284 // aren't the same.
36285 EVT VT = N->getValueType(0);
36286 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
36287 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
36288 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
36289 return SDValue();
36290
36291 // Now check that the other operand of the AND is a constant. We could
36292 // make the transformation for non-constant splats as well, but it's unclear
36293 // that would be a benefit as it would not eliminate any operations, just
36294 // perform one more step in scalar code before moving to the vector unit.
36295 if (BuildVectorSDNode *BV =
36296 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
36297 // Bail out if the vector isn't a constant.
36298 if (!BV->isConstant())
36299 return SDValue();
36300
36301 // Everything checks out. Build up the new and improved node.
36302 SDLoc DL(N);
36303 EVT IntVT = BV->getValueType(0);
36304 // Create a new constant of the appropriate type for the transformed
36305 // DAG.
36306 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
36307 // The AND node needs bitcasts to/from an integer vector type around it.
36308 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
36309 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
36310 N->getOperand(0)->getOperand(0), MaskConst);
36311 SDValue Res = DAG.getBitcast(VT, NewAnd);
36312 return Res;
36313 }
36314
36315 return SDValue();
36316}
36317
36318static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
36319 const X86Subtarget &Subtarget) {
36320 SDValue Op0 = N->getOperand(0);
36321 EVT VT = N->getValueType(0);
36322 EVT InVT = Op0.getValueType();
36323 EVT InSVT = InVT.getScalarType();
36324
36325 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
36326 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
36327 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
36328 SDLoc dl(N);
36329 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
36330 InVT.getVectorNumElements());
36331 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
36332
36333 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
36334 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
36335 }
36336
36337 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
36338 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
36339 // the optimization here.
36340 if (DAG.SignBitIsZero(Op0))
36341 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
36342
36343 return SDValue();
36344}
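The SignBitIsZero fold is the observation that unsigned and signed integer-to-FP conversion agree whenever the sign bit of the input is clear. A minimal sketch:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Us[] = {0u, 42u, 0x7FFFFFF0u}; // sign bit clear in all cases
  for (uint32_t U : Us) {
    const double FromUnsigned = static_cast<double>(U);
    const double FromSigned = static_cast<double>(static_cast<int32_t>(U));
    assert(FromUnsigned == FromSigned);
  }
  return 0;
}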
36345
36346static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
36347 const X86Subtarget &Subtarget) {
36348 // First try to optimize away the conversion entirely when it's
36349 // conditionally from a constant. Vectors only.
36350 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
36351 return Res;
36352
36353 // Now move on to more general possibilities.
36354 SDValue Op0 = N->getOperand(0);
36355 EVT VT = N->getValueType(0);
36356 EVT InVT = Op0.getValueType();
36357 EVT InSVT = InVT.getScalarType();
36358
36359 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
36360 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
36361 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
36362 if (InVT.isVector() &&
36363 (InSVT == MVT::i8 || InSVT == MVT::i16 ||
36364 (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
36365 SDLoc dl(N);
36366 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
36367 InVT.getVectorNumElements());
36368 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
36369 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
36370 }
36371
36372 // Without AVX512DQ we only support i64 to float scalar conversion. For both
36373 // vectors and scalars, see if we know that the upper bits are all the sign
36374 // bit, in which case we can truncate the input to i32 and convert from that.
36375 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
36376 unsigned BitWidth = InVT.getScalarSizeInBits();
36377 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
36378 if (NumSignBits >= (BitWidth - 31)) {
36379 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
36380 if (InVT.isVector())
36381 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
36382 InVT.getVectorNumElements());
36383 SDLoc dl(N);
36384 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
36385 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
36386 }
36387 }
36388
36389 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
36390 // a 32-bit target where SSE doesn't support i64->FP operations.
36391 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
36392 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
36393 EVT LdVT = Ld->getValueType(0);
36394
36395 // This transformation is not supported if the result type is f16 or f128.
36396 if (VT == MVT::f16 || VT == MVT::f128)
36397 return SDValue();
36398
36399 if (!Ld->isVolatile() && !VT.isVector() &&
36400 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
36401 !Subtarget.is64Bit() && LdVT == MVT::i64) {
36402 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
36403 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
36404 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
36405 return FILDChain;
36406 }
36407 }
36408 return SDValue();
36409}
36410
36411static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
36412 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
36413 MVT VT = N->getSimpleValueType(0);
36414 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
36415 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
36416 N->getOperand(0), N->getOperand(1),
36417 Flags);
36418 }
36419
36420 return SDValue();
36421}
36422
36423// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
36424static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
36425 TargetLowering::DAGCombinerInfo &DCI) {
36426 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
36427 // the result is either zero or one (depending on the input carry bit).
36428 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
36429 if (X86::isZeroNode(N->getOperand(0)) &&
36430 X86::isZeroNode(N->getOperand(1)) &&
36431 // We don't have a good way to replace an EFLAGS use, so only do this when
36432 // dead right now.
36433 SDValue(N, 1).use_empty()) {
36434 SDLoc DL(N);
36435 EVT VT = N->getValueType(0);
36436 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
36437 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
36438 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
36439 DAG.getConstant(X86::COND_B, DL,
36440 MVT::i8),
36441 N->getOperand(2)),
36442 DAG.getConstant(1, DL, VT));
36443 return DCI.CombineTo(N, Res1, CarryOut);
36444 }
36445
36446 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
36447 MVT VT = N->getSimpleValueType(0);
36448 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
36449 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
36450 N->getOperand(0), N->getOperand(1),
36451 Flags);
36452 }
36453
36454 return SDValue();
36455}
36456
36457/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
36458/// which is more useful than 0/1 in some cases.
36459static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
36460 SDLoc DL(N);
36461 // "Condition code B" is also known as "the carry flag" (CF).
36462 SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
36463 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
36464 MVT VT = N->getSimpleValueType(0);
36465 if (VT == MVT::i8)
36466 return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
36467
36468   assert(VT == MVT::i1 && "Unexpected type for SETCC node");
36469 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
36470}
36471
36472/// If this is an add or subtract where one operand is produced by a cmp+setcc,
36473/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
36474/// with CMP+{ADC, SBB}.
36475static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
36476 bool IsSub = N->getOpcode() == ISD::SUB;
36477 SDValue X = N->getOperand(0);
36478 SDValue Y = N->getOperand(1);
36479
36480 // If this is an add, canonicalize a zext operand to the RHS.
36481 // TODO: Incomplete? What if both sides are zexts?
36482 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
36483 Y.getOpcode() != ISD::ZERO_EXTEND)
36484 std::swap(X, Y);
36485
36486 // Look through a one-use zext.
36487 bool PeekedThroughZext = false;
36488 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
36489 Y = Y.getOperand(0);
36490 PeekedThroughZext = true;
36491 }
36492
36493 // If this is an add, canonicalize a setcc operand to the RHS.
36494 // TODO: Incomplete? What if both sides are setcc?
36495 // TODO: Should we allow peeking through a zext of the other operand?
36496 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
36497 Y.getOpcode() != X86ISD::SETCC)
36498 std::swap(X, Y);
36499
36500 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
36501 return SDValue();
36502
36503 SDLoc DL(N);
36504 EVT VT = N->getValueType(0);
36505 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
36506
36507 // If X is -1 or 0, then we have an opportunity to avoid constants required in
36508 // the general case below.
36509 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
36510 if (ConstantX) {
36511 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
36512 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
36513 // This is a complicated way to get -1 or 0 from the carry flag:
36514 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
36515 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
36516 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
36517 DAG.getConstant(X86::COND_B, DL, MVT::i8),
36518 Y.getOperand(1));
36519 }
36520
36521 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
36522 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
36523 SDValue EFLAGS = Y->getOperand(1);
36524 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
36525 EFLAGS.getValueType().isInteger() &&
36526 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
36527 // Swap the operands of a SUB, and we have the same pattern as above.
36528 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
36529 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
36530 SDValue NewSub = DAG.getNode(
36531 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
36532 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
36533 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
36534 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
36535 DAG.getConstant(X86::COND_B, DL, MVT::i8),
36536 NewEFLAGS);
36537 }
36538 }
36539 }
36540
36541 if (CC == X86::COND_B) {
36542 // X + SETB Z --> X + (mask SBB Z, Z)
36543 // X - SETB Z --> X - (mask SBB Z, Z)
36544 // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
36545 SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
36546 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
36547 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
36548 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
36549 }
36550
36551 if (CC == X86::COND_A) {
36552 SDValue EFLAGS = Y->getOperand(1);
36553 // Try to convert COND_A into COND_B in an attempt to facilitate
36554 // materializing "setb reg".
36555 //
36556 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
36557 // cannot take an immediate as its first operand.
36558 //
36559 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
36560 EFLAGS.getValueType().isInteger() &&
36561 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
36562 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
36563 EFLAGS.getNode()->getVTList(),
36564 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
36565 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
36566 SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
36567 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
36568 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
36569 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
36570 }
36571 }
36572
36573 if (CC != X86::COND_E && CC != X86::COND_NE)
36574 return SDValue();
36575
36576 SDValue Cmp = Y.getOperand(1);
36577 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
36578 !X86::isZeroNode(Cmp.getOperand(1)) ||
36579 !Cmp.getOperand(0).getValueType().isInteger())
36580 return SDValue();
36581
36582 SDValue Z = Cmp.getOperand(0);
36583 EVT ZVT = Z.getValueType();
36584
36585 // If X is -1 or 0, then we have an opportunity to avoid constants required in
36586 // the general case below.
36587 if (ConstantX) {
36588 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
36589 // fake operands:
36590 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
36591 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
36592 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
36593 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
36594 SDValue Zero = DAG.getConstant(0, DL, ZVT);
36595 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
36596 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
36597 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
36598 DAG.getConstant(X86::COND_B, DL, MVT::i8),
36599 SDValue(Neg.getNode(), 1));
36600 }
36601
36602 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
36603 // with fake operands:
36604 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
36605 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
36606 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
36607 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
36608 SDValue One = DAG.getConstant(1, DL, ZVT);
36609 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
36610 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
36611 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
36612 }
36613 }
36614
36615 // (cmp Z, 1) sets the carry flag if Z is 0.
36616 SDValue One = DAG.getConstant(1, DL, ZVT);
36617 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
36618
36619 // Add the flags type for ADC/SBB nodes.
36620 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
36621
36622 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
36623 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
36624 if (CC == X86::COND_NE)
36625 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
36626 DAG.getConstant(-1ULL, DL, VT), Cmp1);
36627
36628 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
36629 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
36630 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
36631 DAG.getConstant(0, DL, VT), Cmp1);
36632}
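The ADC/SBB selections at the end of this function encode the identities spelled out in the comments, where CF is the carry of (cmp Z, 1), i.e. CF == (Z == 0). A minimal scalar sketch (not part of this file) checking them with wrapping unsigned arithmetic:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t X = 100;
  const uint32_t Zs[] = {0, 7};
  for (uint32_t Z : Zs) {
    // (cmp Z, 1) sets the carry flag exactly when Z == 0 (unsigned Z < 1).
    const uint32_t CF = (Z < 1) ? 1u : 0u;
    assert(X - (Z != 0) == X + 0xFFFFFFFFu + CF); // adc X, -1, (cmp Z, 1)
    assert(X + (Z != 0) == X - 0xFFFFFFFFu - CF); // sbb X, -1, (cmp Z, 1)
    assert(X - (Z == 0) == X - CF);               // sbb X, 0,  (cmp Z, 1)
    assert(X + (Z == 0) == X + CF);               // adc X, 0,  (cmp Z, 1)
  }
  return 0;
}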
36633
36634static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
36635 const X86Subtarget &Subtarget) {
36636 if (!Subtarget.hasSSE2())
36637 return SDValue();
36638
36639 SDValue MulOp = N->getOperand(0);
36640 SDValue Phi = N->getOperand(1);
36641
36642 if (MulOp.getOpcode() != ISD::MUL)
36643 std::swap(MulOp, Phi);
36644 if (MulOp.getOpcode() != ISD::MUL)
36645 return SDValue();
36646
36647 ShrinkMode Mode;
36648 if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
36649 return SDValue();
36650
36651 EVT VT = N->getValueType(0);
36652
36653 unsigned RegSize = 128;
36654 if (Subtarget.hasBWI())
36655 RegSize = 512;
36656 else if (Subtarget.hasAVX2())
36657 RegSize = 256;
36658 unsigned VectorSize = VT.getVectorNumElements() * 16;
36659 // If the vector size is less than 128, or greater than the supported RegSize,
36660 // do not use PMADD.
36661 if (VectorSize < 128 || VectorSize > RegSize)
36662 return SDValue();
36663
36664 SDLoc DL(N);
36665 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
36666 VT.getVectorNumElements());
36667 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
36668 VT.getVectorNumElements() / 2);
36669
36670 // Shrink the operands of mul.
36671 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
36672 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
36673
36674 // Madd vector size is half of the original vector size
36675 SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
36676 // Fill the rest of the output with 0
36677 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
36678 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
36679 return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
36680}
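X86ISD::VPMADDWD corresponds to the PMADDWD instruction: pairs of 16-bit products summed into 32-bit lanes, which is why the multiply operands can be shrunk to i16 here. A standalone sketch assuming an SSE2 host and the _mm_madd_epi16 intrinsic:

#include <cassert>
#include <cstdint>
#include <emmintrin.h> // SSE2: _mm_madd_epi16 is PMADDWD

int main() {
  // Eight i16 lanes per operand; PMADDWD yields four i32 lanes, each equal to
  // a[2i]*b[2i] + a[2i+1]*b[2i+1] -- the core of the reduction handled above.
  __m128i A = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
  __m128i B = _mm_setr_epi16(10, 20, 30, 40, 50, 60, 70, 80);
  __m128i P = _mm_madd_epi16(A, B);
  int32_t Out[4];
  _mm_storeu_si128(reinterpret_cast<__m128i *>(Out), P);
  assert(Out[0] == 1 * 10 + 2 * 20); // 50
  assert(Out[3] == 7 * 70 + 8 * 80); // 1130
  return 0;
}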
36681
36682static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
36683 const X86Subtarget &Subtarget) {
36684 if (!Subtarget.hasSSE2())
36685 return SDValue();
36686
36687 SDLoc DL(N);
36688 EVT VT = N->getValueType(0);
36689 SDValue Op0 = N->getOperand(0);
36690 SDValue Op1 = N->getOperand(1);
36691
36692 // TODO: There's nothing special about i32, any integer type above i16 should
36693 // work just as well.
36694 if (!VT.isVector() || !VT.isSimple() ||
36695 !(VT.getVectorElementType() == MVT::i32))
36696 return SDValue();
36697
36698 unsigned RegSize = 128;
36699 if (Subtarget.hasBWI())
36700 RegSize = 512;
36701 else if (Subtarget.hasAVX2())
36702 RegSize = 256;
36703
36704 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
36705 // TODO: We should be able to handle larger vectors by splitting them before
36706 // feeding them into several SADs, and then reducing over those.
36707 if (VT.getSizeInBits() / 4 > RegSize)
36708 return SDValue();
36709
36710 // We know N is a reduction add, which means one of its operands is a phi.
36711 // To match SAD, we need the other operand to be a vector select.
36712 SDValue SelectOp, Phi;
36713 if (Op0.getOpcode() == ISD::VSELECT) {
36714 SelectOp = Op0;
36715 Phi = Op1;
36716 } else if (Op1.getOpcode() == ISD::VSELECT) {
36717 SelectOp = Op1;
36718 Phi = Op0;
36719 } else
36720 return SDValue();
36721
36722 // Check whether we have an abs-diff pattern feeding into the select.
36723   if (!detectZextAbsDiff(SelectOp, Op0, Op1))
36724 return SDValue();
36725
36726 // SAD pattern detected. Now build a SAD instruction and an addition for
36727 // reduction. Note that the number of elements of the result of SAD is less
36728   // than the number of elements of its input. Therefore, we can only update
36729   // part of the elements in the reduction vector.
36730 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
36731
36732 // The output of PSADBW is a vector of i64.
36733 // We need to turn the vector of i64 into a vector of i32.
36734 // If the reduction vector is at least as wide as the psadbw result, just
36735 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
36736 // anyway.
36737 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
36738 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
36739 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
36740 else
36741 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
36742
36743 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
36744 // Fill the upper elements with zero to match the add width.
36745 SDValue Zero = DAG.getConstant(0, DL, VT);
36746 Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
36747 DAG.getIntPtrConstant(0, DL));
36748 }
36749
36750 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
36751}
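For reference, PSADBW (created by createPSADBW above) sums absolute byte differences per 8-byte half into a 64-bit lane. A standalone sketch assuming an SSE2 host and the _mm_sad_epu8 intrinsic:

#include <cassert>
#include <cstdint>
#include <emmintrin.h> // SSE2: _mm_sad_epu8 is PSADBW

int main() {
  // Per 8-byte half: sum of |a[i] - b[i]| into one 64-bit lane. This is the
  // zext(abs-diff) reduction that the SAD pattern above is matched into.
  __m128i A = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8,
                            9, 10, 11, 12, 13, 14, 15, 16);
  __m128i B = _mm_setzero_si128();
  __m128i Sad = _mm_sad_epu8(A, B);
  uint64_t Out[2];
  _mm_storeu_si128(reinterpret_cast<__m128i *>(Out), Sad);
  assert(Out[0] == 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8);        // 36
  assert(Out[1] == 9 + 10 + 11 + 12 + 13 + 14 + 15 + 16); // 100
  return 0;
}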
36752
36753/// Convert vector increment or decrement to sub/add with an all-ones constant:
36754/// add X, <1, 1...> --> sub X, <-1, -1...>
36755/// sub X, <1, 1...> --> add X, <-1, -1...>
36756/// The all-ones vector constant can be materialized using a pcmpeq instruction
36757/// that is commonly recognized as an idiom (has no register dependency), so
36758/// that's better/smaller than loading a splat 1 constant.
36759static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
36760   assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
36761          "Unexpected opcode for increment/decrement transform");
36762
36763 // Pseudo-legality check: getOnesVector() expects one of these types, so bail
36764 // out and wait for legalization if we have an unsupported vector length.
36765 EVT VT = N->getValueType(0);
36766 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
36767 return SDValue();
36768
36769 SDNode *N1 = N->getOperand(1).getNode();
36770 APInt SplatVal;
36771 if (!ISD::isConstantSplatVector(N1, SplatVal) ||
36772 !SplatVal.isOneValue())
36773 return SDValue();
36774
36775 SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
36776 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
36777 return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
36778}
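The point of this rewrite is that the all-ones constant is essentially free to materialize: PCMPEQ of a register with itself yields all ones without a load, so "add X, 1" becomes "sub X, -1". A standalone SSE2 sketch (assuming <emmintrin.h>) of the same idiom:

#include <cassert>
#include <cstdint>
#include <emmintrin.h> // SSE2

int main() {
  // pcmpeqd xmm, xmm produces all-ones without touching memory, so
  // "add X, <1,1,1,1>" can be emitted as "sub X, <-1,-1,-1,-1>".
  __m128i X = _mm_setr_epi32(10, 20, 30, 40);
  __m128i AllOnes = _mm_cmpeq_epi32(X, X); // <-1,-1,-1,-1>
  __m128i Inc = _mm_sub_epi32(X, AllOnes); // X + 1 per lane
  int32_t Out[4];
  _mm_storeu_si128(reinterpret_cast<__m128i *>(Out), Inc);
  assert(Out[0] == 11 && Out[1] == 21 && Out[2] == 31 && Out[3] == 41);
  return 0;
}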
36779
36780static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
36781 const X86Subtarget &Subtarget) {
36782 const SDNodeFlags Flags = N->getFlags();
36783 if (Flags.hasVectorReduction()) {
36784 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
36785 return Sad;
36786 if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
36787 return MAdd;
36788 }
36789 EVT VT = N->getValueType(0);
36790 SDValue Op0 = N->getOperand(0);
36791 SDValue Op1 = N->getOperand(1);
36792
36793 // Try to synthesize horizontal adds from adds of shuffles.
36794 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
36795 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
36796 isHorizontalBinOp(Op0, Op1, true))
36797 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
36798
36799 if (SDValue V = combineIncDecVector(N, DAG))
36800 return V;
36801
36802 return combineAddOrSubToADCOrSBB(N, DAG);
36803}
36804
36805static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
36806 const X86Subtarget &Subtarget) {
36807 SDValue Op0 = N->getOperand(0);
36808 SDValue Op1 = N->getOperand(1);
36809 EVT VT = N->getValueType(0);
36810
36811   // PSUBUS is supported starting from SSE2, but the special preprocessing
36812   // for v8i32 requires umin, which first appears in SSE41.
36813 if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
36814 !(Subtarget.hasSSE41() && (VT == MVT::v8i32)) &&
36815 !(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
36816 !(Subtarget.hasAVX512() && Subtarget.hasBWI() &&
36817 (VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 ||
36818 VT == MVT::v8i64)))
36819 return SDValue();
36820
36821 SDValue SubusLHS, SubusRHS;
36822   // Try to find umax(a,b) - b or a - umin(a,b) patterns;
36823   // they may be converted to subus(a,b).
36824   // TODO: Need to add IR canonicalization for this code.
36825 if (Op0.getOpcode() == ISD::UMAX) {
36826 SubusRHS = Op1;
36827 SDValue MaxLHS = Op0.getOperand(0);
36828 SDValue MaxRHS = Op0.getOperand(1);
36829 if (MaxLHS == Op1)
36830 SubusLHS = MaxRHS;
36831 else if (MaxRHS == Op1)
36832 SubusLHS = MaxLHS;
36833 else
36834 return SDValue();
36835 } else if (Op1.getOpcode() == ISD::UMIN) {
36836 SubusLHS = Op0;
36837 SDValue MinLHS = Op1.getOperand(0);
36838 SDValue MinRHS = Op1.getOperand(1);
36839 if (MinLHS == Op0)
36840 SubusRHS = MinRHS;
36841 else if (MinRHS == Op0)
36842 SubusRHS = MinLHS;
36843 else
36844 return SDValue();
36845 } else
36846 return SDValue();
36847
36848 // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
36849 // special preprocessing in some cases.
36850 if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
36851 return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);
36852
36853   // The special preprocessing case can only be applied
36854   // if the value was zero extended from 16 bits,
36855   // so we require the leading 16 bits to be zero for 32-bit
36856   // values, or the leading 48 bits for 64-bit values.
36857 KnownBits Known;
36858 DAG.computeKnownBits(SubusLHS, Known);
36859 unsigned NumZeros = Known.countMinLeadingZeros();
36860 if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
36861 return SDValue();
36862
36863 EVT ExtType = SubusLHS.getValueType();
36864 EVT ShrinkedType;
36865 if (VT == MVT::v8i32 || VT == MVT::v8i64)
36866 ShrinkedType = MVT::v8i16;
36867 else
36868 ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
36869
36870   // If SubusLHS is zero-extended, truncate SubusRHS to its size:
36871   // SubusRHS = umin(0xFFF.., SubusRHS).
36872 SDValue SaturationConst =
36873 DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
36874 ShrinkedType.getScalarSizeInBits()),
36875 SDLoc(SubusLHS), ExtType);
36876 SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
36877 SaturationConst);
36878 SDValue NewSubusLHS =
36879 DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
36880 SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
36881 SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType,
36882 NewSubusLHS, NewSubusRHS);
36883   // Zero-extend the result; it may be used somewhere as 32 bits. If it is
36884   // not, the zext and the following trunc will fold away.
36885 return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
36886}
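The umax/umin shapes matched above are algebraically the unsigned saturating subtraction that PSUBUS implements: max(a,b) - b == a - min(a,b) == sat(a - b). A standalone sketch comparing that identity against the SSE2 intrinsic _mm_subs_epu16 (illustrative only):

#include <cassert>
#include <cstdint>
#include <emmintrin.h> // SSE2: _mm_subs_epu16 is PSUBUSW

int main() {
  const uint16_t A[8] = {5, 100, 0, 7, 9, 1, 2, 3};
  const uint16_t B[8] = {9, 50, 4, 7, 1, 1, 5, 0};
  __m128i VA = _mm_loadu_si128(reinterpret_cast<const __m128i *>(A));
  __m128i VB = _mm_loadu_si128(reinterpret_cast<const __m128i *>(B));
  uint16_t Out[8];
  _mm_storeu_si128(reinterpret_cast<__m128i *>(Out), _mm_subs_epu16(VA, VB));
  for (int I = 0; I != 8; ++I) {
    const uint16_t Mx = A[I] > B[I] ? A[I] : B[I];
    // max(a,b) - b == a - min(a,b) == saturating a - b.
    assert(Out[I] == static_cast<uint16_t>(Mx - B[I]));
  }
  return 0;
}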
36887
36888static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
36889 const X86Subtarget &Subtarget) {
36890 SDValue Op0 = N->getOperand(0);
36891 SDValue Op1 = N->getOperand(1);
36892
36893 // X86 can't encode an immediate LHS of a sub. See if we can push the
36894 // negation into a preceding instruction.
36895 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
36896 // If the RHS of the sub is a XOR with one use and a constant, invert the
36897 // immediate. Then add one to the LHS of the sub so we can turn
36898 // X-Y -> X+~Y+1, saving one register.
36899 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
36900 isa<ConstantSDNode>(Op1.getOperand(1))) {
36901 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
36902 EVT VT = Op0.getValueType();
36903 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
36904 Op1.getOperand(0),
36905 DAG.getConstant(~XorC, SDLoc(Op1), VT));
36906 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
36907 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
36908 }
36909 }
36910
36911 // Try to synthesize horizontal subs from subs of shuffles.
36912 EVT VT = N->getValueType(0);
36913 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
36914 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
36915 isHorizontalBinOp(Op0, Op1, false))
36916 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
36917
36918 if (SDValue V = combineIncDecVector(N, DAG))
36919 return V;
36920
36921 // Try to create PSUBUS if SUB's argument is max/min
36922 if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
36923 return V;
36924
36925 return combineAddOrSubToADCOrSBB(N, DAG);
36926}
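The constant-LHS rewrite above uses the two's-complement identity X - Y == X + ~Y + 1, so C - (y ^ K) can be re-expressed as (y ^ ~K) + (C + 1), folding the negation into the existing XOR. A minimal sketch with wrapping unsigned arithmetic:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C = 1000, K = 0x0F0F0F0Fu;
  const uint32_t Ys[] = {0u, 123456u, 0xFFFFFFFFu};
  for (uint32_t Y : Ys) {
    // X - Z == X + ~Z + 1 (two's complement), with Z = Y ^ K and ~Z = Y ^ ~K.
    assert(C - (Y ^ K) == (Y ^ ~K) + (C + 1));
  }
  return 0;
}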
36927
36928static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
36929 TargetLowering::DAGCombinerInfo &DCI,
36930 const X86Subtarget &Subtarget) {
36931 if (DCI.isBeforeLegalize())
36932 return SDValue();
36933
36934 SDLoc DL(N);
36935 unsigned Opcode = N->getOpcode();
36936 MVT VT = N->getSimpleValueType(0);
36937 MVT SVT = VT.getVectorElementType();
36938 unsigned NumElts = VT.getVectorNumElements();
36939 unsigned EltSizeInBits = SVT.getSizeInBits();
36940
36941 SDValue Op = N->getOperand(0);
36942 MVT OpVT = Op.getSimpleValueType();
36943 MVT OpEltVT = OpVT.getVectorElementType();
36944 unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
36945 unsigned InputBits = OpEltSizeInBits * NumElts;
36946
36947 // Perform any constant folding.
36948 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
36949 APInt UndefElts;
36950 SmallVector<APInt, 64> EltBits;
36951 if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
36952 APInt Undefs(NumElts, 0);
36953 SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
36954 bool IsZEXT =
36955 (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
36956 for (unsigned i = 0; i != NumElts; ++i) {
36957 if (UndefElts[i]) {
36958 Undefs.setBit(i);
36959 continue;
36960 }
36961 Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
36962 : EltBits[i].sextOrTrunc(EltSizeInBits);
36963 }
36964 return getConstVector(Vals, Undefs, VT, DAG, DL);
36965 }
36966
36967   // (vzext (bitcast (vzext x))) -> (vzext x)
36968   // TODO: (vsext (bitcast (vsext x))) -> (vsext x)
36969 SDValue V = peekThroughBitcasts(Op);
36970 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
36971 MVT InnerVT = V.getSimpleValueType();
36972 MVT InnerEltVT = InnerVT.getVectorElementType();
36973
36974 // If the element sizes match exactly, we can just do one larger vzext. This
36975 // is always an exact type match as vzext operates on integer types.
36976 if (OpEltVT == InnerEltVT) {
36977       assert(OpVT == InnerVT && "Types must match for vzext!");
36978 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
36979 }
36980
36981 // The only other way we can combine them is if only a single element of the
36982 // inner vzext is used in the input to the outer vzext.
36983 if (InnerEltVT.getSizeInBits() < InputBits)
36984 return SDValue();
36985
36986 // In this case, the inner vzext is completely dead because we're going to
36987 // only look at bits inside of the low element. Just do the outer vzext on
36988 // a bitcast of the input to the inner.
36989 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
36990 }
36991
36992 // Check if we can bypass extracting and re-inserting an element of an input
36993 // vector. Essentially:
36994 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
36995 // TODO: Add X86ISD::VSEXT support
36996 if (Opcode == X86ISD::VZEXT &&
36997 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36998 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
36999 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
37000 SDValue ExtractedV = V.getOperand(0);
37001 SDValue OrigV = ExtractedV.getOperand(0);
37002 if (isNullConstant(ExtractedV.getOperand(1))) {
37003 MVT OrigVT = OrigV.getSimpleValueType();
37004 // Extract a subvector if necessary...
37005 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
37006 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
37007 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
37008 OrigVT.getVectorNumElements() / Ratio);
37009 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
37010 DAG.getIntPtrConstant(0, DL));
37011 }
37012 Op = DAG.getBitcast(OpVT, OrigV);
37013 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
37014 }
37015 }
37016
37017 return SDValue();
37018}
37019
37020static SDValue combineTestM(SDNode *N, SelectionDAG &DAG,
37021 const X86Subtarget &Subtarget) {
37022 SDValue Op0 = N->getOperand(0);
37023 SDValue Op1 = N->getOperand(1);
37024
37025 MVT VT = N->getSimpleValueType(0);
37026 SDLoc DL(N);
37027
37028 // TEST (AND a, b) ,(AND a, b) -> TEST a, b
37029 if (Op0 == Op1 && Op1->getOpcode() == ISD::AND)
37030 return DAG.getNode(X86ISD::TESTM, DL, VT, Op0->getOperand(0),
37031 Op0->getOperand(1));
37032
37033 // TEST op0, BUILD_VECTOR(all_zero) -> BUILD_VECTOR(all_zero)
37034 // TEST BUILD_VECTOR(all_zero), op1 -> BUILD_VECTOR(all_zero)
37035 if (ISD::isBuildVectorAllZeros(Op0.getNode()) ||
37036 ISD::isBuildVectorAllZeros(Op1.getNode()))
37037 return getZeroVector(VT, Subtarget, DAG, DL);
37038
37039 return SDValue();
37040}
37041
37042static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
37043 const X86Subtarget &Subtarget) {
37044 MVT VT = N->getSimpleValueType(0);
37045 SDLoc DL(N);
37046
37047 if (N->getOperand(0) == N->getOperand(1)) {
37048 if (N->getOpcode() == X86ISD::PCMPEQ)
37049 return getOnesVector(VT, DAG, DL);
37050 if (N->getOpcode() == X86ISD::PCMPGT)
37051 return getZeroVector(VT, Subtarget, DAG, DL);
37052 }
37053
37054 return SDValue();
37055}
37056
37057static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
37058 TargetLowering::DAGCombinerInfo &DCI,
37059 const X86Subtarget &Subtarget) {
37060 if (DCI.isBeforeLegalizeOps())
37061 return SDValue();
37062
37063 MVT OpVT = N->getSimpleValueType(0);
37064
37065 // Early out for mask vectors.
37066 if (OpVT.getVectorElementType() == MVT::i1)
37067 return SDValue();
37068
37069 SDLoc dl(N);
37070 SDValue Vec = N->getOperand(0);
37071 SDValue SubVec = N->getOperand(1);
37072
37073 unsigned IdxVal = N->getConstantOperandVal(2);
37074 MVT SubVecVT = SubVec.getSimpleValueType();
37075
37076 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
37077 // Inserting zeros into zeros is a nop.
37078 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
37079 return Vec;
37080
37081 // If we're inserting into a zero vector and then into a larger zero vector,
37082 // just insert into the larger zero vector directly.
37083 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
37084 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
37085 unsigned Idx2Val = SubVec.getConstantOperandVal(2);
37086 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
37087 SubVec.getOperand(1),
37088 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
37089 }
37090
37091 // If we're inserting a bitcast into zeros, rewrite the insert and move the
37092 // bitcast to the other side. This helps with detecting zero extending
37093 // during isel.
37094 // TODO: Is this useful for other indices than 0?
37095 if (SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
37096 MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
37097 unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
37098 MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
37099 SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
37100 DAG.getBitcast(NewVT, Vec),
37101 SubVec.getOperand(0), N->getOperand(2));
37102 return DAG.getBitcast(OpVT, Insert);
37103 }
37104 }
37105
37106 // If this is an insert of an extract, combine to a shuffle. Don't do this
37107 // if the insert or extract can be represented with a subregister operation.
37108 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
37109 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
37110 (IdxVal != 0 || !Vec.isUndef())) {
37111 int ExtIdxVal = SubVec.getConstantOperandVal(1);
37112 if (ExtIdxVal != 0) {
37113 int VecNumElts = OpVT.getVectorNumElements();
37114 int SubVecNumElts = SubVecVT.getVectorNumElements();
37115 SmallVector<int, 64> Mask(VecNumElts);
37116 // First create an identity shuffle mask.
37117 for (int i = 0; i != VecNumElts; ++i)
37118 Mask[i] = i;
37119 // Now insert the extracted portion.
37120 for (int i = 0; i != SubVecNumElts; ++i)
37121 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
37122
37123 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
37124 }
37125 }
37126
37127 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
37128 // load:
37129 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
37130 // (load16 addr + 16), Elts/2)
37131 // --> load32 addr
37132 // or:
37133 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
37134 // (load32 addr + 32), Elts/2)
37135 // --> load64 addr
37136 // or a 16-byte or 32-byte broadcast:
37137 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
37138 // (load16 addr), Elts/2)
37139 // --> X86SubVBroadcast(load16 addr)
37140 // or:
37141 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
37142 // (load32 addr), Elts/2)
37143 // --> X86SubVBroadcast(load32 addr)
37144 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
37145 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
37146 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
37147 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
37148 if (Idx2 && Idx2->getZExtValue() == 0) {
37149 SDValue SubVec2 = Vec.getOperand(1);
37150 // If needed, look through bitcasts to get to the load.
37151 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
37152 bool Fast;
37153 unsigned Alignment = FirstLd->getAlignment();
37154 unsigned AS = FirstLd->getAddressSpace();
37155 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
37156 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
37157 OpVT, AS, Alignment, &Fast) && Fast) {
37158 SDValue Ops[] = {SubVec2, SubVec};
37159 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
37160 Subtarget, false))
37161 return Ld;
37162 }
37163 }
37164 // If lower/upper loads are the same and the only users of the load, then
37165 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
37166 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
37167 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
37168 SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
37169 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
37170
37171 // If this is subv_broadcast insert into both halves, use a larger
37172 // subv_broadcast.
37173 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
37174 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
37175 SubVec.getOperand(0));
37176
37177 // If we're inserting all zeros into the upper half, change this to
37178 // an insert into an all zeros vector. We will match this to a move
37179 // with implicit upper bit zeroing during isel.
37180 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
37181 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
37182 getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
37183 Vec.getOperand(2));
37184
37185 // If we are inserting into both halves of the vector, the starting
37186 // vector should be undef. If it isn't, make it so. Only do this if the
37187     // early insert has no other uses.
37188 // TODO: Should this be a generic DAG combine?
37189 if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
37190 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
37191 SubVec2, Vec.getOperand(2));
37192 DCI.AddToWorklist(Vec.getNode());
37193 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
37194 N->getOperand(2));
37195
37196 }
37197 }
37198 }
37199
37200 return SDValue();
37201}
37202
37203static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
37204 TargetLowering::DAGCombinerInfo &DCI,
37205 const X86Subtarget &Subtarget) {
37206 if (DCI.isBeforeLegalizeOps())
37207 return SDValue();
37208
37209 MVT OpVT = N->getSimpleValueType(0);
37210 SDValue InVec = N->getOperand(0);
37211 unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
37212
37213 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
37214 return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));
37215
37216 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
37217 if (OpVT.getScalarType() == MVT::i1)
37218 return DAG.getConstant(1, SDLoc(N), OpVT);
37219 return getOnesVector(OpVT, DAG, SDLoc(N));
37220 }
37221
37222 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
37223 return DAG.getBuildVector(
37224 OpVT, SDLoc(N),
37225 InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));
37226
37227 return SDValue();
37228}
37229
37230SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
37231 DAGCombinerInfo &DCI) const {
37232 SelectionDAG &DAG = DCI.DAG;
37233 switch (N->getOpcode()) {
37234 default: break;
37235 case ISD::EXTRACT_VECTOR_ELT:
37236 case X86ISD::PEXTRW:
37237 case X86ISD::PEXTRB:
37238 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
37239 case ISD::INSERT_SUBVECTOR:
37240 return combineInsertSubvector(N, DAG, DCI, Subtarget);
37241 case ISD::EXTRACT_SUBVECTOR:
37242 return combineExtractSubvector(N, DAG, DCI, Subtarget);
37243 case ISD::VSELECT:
37244 case ISD::SELECT:
37245 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
37246 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
37247 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
37248 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
37249 case ISD::SUB: return combineSub(N, DAG, Subtarget);
37250 case X86ISD::SBB: return combineSBB(N, DAG);
37251 case X86ISD::ADC: return combineADC(N, DAG, DCI);
37252 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
37253 case ISD::SHL:
37254 case ISD::SRA:
37255 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
37256 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
37257 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
37258 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
37259 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
37260 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
37261 case ISD::STORE: return combineStore(N, DAG, Subtarget);
37262 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
37263 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
37264 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
37265 case ISD::FADD:
37266 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
37267 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
37268 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
37269 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
37270 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
37271 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
37272 case X86ISD::FXOR:
37273 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
37274 case X86ISD::FMIN:
37275 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
37276 case ISD::FMINNUM:
37277 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
37278 case X86ISD::BT: return combineBT(N, DAG, DCI);
37279 case ISD::ANY_EXTEND:
37280 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
37281 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
37282 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
37283 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
37284 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
37285 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
37286 case X86ISD::PACKSS:
37287 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
37288 case X86ISD::VSHLI:
37289 case X86ISD::VSRAI:
37290 case X86ISD::VSRLI:
37291 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
37292 case ISD::SIGN_EXTEND_VECTOR_INREG:
37293 case ISD::ZERO_EXTEND_VECTOR_INREG:
37294 case X86ISD::VSEXT:
37295 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
37296 case X86ISD::PINSRB:
37297 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
37298 case X86ISD::SHUFP: // Handle all target specific shuffles
37299 case X86ISD::INSERTPS:
37300 case X86ISD::EXTRQI:
37301 case X86ISD::INSERTQI:
37302 case X86ISD::PALIGNR:
37303 case X86ISD::VSHLDQ:
37304 case X86ISD::VSRLDQ:
37305 case X86ISD::BLENDI:
37306 case X86ISD::UNPCKH:
37307 case X86ISD::UNPCKL:
37308 case X86ISD::MOVHLPS:
37309 case X86ISD::MOVLHPS:
37310 case X86ISD::PSHUFB:
37311 case X86ISD::PSHUFD:
37312 case X86ISD::PSHUFHW:
37313 case X86ISD::PSHUFLW:
37314 case X86ISD::MOVSHDUP:
37315 case X86ISD::MOVSLDUP:
37316 case X86ISD::MOVDDUP:
37317 case X86ISD::MOVSS:
37318 case X86ISD::MOVSD:
37319 case X86ISD::VBROADCAST:
37320 case X86ISD::VPPERM:
37321 case X86ISD::VPERMI:
37322 case X86ISD::VPERMV:
37323 case X86ISD::VPERMV3:
37324 case X86ISD::VPERMIV3:
37325 case X86ISD::VPERMIL2:
37326 case X86ISD::VPERMILPI:
37327 case X86ISD::VPERMILPV:
37328 case X86ISD::VPERM2X128:
37329 case X86ISD::VZEXT_MOVL:
37330 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
37331 case X86ISD::FMADD_RND:
37332 case X86ISD::FMADDS1_RND:
37333 case X86ISD::FMADDS3_RND:
37334 case X86ISD::FMADDS1:
37335 case X86ISD::FMADDS3:
37336 case X86ISD::FMADD4S:
37337 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
37338 case X86ISD::FMADDSUB_RND:
37339 case X86ISD::FMSUBADD_RND:
37340 case X86ISD::FMADDSUB:
37341 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
37342 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
37343 case X86ISD::MGATHER:
37344 case X86ISD::MSCATTER:
37345 case ISD::MGATHER:
37346 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
37347 case X86ISD::TESTM: return combineTestM(N, DAG, Subtarget);
37348 case X86ISD::PCMPEQ:
37349 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
37350 }
37351
37352 return SDValue();
37353}
37354
37355/// Return true if the target has native support for the specified value type
37356/// and it is 'desirable' to use the type for the given node type. e.g. On x86
37357/// i16 is legal, but undesirable since i16 instruction encodings are longer and
37358/// some i16 instructions are slow.
37359bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
37360 if (!isTypeLegal(VT))
37361 return false;
37362 if (VT != MVT::i16)
37363 return true;
37364
37365 switch (Opc) {
37366 default:
37367 return true;
37368 case ISD::LOAD:
37369 case ISD::SIGN_EXTEND:
37370 case ISD::ZERO_EXTEND:
37371 case ISD::ANY_EXTEND:
37372 case ISD::SHL:
37373 case ISD::SRL:
37374 case ISD::SUB:
37375 case ISD::ADD:
37376 case ISD::MUL:
37377 case ISD::AND:
37378 case ISD::OR:
37379 case ISD::XOR:
37380 return false;
37381 }
37382}
37383
37384/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
37385/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
37386/// we don't adjust the stack we clobber the first frame index.
37387/// See X86InstrInfo::copyPhysReg.
37388static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
37389 const MachineRegisterInfo &MRI = MF.getRegInfo();
37390 return any_of(MRI.reg_instructions(X86::EFLAGS),
37391 [](const MachineInstr &RI) { return RI.isCopy(); });
37392}
37393
37394void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
37395 if (hasCopyImplyingStackAdjustment(MF)) {
37396 MachineFrameInfo &MFI = MF.getFrameInfo();
37397 MFI.setHasCopyImplyingStackAdjustment(true);
37398 }
37399
37400 TargetLoweringBase::finalizeLowering(MF);
37401}
37402
37403/// This method queries the target whether it is beneficial for the DAG combiner to
37404/// promote the specified node. If true, it should return the desired promotion
37405/// type by reference.
37406bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
37407 EVT VT = Op.getValueType();
37408 if (VT != MVT::i16)
37409 return false;
37410
37411 bool Promote = false;
37412 bool Commute = false;
37413 switch (Op.getOpcode()) {
37414 default: break;
37415 case ISD::SIGN_EXTEND:
37416 case ISD::ZERO_EXTEND:
37417 case ISD::ANY_EXTEND:
37418 Promote = true;
37419 break;
37420 case ISD::SHL:
37421 case ISD::SRL: {
37422 SDValue N0 = Op.getOperand(0);
37423 // Look out for (store (shl (load), x)).
37424 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
37425 return false;
37426 Promote = true;
37427 break;
37428 }
37429 case ISD::ADD:
37430 case ISD::MUL:
37431 case ISD::AND:
37432 case ISD::OR:
37433 case ISD::XOR:
37434 Commute = true;
37435 LLVM_FALLTHROUGH[[clang::fallthrough]];
37436 case ISD::SUB: {
37437 SDValue N0 = Op.getOperand(0);
37438 SDValue N1 = Op.getOperand(1);
37439 if (!Commute && MayFoldLoad(N1))
37440 return false;
37441 // Avoid disabling potential load folding opportunities.
37442 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
37443 return false;
37444 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
37445 return false;
37446 Promote = true;
37447 }
37448 }
37449
37450 PVT = MVT::i32;
37451 return Promote;
37452}
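
As a hedged source-level illustration of the shift case above (assumed user code, not from this file): a 16-bit shift like the one below is a candidate for the i16 -> i32 promotion decided here, unless it sits in the "(store (shl (load), x))" pattern that is deliberately left alone to keep load folding.

unsigned short shl_by_4(unsigned short V) {
  // Typically promoted: the shift may be performed on i32 and truncated back.
  return (unsigned short)(V << 4);
}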
37453
37454bool X86TargetLowering::
37455 isDesirableToCombineBuildVectorToShuffleTruncate(
37456 ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
37457
37458 assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
37459        "Element count mismatch");
37460 assert(
37461     Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
37462     "Shuffle Mask expected to be legal");
37463
37464 // For 32-bit elements VPERMD is better than shuffle+truncate.
37465 // TODO: After we improve lowerBuildVector, add an exception for VPERMW.
37466 if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
37467 return false;
37468
37469 if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
37470 return false;
37471
37472 return true;
37473}
37474
37475//===----------------------------------------------------------------------===//
37476// X86 Inline Assembly Support
37477//===----------------------------------------------------------------------===//
37478
37479// Helper to match a string separated by whitespace.
37480static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
37481 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
37482
37483 for (StringRef Piece : Pieces) {
37484 if (!S.startswith(Piece)) // Check if the piece matches.
37485 return false;
37486
37487 S = S.substr(Piece.size());
37488 StringRef::size_type Pos = S.find_first_not_of(" \t");
37489 if (Pos == 0) // We matched a prefix.
37490 return false;
37491
37492 S = S.substr(Pos);
37493 }
37494
37495 return S.empty();
37496}
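
A minimal standalone sketch of the matching performed by matchAsm, using std::string_view in place of StringRef (hypothetical matchPieces/dropWS names, C++17, not part of LLVM):

#include <initializer_list>
#include <string_view>

static bool matchPieces(std::string_view S,
                        std::initializer_list<std::string_view> Pieces) {
  auto dropWS = [](std::string_view V) {
    size_t Pos = V.find_first_not_of(" \t");
    return Pos == std::string_view::npos ? std::string_view() : V.substr(Pos);
  };
  S = dropWS(S);                                  // skip leading whitespace
  for (std::string_view Piece : Pieces) {
    if (S.substr(0, Piece.size()) != Piece)       // piece must match here
      return false;
    S.remove_prefix(Piece.size());
    if (!S.empty() && S.find_first_not_of(" \t") == 0)
      return false;                               // only matched a token prefix
    S = dropWS(S);
  }
  return S.empty();                               // nothing may be left over
}

// Example: matchPieces("  bswap   $0", {"bswap", "$0"}) returns true.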
37497
37498static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
37499
37500 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
37501 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
37502 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
37503 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
37504
37505 if (AsmPieces.size() == 3)
37506 return true;
37507 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
37508 return true;
37509 }
37510 }
37511 return false;
37512}
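
A standalone sketch of the clobber check above (hypothetical clobbersFlags helper; std::string/std::vector stand in for StringRef/SmallVector): three pieces must name the cc, flags and fpsr clobbers, and a fourth piece, if present, must be the dirflag clobber.

#include <algorithm>
#include <string>
#include <vector>

static bool clobbersFlags(const std::vector<std::string> &Pieces) {
  auto Has = [&](const char *S) {
    return std::count(Pieces.begin(), Pieces.end(), S) != 0;
  };
  if (Pieces.size() != 3 && Pieces.size() != 4)
    return false;
  if (!Has("~{cc}") || !Has("~{flags}") || !Has("~{fpsr}"))
    return false;
  return Pieces.size() == 3 || Has("~{dirflag}");
}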
37513
37514bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
37515 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
37516
37517 const std::string &AsmStr = IA->getAsmString();
37518
37519 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
37520 if (!Ty || Ty->getBitWidth() % 16 != 0)
37521 return false;
37522
37523 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
37524 SmallVector<StringRef, 4> AsmPieces;
37525 SplitString(AsmStr, AsmPieces, ";\n");
37526
37527 switch (AsmPieces.size()) {
37528 default: return false;
37529 case 1:
37530 // FIXME: this should verify that we are targeting a 486 or better. If not,
37531 // we will turn this bswap into something that will be lowered to logical
37532 // ops instead of emitting the bswap asm. For now, we don't support 486 or
37533 // lower so don't worry about this.
37534 // bswap $0
37535 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
37536 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
37537 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
37538 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
37539 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
37540 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
37541 // No need to check constraints, nothing other than the equivalent of
37542 // "=r,0" would be valid here.
37543 return IntrinsicLowering::LowerToByteSwap(CI);
37544 }
37545
37546 // rorw $$8, ${0:w} --> llvm.bswap.i16
37547 if (CI->getType()->isIntegerTy(16) &&
37548 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
37549 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
37550 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
37551 AsmPieces.clear();
37552 StringRef ConstraintsStr = IA->getConstraintString();
37553 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
37554 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
37555 if (clobbersFlagRegisters(AsmPieces))
37556 return IntrinsicLowering::LowerToByteSwap(CI);
37557 }
37558 break;
37559 case 3:
37560 if (CI->getType()->isIntegerTy(32) &&
37561 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
37562 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
37563 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
37564 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
37565 AsmPieces.clear();
37566 StringRef ConstraintsStr = IA->getConstraintString();
37567 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
37568 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
37569 if (clobbersFlagRegisters(AsmPieces))
37570 return IntrinsicLowering::LowerToByteSwap(CI);
37571 }
37572
37573 if (CI->getType()->isIntegerTy(64)) {
37574 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
37575 if (Constraints.size() >= 2 &&
37576 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
37577 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
37578 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
37579 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
37580 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
37581 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
37582 return IntrinsicLowering::LowerToByteSwap(CI);
37583 }
37584 }
37585 break;
37586 }
37587 return false;
37588}
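
As a hedged user-level illustration (assumed user code compiled with GCC/Clang extended asm on x86, not part of this file): a lone bswap statement such as the one below is the kind of inline asm that ExpandInlineAsm rewrites into a call to the llvm.bswap intrinsic, so it can be folded like any other byte swap.

static inline unsigned host_bswap32(unsigned V) {
  __asm__("bswap %0" : "+r"(V));   // matches the "bswap $0" / "=r,0" pattern
  return V;
}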
37589
37590/// Given a constraint letter, return the type of constraint for this target.
37591X86TargetLowering::ConstraintType
37592X86TargetLowering::getConstraintType(StringRef Constraint) const {
37593 if (Constraint.size() == 1) {
37594 switch (Constraint[0]) {
37595 case 'R':
37596 case 'q':
37597 case 'Q':
37598 case 'f':
37599 case 't':
37600 case 'u':
37601 case 'y':
37602 case 'x':
37603 case 'v':
37604 case 'Y':
37605 case 'l':
37606 case 'k': // AVX512 masking registers.
37607 return C_RegisterClass;
37608 case 'a':
37609 case 'b':
37610 case 'c':
37611 case 'd':
37612 case 'S':
37613 case 'D':
37614 case 'A':
37615 return C_Register;
37616 case 'I':
37617 case 'J':
37618 case 'K':
37619 case 'L':
37620 case 'M':
37621 case 'N':
37622 case 'G':
37623 case 'C':
37624 case 'e':
37625 case 'Z':
37626 return C_Other;
37627 default:
37628 break;
37629 }
37630 }
37631 else if (Constraint.size() == 2) {
37632 switch (Constraint[0]) {
37633 default:
37634 break;
37635 case 'Y':
37636 switch (Constraint[1]) {
37637 default:
37638 break;
37639 case 'z':
37640 case '0':
37641 return C_Register;
37642 case 'i':
37643 case 'm':
37644 case 'k':
37645 case 't':
37646 case '2':
37647 return C_RegisterClass;
37648 }
37649 }
37650 }
37651 return TargetLowering::getConstraintType(Constraint);
37652}
37653
37654/// Examine constraint type and operand type and determine a weight value.
37655/// This object must already have been set up with the operand type
37656/// and the current alternative constraint selected.
37657TargetLowering::ConstraintWeight
37658 X86TargetLowering::getSingleConstraintMatchWeight(
37659 AsmOperandInfo &info, const char *constraint) const {
37660 ConstraintWeight weight = CW_Invalid;
37661 Value *CallOperandVal = info.CallOperandVal;
37662 // If we don't have a value, we can't do a match,
37663 // but allow it at the lowest weight.
37664 if (!CallOperandVal)
37665 return CW_Default;
37666 Type *type = CallOperandVal->getType();
37667 // Look at the constraint type.
37668 switch (*constraint) {
37669 default:
37670 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
37671 LLVM_FALLTHROUGH[[clang::fallthrough]];
37672 case 'R':
37673 case 'q':
37674 case 'Q':
37675 case 'a':
37676 case 'b':
37677 case 'c':
37678 case 'd':
37679 case 'S':
37680 case 'D':
37681 case 'A':
37682 if (CallOperandVal->getType()->isIntegerTy())
37683 weight = CW_SpecificReg;
37684 break;
37685 case 'f':
37686 case 't':
37687 case 'u':
37688 if (type->isFloatingPointTy())
37689 weight = CW_SpecificReg;
37690 break;
37691 case 'y':
37692 if (type->isX86_MMXTy() && Subtarget.hasMMX())
37693 weight = CW_SpecificReg;
37694 break;
37695 case 'Y': {
37696 unsigned Size = StringRef(constraint).size();
37697 // Pick 'i' as the default next char, since 'Yi' and 'Y' are synonymous when matching 'Y'.
37698 char NextChar = Size == 2 ? constraint[1] : 'i';
37699 if (Size > 2)
37700 break;
37701 switch (NextChar) {
37702 default:
37703 return CW_Invalid;
37704 // XMM0
37705 case 'z':
37706 case '0':
37707 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
37708 return CW_SpecificReg;
37709 return CW_Invalid;
37710 // Conditional OpMask regs (AVX512)
37711 case 'k':
37712 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
37713 return CW_Register;
37714 return CW_Invalid;
37715 // Any MMX reg
37716 case 'm':
37717 if (type->isX86_MMXTy() && Subtarget.hasMMX())
37718 return weight;
37719 return CW_Invalid;
37720 // Any SSE reg when ISA >= SSE2, same as 'Y'
37721 case 'i':
37722 case 't':
37723 case '2':
37724 if (!Subtarget.hasSSE2())
37725 return CW_Invalid;
37726 break;
37727 }
37728 // Fall through (handle "Y" constraint).
37729 LLVM_FALLTHROUGH[[clang::fallthrough]];
37730 }
37731 case 'v':
37732 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
37733 weight = CW_Register;
37734 LLVM_FALLTHROUGH[[clang::fallthrough]];
37735 case 'x':
37736 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
37737 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
37738 weight = CW_Register;
37739 break;
37740 case 'k':
37741 // Enable conditional vector operations using %k<#> registers.
37742 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
37743 weight = CW_Register;
37744 break;
37745 case 'I':
37746 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
37747 if (C->getZExtValue() <= 31)
37748 weight = CW_Constant;
37749 }
37750 break;
37751 case 'J':
37752 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
37753 if (C->getZExtValue() <= 63)
37754 weight = CW_Constant;
37755 }
37756 break;
37757 case 'K':
37758 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
37759 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
37760 weight = CW_Constant;
37761 }
37762 break;
37763 case 'L':
37764 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
37765 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
37766 weight = CW_Constant;
37767 }
37768 break;
37769 case 'M':
37770 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
37771 if (C->getZExtValue() <= 3)
37772 weight = CW_Constant;
37773 }
37774 break;
37775 case 'N':
37776 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
37777 if (C->getZExtValue() <= 0xff)
37778 weight = CW_Constant;
37779 }
37780 break;
37781 case 'G':
37782 case 'C':
37783 if (isa<ConstantFP>(CallOperandVal)) {
37784 weight = CW_Constant;
37785 }
37786 break;
37787 case 'e':
37788 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
37789 if ((C->getSExtValue() >= -0x80000000LL) &&
37790 (C->getSExtValue() <= 0x7fffffffLL))
37791 weight = CW_Constant;
37792 }
37793 break;
37794 case 'Z':
37795 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
37796 if (C->getZExtValue() <= 0xffffffff)
37797 weight = CW_Constant;
37798 }
37799 break;
37800 }
37801 return weight;
37802}
37803
37804/// Try to replace an X constraint, which matches anything, with another that
37805/// has more specific requirements based on the type of the corresponding
37806/// operand.
37807const char *X86TargetLowering::
37808LowerXConstraint(EVT ConstraintVT) const {
37809 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
37810 // 'f' like normal targets.
37811 if (ConstraintVT.isFloatingPoint()) {
37812 if (Subtarget.hasSSE2())
37813 return "Y";
37814 if (Subtarget.hasSSE1())
37815 return "x";
37816 }
37817
37818 return TargetLowering::LowerXConstraint(ConstraintVT);
37819}
37820
37821/// Lower the specified operand into the Ops vector.
37822/// If it is invalid, don't add anything to Ops.
37823void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
37824 std::string &Constraint,
37825 std::vector<SDValue>&Ops,
37826 SelectionDAG &DAG) const {
37827 SDValue Result;
37828
37829 // Only support length 1 constraints for now.
37830 if (Constraint.length() > 1) return;
37831
37832 char ConstraintLetter = Constraint[0];
37833 switch (ConstraintLetter) {
37834 default: break;
37835 case 'I':
37836 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
37837 if (C->getZExtValue() <= 31) {
37838 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
37839 Op.getValueType());
37840 break;
37841 }
37842 }
37843 return;
37844 case 'J':
37845 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
37846 if (C->getZExtValue() <= 63) {
37847 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
37848 Op.getValueType());
37849 break;
37850 }
37851 }
37852 return;
37853 case 'K':
37854 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
37855 if (isInt<8>(C->getSExtValue())) {
37856 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
37857 Op.getValueType());
37858 break;
37859 }
37860 }
37861 return;
37862 case 'L':
37863 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
37864 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
37865 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
37866 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
37867 Op.getValueType());
37868 break;
37869 }
37870 }
37871 return;
37872 case 'M':
37873 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
37874 if (C->getZExtValue() <= 3) {
37875 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
37876 Op.getValueType());
37877 break;
37878 }
37879 }
37880 return;
37881 case 'N':
37882 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
37883 if (C->getZExtValue() <= 255) {
37884 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
37885 Op.getValueType());
37886 break;
37887 }
37888 }
37889 return;
37890 case 'O':
37891 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
37892 if (C->getZExtValue() <= 127) {
37893 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
37894 Op.getValueType());
37895 break;
37896 }
37897 }
37898 return;
37899 case 'e': {
37900 // 32-bit signed value
37901 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
37902 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
37903 C->getSExtValue())) {
37904 // Widen to 64 bits here to get it sign extended.
37905 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
37906 break;
37907 }
37908 // FIXME gcc accepts some relocatable values here too, but only in certain
37909 // memory models; it's complicated.
37910 }
37911 return;
37912 }
37913 case 'Z': {
37914 // 32-bit unsigned value
37915 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
37916 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
37917 C->getZExtValue())) {
37918 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
37919 Op.getValueType());
37920 break;
37921 }
37922 }
37923 // FIXME gcc accepts some relocatable values here too, but only in certain
37924 // memory models; it's complicated.
37925 return;
37926 }
37927 case 'i': {
37928 // Literal immediates are always ok.
37929 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
37930 // Widen to 64 bits here to get it sign extended.
37931 Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
37932 break;
37933 }
37934
37935 // In any sort of PIC mode addresses need to be computed at runtime by
37936 // adding in a register or some sort of table lookup. These can't
37937 // be used as immediates.
37938 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
37939 return;
37940
37941 // If we are in non-pic codegen mode, we allow the address of a global (with
37942 // an optional displacement) to be used with 'i'.
37943 GlobalAddressSDNode *GA = nullptr;
37944 int64_t Offset = 0;
37945
37946 // Match either (GA), (GA+C), (GA+C1+C2), etc.
37947 while (1) {
37948 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
37949 Offset += GA->getOffset();
37950 break;
37951 } else if (Op.getOpcode() == ISD::ADD) {
37952 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
37953 Offset += C->getZExtValue();
37954 Op = Op.getOperand(0);
37955 continue;
37956 }
37957 } else if (Op.getOpcode() == ISD::SUB) {
37958 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
37959 Offset += -C->getZExtValue();
37960 Op = Op.getOperand(0);
37961 continue;
37962 }
37963 }
37964
37965 // Otherwise, this isn't something we can handle, reject it.
37966 return;
37967 }
37968
37969 const GlobalValue *GV = GA->getGlobal();
37970 // If we require an extra load to get this address, as in PIC mode, we
37971 // can't accept it.
37972 if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
37973 return;
37974
37975 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
37976 GA->getValueType(0), Offset);
37977 break;
37978 }
37979 }
37980
37981 if (Result.getNode()) {
37982 Ops.push_back(Result);
37983 return;
37984 }
37985 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
37986}
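
A hedged illustration of the immediate constraints handled above (assumed user code, not part of this file): the 'N' constraint accepts an unsigned 8-bit constant, the classic use being an I/O port number; "Nd" lets the operand fall back to %dx when it is not a compile-time constant.

static inline void io_outb(unsigned char Val, unsigned short Port) {
  __asm__ volatile("outb %0, %1" : : "a"(Val), "Nd"(Port));
}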
37987
37988/// Check if \p RC is a general purpose register class.
37989/// I.e., GR* or one of their variant.
37990static bool isGRClass(const TargetRegisterClass &RC) {
37991 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
37992 RC.hasSuperClassEq(&X86::GR16RegClass) ||
37993 RC.hasSuperClassEq(&X86::GR32RegClass) ||
37994 RC.hasSuperClassEq(&X86::GR64RegClass) ||
37995 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
37996}
37997
37998/// Check if \p RC is a vector register class.
37999/// I.e., FR* / VR* or one of their variant.
38000static bool isFRClass(const TargetRegisterClass &RC) {
38001 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
38002 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
38003 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
38004 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
38005 RC.hasSuperClassEq(&X86::VR512RegClass);
38006}
38007
38008std::pair<unsigned, const TargetRegisterClass *>
38009X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
38010 StringRef Constraint,
38011 MVT VT) const {
38012 // First, see if this is a constraint that directly corresponds to an LLVM
38013 // register class.
38014 if (Constraint.size() == 1) {
38015 // GCC Constraint Letters
38016 switch (Constraint[0]) {
38017 default: break;
38018 // TODO: Slight differences here in allocation order and leaving
38019 // RIP in the class. Do they matter any more here than they do
38020 // in the normal allocation?
38021 case 'k':
38022 if (Subtarget.hasAVX512()) {
38023 // Only supported in AVX512 or later.
38024 switch (VT.SimpleTy) {
38025 default: break;
38026 case MVT::i32:
38027 return std::make_pair(0U, &X86::VK32RegClass);
38028 case MVT::i16:
38029 return std::make_pair(0U, &X86::VK16RegClass);
38030 case MVT::i8:
38031 return std::make_pair(0U, &X86::VK8RegClass);
38032 case MVT::i1:
38033 return std::make_pair(0U, &X86::VK1RegClass);
38034 case MVT::i64:
38035 return std::make_pair(0U, &X86::VK64RegClass);
38036 }
38037 }
38038 break;
38039 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
38040 if (Subtarget.is64Bit()) {
38041 if (VT == MVT::i32 || VT == MVT::f32)
38042 return std::make_pair(0U, &X86::GR32RegClass);
38043 if (VT == MVT::i16)
38044 return std::make_pair(0U, &X86::GR16RegClass);
38045 if (VT == MVT::i8 || VT == MVT::i1)
38046 return std::make_pair(0U, &X86::GR8RegClass);
38047 if (VT == MVT::i64 || VT == MVT::f64)
38048 return std::make_pair(0U, &X86::GR64RegClass);
38049 break;
38050 }
38051 LLVM_FALLTHROUGH[[clang::fallthrough]];
38052 // 32-bit fallthrough
38053 case 'Q': // Q_REGS
38054 if (VT == MVT::i32 || VT == MVT::f32)
38055 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
38056 if (VT == MVT::i16)
38057 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
38058 if (VT == MVT::i8 || VT == MVT::i1)
38059 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
38060 if (VT == MVT::i64)
38061 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
38062 break;
38063 case 'r': // GENERAL_REGS
38064 case 'l': // INDEX_REGS
38065 if (VT == MVT::i8 || VT == MVT::i1)
38066 return std::make_pair(0U, &X86::GR8RegClass);
38067 if (VT == MVT::i16)
38068 return std::make_pair(0U, &X86::GR16RegClass);
38069 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
38070 return std::make_pair(0U, &X86::GR32RegClass);
38071 return std::make_pair(0U, &X86::GR64RegClass);
38072 case 'R': // LEGACY_REGS
38073 if (VT == MVT::i8 || VT == MVT::i1)
38074 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
38075 if (VT == MVT::i16)
38076 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
38077 if (VT == MVT::i32 || !Subtarget.is64Bit())
38078 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
38079 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
38080 case 'f': // FP Stack registers.
38081 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
38082 // value to the correct fpstack register class.
38083 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
38084 return std::make_pair(0U, &X86::RFP32RegClass);
38085 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
38086 return std::make_pair(0U, &X86::RFP64RegClass);
38087 return std::make_pair(0U, &X86::RFP80RegClass);
38088 case 'y': // MMX_REGS if MMX allowed.
38089 if (!Subtarget.hasMMX()) break;
38090 return std::make_pair(0U, &X86::VR64RegClass);
38091 case 'Y': // SSE_REGS if SSE2 allowed
38092 if (!Subtarget.hasSSE2()) break;
38093 LLVM_FALLTHROUGH[[clang::fallthrough]];
38094 case 'v':
38095 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
38096 if (!Subtarget.hasSSE1()) break;
38097 bool VConstraint = (Constraint[0] == 'v');
38098
38099 switch (VT.SimpleTy) {
38100 default: break;
38101 // Scalar SSE types.
38102 case MVT::f32:
38103 case MVT::i32:
38104 if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
38105 return std::make_pair(0U, &X86::FR32XRegClass);
38106 return std::make_pair(0U, &X86::FR32RegClass);
38107 case MVT::f64:
38108 case MVT::i64:
38109 if (VConstraint && Subtarget.hasVLX())
38110 return std::make_pair(0U, &X86::FR64XRegClass);
38111 return std::make_pair(0U, &X86::FR64RegClass);
38112 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
38113 // Vector types.
38114 case MVT::v16i8:
38115 case MVT::v8i16:
38116 case MVT::v4i32:
38117 case MVT::v2i64:
38118 case MVT::v4f32:
38119 case MVT::v2f64:
38120 if (VConstraint && Subtarget.hasVLX())
38121 return std::make_pair(0U, &X86::VR128XRegClass);
38122 return std::make_pair(0U, &X86::VR128RegClass);
38123 // AVX types.
38124 case MVT::v32i8:
38125 case MVT::v16i16:
38126 case MVT::v8i32:
38127 case MVT::v4i64:
38128 case MVT::v8f32:
38129 case MVT::v4f64:
38130 if (VConstraint && Subtarget.hasVLX())
38131 return std::make_pair(0U, &X86::VR256XRegClass);
38132 return std::make_pair(0U, &X86::VR256RegClass);
38133 case MVT::v8f64:
38134 case MVT::v16f32:
38135 case MVT::v16i32:
38136 case MVT::v8i64:
38137 return std::make_pair(0U, &X86::VR512RegClass);
38138 }
38139 break;
38140 }
38141 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
38142 switch (Constraint[1]) {
38143 default:
38144 break;
38145 case 'i':
38146 case 't':
38147 case '2':
38148 return getRegForInlineAsmConstraint(TRI, "Y", VT);
38149 case 'm':
38150 if (!Subtarget.hasMMX()) break;
38151 return std::make_pair(0U, &X86::VR64RegClass);
38152 case 'z':
38153 case '0':
38154 if (!Subtarget.hasSSE1()) break;
38155 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
38156 case 'k':
38157 // This register class doesn't allocate k0 for masked vector operations.
38158 if (Subtarget.hasAVX512()) { // Only supported in AVX512.
38159 switch (VT.SimpleTy) {
38160 default: break;
38161 case MVT::i32:
38162 return std::make_pair(0U, &X86::VK32WMRegClass);
38163 case MVT::i16:
38164 return std::make_pair(0U, &X86::VK16WMRegClass);
38165 case MVT::i8:
38166 return std::make_pair(0U, &X86::VK8WMRegClass);
38167 case MVT::i1:
38168 return std::make_pair(0U, &X86::VK1WMRegClass);
38169 case MVT::i64:
38170 return std::make_pair(0U, &X86::VK64WMRegClass);
38171 }
38172 }
38173 break;
38174 }
38175 }
38176
38177 // Use the default implementation in TargetLowering to convert the register
38178 // constraint into a member of a register class.
38179 std::pair<unsigned, const TargetRegisterClass*> Res;
38180 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
38181
38182 // Not found as a standard register?
38183 if (!Res.second) {
38184 // Map st(0) -> st(7) -> ST0
38185 if (Constraint.size() == 7 && Constraint[0] == '{' &&
38186 tolower(Constraint[1]) == 's' &&
38187 tolower(Constraint[2]) == 't' &&
38188 Constraint[3] == '(' &&
38189 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
38190 Constraint[5] == ')' &&
38191 Constraint[6] == '}') {
38192
38193 Res.first = X86::FP0+Constraint[4]-'0';
38194 Res.second = &X86::RFP80RegClass;
38195 return Res;
38196 }
38197
38198 // GCC allows "st(0)" to be called just plain "st".
38199 if (StringRef("{st}").equals_lower(Constraint)) {
38200 Res.first = X86::FP0;
38201 Res.second = &X86::RFP80RegClass;
38202 return Res;
38203 }
38204
38205 // flags -> EFLAGS
38206 if (StringRef("{flags}").equals_lower(Constraint)) {
38207 Res.first = X86::EFLAGS;
38208 Res.second = &X86::CCRRegClass;
38209 return Res;
38210 }
38211
38212 // 'A' means [ER]AX + [ER]DX.
38213 if (Constraint == "A") {
38214 if (Subtarget.is64Bit()) {
38215 Res.first = X86::RAX;
38216 Res.second = &X86::GR64_ADRegClass;
38217 } else {
38218 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
38219        "Expecting 64, 32 or 16 bit subtarget");
38220 Res.first = X86::EAX;
38221 Res.second = &X86::GR32_ADRegClass;
38222 }
38223 return Res;
38224 }
38225 return Res;
38226 }
38227
38228 // Otherwise, check to see if this is a register class of the wrong value
38229 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
38230 // turn into {ax},{dx}.
38231 // MVT::Other is used to specify clobber names.
38232 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
38233 return Res; // Correct type already, nothing to do.
38234
38235 // Get a matching integer of the correct size. e.g. "ax" with MVT::i32 should
38236 // return "eax". This should even work for things like getting 64bit integer
38237 // registers when given an f64 type.
38238 const TargetRegisterClass *Class = Res.second;
38239 // The generic code will match the first register class that contains the
38240 // given register. Thus, based on the ordering of the tablegened file,
38241 // the "plain" GR classes might not come first.
38242 // Therefore, use a helper method.
38243 if (isGRClass(*Class)) {
38244 unsigned Size = VT.getSizeInBits();
38245 if (Size == 1) Size = 8;
38246 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
38247 if (DestReg > 0) {
38248 bool is64Bit = Subtarget.is64Bit();
38249 const TargetRegisterClass *RC =
38250 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
38251 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
38252 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
38253 : &X86::GR64RegClass;
38254 if (RC->contains(DestReg))
38255 Res = std::make_pair(DestReg, RC);
38256 } else {
38257 // No register found/type mismatch.
38258 Res.first = 0;
38259 Res.second = nullptr;
38260 }
38261 } else if (isFRClass(*Class)) {
38262 // Handle references to XMM physical registers that got mapped into the
38263 // wrong class. This can happen with constraints like {xmm0} where the
38264 // target independent register mapper will just pick the first match it can
38265 // find, ignoring the required type.
38266
38267 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
38268 if (VT == MVT::f32 || VT == MVT::i32)
38269 Res.second = &X86::FR32RegClass;
38270 else if (VT == MVT::f64 || VT == MVT::i64)
38271 Res.second = &X86::FR64RegClass;
38272 else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
38273 Res.second = &X86::VR128RegClass;
38274 else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
38275 Res.second = &X86::VR256RegClass;
38276 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
38277 Res.second = &X86::VR512RegClass;
38278 else {
38279 // Type mismatch and not a clobber: Return an error;
38280 Res.first = 0;
38281 Res.second = nullptr;
38282 }
38283 }
38284
38285 return Res;
38286}
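
A minimal standalone sketch (hypothetical parseStackRegConstraint helper) of the "{st(N)}" matching done above when the constraint is not a standard register: it returns the FP-stack index 0-7, or -1 when the string has a different shape.

#include <cctype>
#include <string>

static int parseStackRegConstraint(const std::string &C) {
  if (C.size() == 7 && C[0] == '{' &&
      std::tolower(static_cast<unsigned char>(C[1])) == 's' &&
      std::tolower(static_cast<unsigned char>(C[2])) == 't' &&
      C[3] == '(' && C[4] >= '0' && C[4] <= '7' &&
      C[5] == ')' && C[6] == '}')
    return C[4] - '0';                 // e.g. "{st(3)}" -> 3
  return -1;
}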
38287
38288int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
38289 const AddrMode &AM, Type *Ty,
38290 unsigned AS) const {
38291 // Scaling factors are not free at all.
38292 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
38293 // will take 2 allocations in the out of order engine instead of 1
38294 // for plain addressing mode, i.e. inst (reg1).
38295 // E.g.,
38296 // vaddps (%rsi,%drx), %ymm0, %ymm1
38297 // Requires two allocations (one for the load, one for the computation)
38298 // whereas:
38299 // vaddps (%rsi), %ymm0, %ymm1
38300 // Requires just 1 allocation, i.e., freeing allocations for other operations
38301 // and having less micro operations to execute.
38302 //
38303 // For some X86 architectures, this is even worse because for instance for
38304 // stores, the complex addressing mode forces the instruction to use the
38305 // "load" ports instead of the dedicated "store" port.
38306 // E.g., on Haswell:
38307 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
38308 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
38309 if (isLegalAddressingMode(DL, AM, Ty, AS))
38310 // Scale represents reg2 * scale, thus account for 1
38311 // as soon as we use a second register.
38312 return AM.Scale != 0;
38313 return -1;
38314}
38315
38316bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
38317 // Integer division on x86 is expensive. However, when aggressively optimizing
38318 // for code size, we prefer to use a div instruction, as it is usually smaller
38319 // than the alternative sequence.
38320 // The exception to this is vector division. Since x86 doesn't have vector
38321 // integer division, leaving the division as-is is a loss even in terms of
38322 // size, because it will have to be scalarized, while the alternative code
38323 // sequence can be performed in vector form.
38324 bool OptSize =
38325 Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
38326 return OptSize && !VT.isVector();
38327}
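
Hedged illustration (assumed user code): under minsize this hook reports scalar division as cheap, so a division like the one below may be kept as a single div instruction instead of the usual multiply-by-magic-constant expansion; vector divisions are still expanded because x86 has no vector integer divide.

unsigned div_by_10(unsigned X) {
  return X / 10;   // may stay a 'div' when optimizing for minimum size
}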
38328
38329void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
38330 if (!Subtarget.is64Bit())
38331 return;
38332
38333 // Update IsSplitCSR in X86MachineFunctionInfo.
38334 X86MachineFunctionInfo *AFI =
38335 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
38336 AFI->setIsSplitCSR(true);
38337}
38338
38339void X86TargetLowering::insertCopiesSplitCSR(
38340 MachineBasicBlock *Entry,
38341 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
38342 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38343 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
38344 if (!IStart)
38345 return;
38346
38347 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
38348 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
38349 MachineBasicBlock::iterator MBBI = Entry->begin();
38350 for (const MCPhysReg *I = IStart; *I; ++I) {
38351 const TargetRegisterClass *RC = nullptr;
38352 if (X86::GR64RegClass.contains(*I))
38353 RC = &X86::GR64RegClass;
38354 else
38355 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
38356
38357 unsigned NewVR = MRI->createVirtualRegister(RC);
38358 // Create copy from CSR to a virtual register.
38359 // FIXME: this currently does not emit CFI pseudo-instructions, it works
38360 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
38361 // nounwind. If we want to generalize this later, we may need to emit
38362 // CFI pseudo-instructions.
38363 assert(Entry->getParent()->getFunction()->hasFnAttribute(
38364            Attribute::NoUnwind) &&
38365        "Function should be nounwind in insertCopiesSplitCSR!");
38366 Entry->addLiveIn(*I);
38367 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
38368 .addReg(*I);
38369
38370 // Insert the copy-back instructions right before the terminator.
38371 for (auto *Exit : Exits)
38372 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
38373 TII->get(TargetOpcode::COPY), *I)
38374 .addReg(NewVR);
38375 }
38376}
38377
38378bool X86TargetLowering::supportSwiftError() const {
38379 return Subtarget.is64Bit();
38380}
38381
38382/// Returns the name of the symbol used to emit stack probes or the empty
38383/// string if not applicable.
38384StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
38385 // If the function specifically requests stack probes, emit them.
38386 if (MF.getFunction()->hasFnAttribute("probe-stack"))
38387 return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString();
38388
38389 // Generally, if we aren't on Windows, the platform ABI does not include
38390 // support for stack probes, so don't emit them.
38391 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
38392 return "";
38393
38394 // We need a stack probe to conform to the Windows ABI. Choose the right
38395 // symbol.
38396 if (Subtarget.is64Bit())
38397 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
38398 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
38399}

/build/llvm-toolchain-snapshot-6.0~svn320613/include/llvm/ADT/SmallBitVector.h

1//===- llvm/ADT/SmallBitVector.h - 'Normally small' bit vectors -*- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements the SmallBitVector class.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_ADT_SMALLBITVECTOR_H
15#define LLVM_ADT_SMALLBITVECTOR_H
16
17#include "llvm/ADT/BitVector.h"
18#include "llvm/ADT/iterator_range.h"
19#include "llvm/Support/MathExtras.h"
20#include <algorithm>
21#include <cassert>
22#include <climits>
23#include <cstddef>
24#include <cstdint>
25#include <limits>
26#include <utility>
27
28namespace llvm {
29
30/// This is a 'bitvector' (really, a variable-sized bit array), optimized for
31/// the case when the array is small. It contains one pointer-sized field, which
32/// is directly used as a plain collection of bits when possible, or as a
33/// pointer to a larger heap-allocated array when necessary. This allows normal
34/// "small" cases to be fast without losing generality for large inputs.
35class SmallBitVector {
36 // TODO: In "large" mode, a pointer to a BitVector is used, leading to an
37 // unnecessary level of indirection. It would be more efficient to use a
38 // pointer to memory containing size, allocation size, and the array of bits.
39 uintptr_t X = 1;
40
41 enum {
42 // The number of bits in this class.
43 NumBaseBits = sizeof(uintptr_t) * CHAR_BIT8,
44
45 // One bit is used to discriminate between small and large mode. The
46 // remaining bits are used for the small-mode representation.
47 SmallNumRawBits = NumBaseBits - 1,
48
49 // A few more bits are used to store the size of the bit set in small mode.
50 // Theoretically this is a ceil-log2. These bits are encoded in the most
51 // significant bits of the raw bits.
52 SmallNumSizeBits = (NumBaseBits == 32 ? 5 :
53 NumBaseBits == 64 ? 6 :
54 SmallNumRawBits),
55
56 // The remaining bits are used to store the actual set in small mode.
57 SmallNumDataBits = SmallNumRawBits - SmallNumSizeBits
58 };
59
60 static_assert(NumBaseBits == 64 || NumBaseBits == 32,
61 "Unsupported word size");
62
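
A standalone sketch of the small-mode packing described above (hypothetical constexpr helpers, 64-bit host assumed, C++14): bit 0 is the small/large tag, the top SmallNumSizeBits of the remaining word hold the size, and the low bits hold the data.

#include <climits>
#include <cstddef>
#include <cstdint>

constexpr unsigned NumBase  = sizeof(uintptr_t) * CHAR_BIT;  // 64 here
constexpr unsigned RawBits  = NumBase - 1;                   // 63
constexpr unsigned SizeBits = NumBase == 64 ? 6 : 5;         // ceil-log2
constexpr unsigned DataBits = RawBits - SizeBits;            // 57 data bits

constexpr uintptr_t packSmall(uintptr_t Bits, size_t Size) {
  // Assumes Bits fits in Size bits; mirrors switchToSmall plus the setters.
  return ((Bits | (uintptr_t(Size) << DataBits)) << 1) | uintptr_t(1);
}
constexpr size_t smallSize(uintptr_t X) { return (X >> 1) >> DataBits; }
constexpr uintptr_t smallBits(uintptr_t X) {
  return (X >> 1) & ~(~uintptr_t(0) << smallSize(X));
}

static_assert(smallSize(packSmall(0b101, 3)) == 3, "size round-trips");
static_assert(smallBits(packSmall(0b101, 3)) == 0b101, "bits round-trip");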
63public:
64 using size_type = unsigned;
65
66 // Encapsulation of a single bit.
67 class reference {
68 SmallBitVector &TheVector;
69 unsigned BitPos;
70
71 public:
72 reference(SmallBitVector &b, unsigned Idx) : TheVector(b), BitPos(Idx) {}
73
74 reference(const reference&) = default;
75
76 reference& operator=(reference t) {
77 *this = bool(t);
78 return *this;
79 }
80
81 reference& operator=(bool t) {
82 if (t)
83 TheVector.set(BitPos);
84 else
85 TheVector.reset(BitPos);
86 return *this;
87 }
88
89 operator bool() const {
90 return const_cast<const SmallBitVector &>(TheVector).operator[](BitPos);
91 }
92 };
93
94private:
95 bool isSmall() const {
96 return X & uintptr_t(1);
97 }
98
99 BitVector *getPointer() const {
100 assert(!isSmall());
101 return reinterpret_cast<BitVector *>(X);
102 }
103
104 void switchToSmall(uintptr_t NewSmallBits, size_t NewSize) {
105 X = 1;
106 setSmallSize(NewSize);
107 setSmallBits(NewSmallBits);
108 }
109
110 void switchToLarge(BitVector *BV) {
111 X = reinterpret_cast<uintptr_t>(BV);
112 assert(!isSmall() && "Tried to use an unaligned pointer");
113 }
114
115 // Return all the bits used for the "small" representation; this includes
116 // bits for the size as well as the element bits.
117 uintptr_t getSmallRawBits() const {
118 assert(isSmall());
119 return X >> 1;
120 }
121
122 void setSmallRawBits(uintptr_t NewRawBits) {
123 assert(isSmall());
124 X = (NewRawBits << 1) | uintptr_t(1);
125 }
126
127 // Return the size.
128 size_t getSmallSize() const { return getSmallRawBits() >> SmallNumDataBits; }
129
130 void setSmallSize(size_t Size) {
131 setSmallRawBits(getSmallBits() | (Size << SmallNumDataBits));
132 }
133
134 // Return the element bits.
135 uintptr_t getSmallBits() const {
136 return getSmallRawBits() & ~(~uintptr_t(0) << getSmallSize());
137 }
138
139 void setSmallBits(uintptr_t NewBits) {
140 setSmallRawBits((NewBits & ~(~uintptr_t(0) << getSmallSize())) |
141 (getSmallSize() << SmallNumDataBits));
142 }
143
144public:
145 /// Creates an empty bitvector.
146 SmallBitVector() = default;
147
148 /// Creates a bitvector of specified number of bits. All bits are initialized
149 /// to the specified value.
150 explicit SmallBitVector(unsigned s, bool t = false) {
151 if (s <= SmallNumDataBits)
7
Assuming 's' is > SmallNumDataBits
8
Taking false branch
152 switchToSmall(t ? ~uintptr_t(0) : 0, s);
153 else
154 switchToLarge(new BitVector(s, t));
9
Memory is allocated
155 }
156
157 /// SmallBitVector copy ctor.
158 SmallBitVector(const SmallBitVector &RHS) {
159 if (RHS.isSmall())
160 X = RHS.X;
161 else
162 switchToLarge(new BitVector(*RHS.getPointer()));
163 }
164
165 SmallBitVector(SmallBitVector &&RHS) : X(RHS.X) {
166 RHS.X = 1;
167 }
168
169 ~SmallBitVector() {
170 if (!isSmall())
171 delete getPointer();
172 }
173
174 using const_set_bits_iterator = const_set_bits_iterator_impl<SmallBitVector>;
175 using set_iterator = const_set_bits_iterator;
176
177 const_set_bits_iterator set_bits_begin() const {
178 return const_set_bits_iterator(*this);
179 }
180
181 const_set_bits_iterator set_bits_end() const {
182 return const_set_bits_iterator(*this, -1);
183 }
184
185 iterator_range<const_set_bits_iterator> set_bits() const {
186 return make_range(set_bits_begin(), set_bits_end());
187 }
188
189 /// Tests whether there are no bits in this bitvector.
190 bool empty() const {
191 return isSmall() ? getSmallSize() == 0 : getPointer()->empty();
192 }
193
194 /// Returns the number of bits in this bitvector.
195 size_t size() const {
196 return isSmall() ? getSmallSize() : getPointer()->size();
197 }
198
199 /// Returns the number of bits which are set.
200 size_type count() const {
201 if (isSmall()) {
202 uintptr_t Bits = getSmallBits();
203 return countPopulation(Bits);
204 }
205 return getPointer()->count();
206 }
207
208 /// Returns true if any bit is set.
209 bool any() const {
210 if (isSmall())
211 return getSmallBits() != 0;
212 return getPointer()->any();
213 }
214
215 /// Returns true if all bits are set.
216 bool all() const {
217 if (isSmall())
218 return getSmallBits() == (uintptr_t(1) << getSmallSize()) - 1;
219 return getPointer()->all();
220 }
221
222 /// Returns true if none of the bits are set.
223 bool none() const {
224 if (isSmall())
225 return getSmallBits() == 0;
226 return getPointer()->none();
227 }
228
229 /// Returns the index of the first set bit, -1 if none of the bits are set.
230 int find_first() const {
231 if (isSmall()) {
232 uintptr_t Bits = getSmallBits();
233 if (Bits == 0)
234 return -1;
235 return countTrailingZeros(Bits);
236 }
237 return getPointer()->find_first();
238 }
239
240 int find_last() const {
241 if (isSmall()) {
242 uintptr_t Bits = getSmallBits();
243 if (Bits == 0)
244 return -1;
245 return NumBaseBits - countLeadingZeros(Bits);
246 }
247 return getPointer()->find_last();
248 }
249
250 /// Returns the index of the first unset bit, -1 if all of the bits are set.
251 int find_first_unset() const {
252 if (isSmall()) {
253 if (count() == getSmallSize())
254 return -1;
255
256 uintptr_t Bits = getSmallBits();
257 return countTrailingOnes(Bits);
258 }
259 return getPointer()->find_first_unset();
260 }
261
262 int find_last_unset() const {
263 if (isSmall()) {
264 if (count() == getSmallSize())
265 return -1;
266
267 uintptr_t Bits = getSmallBits();
268 return NumBaseBits - countLeadingOnes(Bits);
269 }
270 return getPointer()->find_last_unset();
271 }
272
273 /// Returns the index of the next set bit following the "Prev" bit.
274 /// Returns -1 if the next set bit is not found.
275 int find_next(unsigned Prev) const {
276 if (isSmall()) {
277 uintptr_t Bits = getSmallBits();
278 // Mask off previous bits.
279 Bits &= ~uintptr_t(0) << (Prev + 1);
280 if (Bits == 0 || Prev + 1 >= getSmallSize())
281 return -1;
282 return countTrailingZeros(Bits);
283 }
284 return getPointer()->find_next(Prev);
285 }
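
A standalone sketch of the small-mode scan in find_next (hypothetical findNextSet helper using the GCC/Clang __builtin_ctzll intrinsic; assumes Prev + 1 < 64, as the small representation guarantees for its data bits):

#include <cstdint>

static int findNextSet(uint64_t Bits, unsigned Prev, unsigned Size) {
  Bits &= ~uint64_t(0) << (Prev + 1);   // mask off Prev and everything below
  if (Bits == 0 || Prev + 1 >= Size)
    return -1;
  return __builtin_ctzll(Bits);         // index of the next set bit
}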
286
287 /// Returns the index of the next unset bit following the "Prev" bit.
288 /// Returns -1 if the next unset bit is not found.
289 int find_next_unset(unsigned Prev) const {
290 if (isSmall()) {
291 ++Prev;
292 uintptr_t Bits = getSmallBits();
293 // Mask in previous bits.
294 uintptr_t Mask = (1 << Prev) - 1;
295 Bits |= Mask;
296
297 if (Bits == ~uintptr_t(0) || Prev + 1 >= getSmallSize())
298 return -1;
299 return countTrailingOnes(Bits);
300 }
301 return getPointer()->find_next_unset(Prev);
302 }
303
304   /// find_prev - Returns the index of the first set bit that precedes
305   /// the bit at \p PriorTo. Returns -1 if all previous bits are unset.
306 int find_prev(unsigned PriorTo) const {
307 if (isSmall()) {
308 if (PriorTo == 0)
309 return -1;
310
311 --PriorTo;
312 uintptr_t Bits = getSmallBits();
313 Bits &= maskTrailingOnes<uintptr_t>(PriorTo + 1);
314 if (Bits == 0)
315 return -1;
316
317 return NumBaseBits - countLeadingZeros(Bits) - 1;
318 }
319 return getPointer()->find_prev(PriorTo);
320 }
321
322 /// Clear all bits.
323 void clear() {
324 if (!isSmall())
325 delete getPointer();
326 switchToSmall(0, 0);
327 }
328
329 /// Grow or shrink the bitvector.
330 void resize(unsigned N, bool t = false) {
331 if (!isSmall()) {
332 getPointer()->resize(N, t);
333 } else if (SmallNumDataBits >= N) {
334 uintptr_t NewBits = t ? ~uintptr_t(0) << getSmallSize() : 0;
335 setSmallSize(N);
336 setSmallBits(NewBits | getSmallBits());
337 } else {
338 BitVector *BV = new BitVector(N, t);
339 uintptr_t OldBits = getSmallBits();
340 for (size_t i = 0, e = getSmallSize(); i != e; ++i)
341 (*BV)[i] = (OldBits >> i) & 1;
342 switchToLarge(BV);
343 }
344 }
345
346 void reserve(unsigned N) {
347 if (isSmall()) {
348 if (N > SmallNumDataBits) {
349 uintptr_t OldBits = getSmallRawBits();
350 size_t SmallSize = getSmallSize();
351 BitVector *BV = new BitVector(SmallSize);
352 for (size_t i = 0; i < SmallSize; ++i)
353 if ((OldBits >> i) & 1)
354 BV->set(i);
355 BV->reserve(N);
356 switchToLarge(BV);
357 }
358 } else {
359 getPointer()->reserve(N);
360 }
361 }
362
363 // Set, reset, flip
364 SmallBitVector &set() {
365 if (isSmall())
366 setSmallBits(~uintptr_t(0));
367 else
368 getPointer()->set();
369 return *this;
370 }
371
372 SmallBitVector &set(unsigned Idx) {
373 if (isSmall()) {
374       assert(Idx <= static_cast<unsigned>(
375                  std::numeric_limits<uintptr_t>::digits) &&
376              "undefined behavior");
377 setSmallBits(getSmallBits() | (uintptr_t(1) << Idx));
378 }
379 else
380 getPointer()->set(Idx);
381 return *this;
382 }
383
384 /// Efficiently set a range of bits in [I, E)
385 SmallBitVector &set(unsigned I, unsigned E) {
386     assert(I <= E && "Attempted to set backwards range!");
387     assert(E <= size() && "Attempted to set out-of-bounds range!");
388 if (I == E) return *this;
389 if (isSmall()) {
390 uintptr_t EMask = ((uintptr_t)1) << E;
391 uintptr_t IMask = ((uintptr_t)1) << I;
392 uintptr_t Mask = EMask - IMask;
393 setSmallBits(getSmallBits() | Mask);
394 } else
395 getPointer()->set(I, E);
396 return *this;
397 }
398
399 SmallBitVector &reset() {
400 if (isSmall())
401 setSmallBits(0);
402 else
403 getPointer()->reset();
404 return *this;
405 }
406
407 SmallBitVector &reset(unsigned Idx) {
408 if (isSmall())
409 setSmallBits(getSmallBits() & ~(uintptr_t(1) << Idx));
410 else
411 getPointer()->reset(Idx);
412 return *this;
413 }
414
415 /// Efficiently reset a range of bits in [I, E)
416 SmallBitVector &reset(unsigned I, unsigned E) {
417     assert(I <= E && "Attempted to reset backwards range!");
418     assert(E <= size() && "Attempted to reset out-of-bounds range!");
419 if (I == E) return *this;
420 if (isSmall()) {
421 uintptr_t EMask = ((uintptr_t)1) << E;
422 uintptr_t IMask = ((uintptr_t)1) << I;
423 uintptr_t Mask = EMask - IMask;
424 setSmallBits(getSmallBits() & ~Mask);
425 } else
426 getPointer()->reset(I, E);
427 return *this;
428 }
429
430 SmallBitVector &flip() {
431 if (isSmall())
432 setSmallBits(~getSmallBits());
433 else
434 getPointer()->flip();
435 return *this;
436 }
437
438 SmallBitVector &flip(unsigned Idx) {
439 if (isSmall())
440 setSmallBits(getSmallBits() ^ (uintptr_t(1) << Idx));
441 else
442 getPointer()->flip(Idx);
443 return *this;
444 }
445
446 // No argument flip.
447 SmallBitVector operator~() const {
448 return SmallBitVector(*this).flip();
449 }
450
451 // Indexing.
452 reference operator[](unsigned Idx) {
453     assert(Idx < size() && "Out-of-bounds Bit access.");
454 return reference(*this, Idx);
455 }
456
457 bool operator[](unsigned Idx) const {
458     assert(Idx < size() && "Out-of-bounds Bit access.");
459 if (isSmall())
460 return ((getSmallBits() >> Idx) & 1) != 0;
461 return getPointer()->operator[](Idx);
462 }
463
464 bool test(unsigned Idx) const {
465 return (*this)[Idx];
466 }
467
468 /// Test if any common bits are set.
469 bool anyCommon(const SmallBitVector &RHS) const {
470 if (isSmall() && RHS.isSmall())
471 return (getSmallBits() & RHS.getSmallBits()) != 0;
472 if (!isSmall() && !RHS.isSmall())
473 return getPointer()->anyCommon(*RHS.getPointer());
474
475 for (unsigned i = 0, e = std::min(size(), RHS.size()); i != e; ++i)
476 if (test(i) && RHS.test(i))
477 return true;
478 return false;
479 }
480
481 // Comparison operators.
482 bool operator==(const SmallBitVector &RHS) const {
483 if (size() != RHS.size())
484 return false;
485 if (isSmall())
486 return getSmallBits() == RHS.getSmallBits();
487 else
488 return *getPointer() == *RHS.getPointer();
489 }
490
491 bool operator!=(const SmallBitVector &RHS) const {
492 return !(*this == RHS);
493 }
494
495 // Intersection, union, disjoint union.
496 SmallBitVector &operator&=(const SmallBitVector &RHS) {
497 resize(std::max(size(), RHS.size()));
498 if (isSmall())
499 setSmallBits(getSmallBits() & RHS.getSmallBits());
500 else if (!RHS.isSmall())
501 getPointer()->operator&=(*RHS.getPointer());
502 else {
503 SmallBitVector Copy = RHS;
504 Copy.resize(size());
505 getPointer()->operator&=(*Copy.getPointer());
506 }
507 return *this;
508 }
509
510 /// Reset bits that are set in RHS. Same as *this &= ~RHS.
511 SmallBitVector &reset(const SmallBitVector &RHS) {
512 if (isSmall() && RHS.isSmall())
513 setSmallBits(getSmallBits() & ~RHS.getSmallBits());
514 else if (!isSmall() && !RHS.isSmall())
515 getPointer()->reset(*RHS.getPointer());
516 else
517 for (unsigned i = 0, e = std::min(size(), RHS.size()); i != e; ++i)
518 if (RHS.test(i))
519 reset(i);
520
521 return *this;
522 }
523
524   /// Check if (This - RHS) is nonzero. This is the same as reset(RHS)
      /// followed by any().
525 bool test(const SmallBitVector &RHS) const {
526 if (isSmall() && RHS.isSmall())
527 return (getSmallBits() & ~RHS.getSmallBits()) != 0;
528 if (!isSmall() && !RHS.isSmall())
529 return getPointer()->test(*RHS.getPointer());
530
531 unsigned i, e;
532 for (i = 0, e = std::min(size(), RHS.size()); i != e; ++i)
533 if (test(i) && !RHS.test(i))
534 return true;
535
536 for (e = size(); i != e; ++i)
537 if (test(i))
538 return true;
539
540 return false;
541 }
542
543 SmallBitVector &operator|=(const SmallBitVector &RHS) {
544 resize(std::max(size(), RHS.size()));
545 if (isSmall())
546 setSmallBits(getSmallBits() | RHS.getSmallBits());
547 else if (!RHS.isSmall())
548 getPointer()->operator|=(*RHS.getPointer());
549 else {
550 SmallBitVector Copy = RHS;
551 Copy.resize(size());
552 getPointer()->operator|=(*Copy.getPointer());
553 }
554 return *this;
555 }
556
557 SmallBitVector &operator^=(const SmallBitVector &RHS) {
558 resize(std::max(size(), RHS.size()));
559 if (isSmall())
560 setSmallBits(getSmallBits() ^ RHS.getSmallBits());
561 else if (!RHS.isSmall())
562 getPointer()->operator^=(*RHS.getPointer());
563 else {
564 SmallBitVector Copy = RHS;
565 Copy.resize(size());
566 getPointer()->operator^=(*Copy.getPointer());
567 }
568 return *this;
569 }
570
571 SmallBitVector &operator<<=(unsigned N) {
572 if (isSmall())
573 setSmallBits(getSmallBits() << N);
574 else
575 getPointer()->operator<<=(N);
576 return *this;
577 }
578
579 SmallBitVector &operator>>=(unsigned N) {
580 if (isSmall())
581 setSmallBits(getSmallBits() >> N);
582 else
583 getPointer()->operator>>=(N);
584 return *this;
585 }
586
587 // Assignment operator.
588 const SmallBitVector &operator=(const SmallBitVector &RHS) {
589 if (isSmall()) {
590 if (RHS.isSmall())
591 X = RHS.X;
592 else
593 switchToLarge(new BitVector(*RHS.getPointer()));
594 } else {
595 if (!RHS.isSmall())
596 *getPointer() = *RHS.getPointer();
597 else {
598 delete getPointer();
599 X = RHS.X;
600 }
601 }
602 return *this;
603 }
604
605 const SmallBitVector &operator=(SmallBitVector &&RHS) {
606 if (this != &RHS) {
607 clear();
608 swap(RHS);
609 }
610 return *this;
611 }
612
613 void swap(SmallBitVector &RHS) {
614 std::swap(X, RHS.X);
615 }
616
617 /// Add '1' bits from Mask to this vector. Don't resize.
618 /// This computes "*this |= Mask".
619 void setBitsInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) {
620 if (isSmall())
621 applyMask<true, false>(Mask, MaskWords);
622 else
623 getPointer()->setBitsInMask(Mask, MaskWords);
624 }
625
626 /// Clear any bits in this vector that are set in Mask. Don't resize.
627 /// This computes "*this &= ~Mask".
628 void clearBitsInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) {
629 if (isSmall())
630 applyMask<false, false>(Mask, MaskWords);
631 else
632 getPointer()->clearBitsInMask(Mask, MaskWords);
633 }
634
635 /// Add a bit to this vector for every '0' bit in Mask. Don't resize.
636 /// This computes "*this |= ~Mask".
637 void setBitsNotInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) {
638 if (isSmall())
639 applyMask<true, true>(Mask, MaskWords);
640 else
641 getPointer()->setBitsNotInMask(Mask, MaskWords);
642 }
643
644 /// Clear a bit in this vector for every '0' bit in Mask. Don't resize.
645 /// This computes "*this &= Mask".
646 void clearBitsNotInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) {
647 if (isSmall())
648 applyMask<false, true>(Mask, MaskWords);
649 else
650 getPointer()->clearBitsNotInMask(Mask, MaskWords);
651 }
652
653private:
654 template <bool AddBits, bool InvertMask>
655 void applyMask(const uint32_t *Mask, unsigned MaskWords) {
656     assert(MaskWords <= sizeof(uintptr_t) && "Mask is larger than base!");
657 uintptr_t M = Mask[0];
658 if (NumBaseBits == 64)
659 M |= uint64_t(Mask[1]) << 32;
660 if (InvertMask)
661 M = ~M;
662 if (AddBits)
663 setSmallBits(getSmallBits() | M);
664 else
665 setSmallBits(getSmallBits() & ~M);
666 }
667};
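
// Illustrative usage sketch (editorial addition, not part of the original
// SmallBitVector.h). It exercises the query and search interfaces shown above;
// the function name exampleSmallBitVectorQueries is hypothetical.
inline bool exampleSmallBitVectorQueries() {
  SmallBitVector BV(8);          // starts in the inline "small" representation
  BV.set(1);
  BV.set(4, 7);                  // sets bits 4, 5 and 6; bit 7 stays clear

  bool OK = BV.any() && !BV.all() && BV.count() == 4;
  OK &= BV.find_first() == 1 && BV.find_next(1) == 4;
  OK &= BV.find_first_unset() == 0 && BV.find_last() == 6;

  // Iterate only the set bits via the set_bits() range.
  unsigned Sum = 0;
  for (unsigned Idx : BV.set_bits())
    Sum += Idx;                  // 1 + 4 + 5 + 6 == 16
  OK &= Sum == 16;

  // Growing past SmallNumDataBits transparently switches to a heap-backed
  // BitVector while preserving the existing bits.
  BV.resize(256);
  OK &= BV.size() == 256 && BV.test(4) && !BV.test(200);
  return OK;
}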
668
669inline SmallBitVector
670operator&(const SmallBitVector &LHS, const SmallBitVector &RHS) {
671 SmallBitVector Result(LHS);
672 Result &= RHS;
673 return Result;
674}
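
// Sketch of set intersection (editorial addition; the helper name
// exampleIntersection is hypothetical). operator& copies the left operand and
// then applies operator&=, which first resizes to the larger of the two sizes.
inline bool exampleIntersection(const SmallBitVector &A,
                                const SmallBitVector &B) {
  SmallBitVector Common = A & B;
  // anyCommon(B) answers the same question without materializing the result.
  return Common.any() == A.anyCommon(B);
}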
675
676inline SmallBitVector
677operator|(const SmallBitVector &LHS, const SmallBitVector &RHS) {
678 SmallBitVector Result(LHS);
679 Result |= RHS;
680 return Result;
681}
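
// Sketch of union and difference (editorial addition; exampleUnionAndDifference
// is a hypothetical helper). operator| widens to the larger size, while
// reset(RHS) clears every bit of *this that is also set in RHS.
inline SmallBitVector exampleUnionAndDifference(const SmallBitVector &A,
                                                const SmallBitVector &B) {
  SmallBitVector Union = A | B;   // A union B
  SmallBitVector Diff = Union;
  Diff.reset(B);                  // (A | B) minus B, i.e. the bits only A contributed
  return Diff;
}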
682
683inline SmallBitVector
684operator^(const SmallBitVector &LHS, const SmallBitVector &RHS) {
685 SmallBitVector Result(LHS);
686 Result ^= RHS;
687 return Result;
688}
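
// Sketch of symmetric difference (editorial addition; bitsDiffer is a
// hypothetical helper). Note that operator== also compares sizes, so two
// vectors with the same bit pattern but different lengths are not equal.
inline bool bitsDiffer(const SmallBitVector &A, const SmallBitVector &B) {
  SmallBitVector Sym = A ^ B;
  // Sym holds exactly the positions where A and B disagree; the shorter
  // operand is implicitly zero-extended by the resize inside operator^=.
  return Sym.any();
}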
689
690} // end namespace llvm
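
// Sketch of the word-mask interface (editorial addition; applyWordMask is a
// hypothetical helper, and it assumes BV.size() >= 4). Each uint32_t mask word
// covers 32 bit positions starting at bit 0, and none of these calls resize
// the vector. Two mask words are supplied so the small-representation path,
// which reads Mask[0] and Mask[1] on 64-bit hosts, stays in bounds.
inline void applyWordMask(llvm::SmallBitVector &BV) {
  const uint32_t Mask[2] = {0x0000000F, 0};
  BV.setBitsInMask(Mask, 2);       // set bits 0..3 (BV |= Mask over the first two words)
  BV.clearBitsNotInMask(Mask, 2);  // clear bits 4..63 (BV &= Mask over the first two words)
}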
691
692namespace std {
693
694/// Implement std::swap in terms of SmallBitVector swap.
695inline void
696swap(llvm::SmallBitVector &LHS, llvm::SmallBitVector &RHS) {
697 LHS.swap(RHS);
698}
699
700} // end namespace std
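
// Sketch of swap and move assignment (editorial addition; exampleSwapAndMove is
// a hypothetical helper, and it assumes <utility> is available, as it is pulled
// in transitively by this header). swap() exchanges the raw representation in
// O(1); move assignment clear()s the destination and then swaps.
inline void exampleSwapAndMove(llvm::SmallBitVector &A, llvm::SmallBitVector &B) {
  std::swap(A, B);   // dispatches to SmallBitVector::swap, which swaps X
  A = std::move(B);  // A takes B's contents; B is left empty afterwards
}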
701
702#endif // LLVM_ADT_SMALLBITVECTOR_H