Bug Summary

File: llvm/lib/Target/X86/X86ISelLowering.cpp
Warning: line 33158, column 45
The result of the right shift is undefined due to shifting by '64', which is greater or equal to the width of type 'size_t'
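
The diagnostic reflects the C++ rule that a shift count must be strictly less than the bit width of the (promoted) left operand; shifting a 64-bit size_t by 64 does not yield 0 but is undefined behavior. The sketch below is illustrative only and is not the code at line 33158 (which lies outside the excerpt shown here); the helper name and the explicit width check are assumptions made for the example.

    #include <cstddef>

    // Hypothetical helper: a logical right shift whose amount may reach the
    // full width of size_t. "Value >> Amount" is undefined once Amount is >=
    // the number of bits in size_t (64 on this x86_64 target), so the
    // degenerate case is handled before the shift is performed.
    static size_t shiftRightSafely(size_t Value, unsigned Amount) {
      const unsigned Width = sizeof(size_t) * 8; // 64 on this target
      if (Amount >= Width)
        return 0; // avoids the undefined shift this report flags
      return Value >> Amount;
    }

Guarding the amount as above, or clamping/masking the shift count, are conventional ways to keep such a shift defined; which remedy fits the flagged site depends on the intended semantics there.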

Annotated Source Code

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-10/lib/clang/10.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/build-llvm/lib/Target/X86 -I /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86 -I /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/build-llvm/include -I /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-10/lib/clang/10.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/build-llvm/lib/Target/X86 -fdebug-prefix-map=/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2019-12-11-181444-25759-1 -x c++ /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86/X86ISelLowering.cpp

/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86/X86ISelLowering.cpp

1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "Utils/X86ShuffleDecode.h"
16#include "X86CallingConv.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
20#include "X86MachineFunctionInfo.h"
21#include "X86TargetMachine.h"
22#include "X86TargetObjectFile.h"
23#include "llvm/ADT/SmallBitVector.h"
24#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
26#include "llvm/ADT/StringExtras.h"
27#include "llvm/ADT/StringSwitch.h"
28#include "llvm/Analysis/EHPersonalities.h"
29#include "llvm/CodeGen/IntrinsicLowering.h"
30#include "llvm/CodeGen/MachineFrameInfo.h"
31#include "llvm/CodeGen/MachineFunction.h"
32#include "llvm/CodeGen/MachineInstrBuilder.h"
33#include "llvm/CodeGen/MachineJumpTableInfo.h"
34#include "llvm/CodeGen/MachineModuleInfo.h"
35#include "llvm/CodeGen/MachineRegisterInfo.h"
36#include "llvm/CodeGen/TargetLowering.h"
37#include "llvm/CodeGen/WinEHFuncInfo.h"
38#include "llvm/IR/CallSite.h"
39#include "llvm/IR/CallingConv.h"
40#include "llvm/IR/Constants.h"
41#include "llvm/IR/DerivedTypes.h"
42#include "llvm/IR/DiagnosticInfo.h"
43#include "llvm/IR/Function.h"
44#include "llvm/IR/GlobalAlias.h"
45#include "llvm/IR/GlobalVariable.h"
46#include "llvm/IR/Instructions.h"
47#include "llvm/IR/Intrinsics.h"
48#include "llvm/MC/MCAsmInfo.h"
49#include "llvm/MC/MCContext.h"
50#include "llvm/MC/MCExpr.h"
51#include "llvm/MC/MCSymbol.h"
52#include "llvm/Support/CommandLine.h"
53#include "llvm/Support/Debug.h"
54#include "llvm/Support/ErrorHandling.h"
55#include "llvm/Support/KnownBits.h"
56#include "llvm/Support/MathExtras.h"
57#include "llvm/Target/TargetOptions.h"
58#include <algorithm>
59#include <bitset>
60#include <cctype>
61#include <numeric>
62using namespace llvm;
63
64#define DEBUG_TYPE "x86-isel"
65
66STATISTIC(NumTailCalls, "Number of tail calls");
67
68static cl::opt<int> ExperimentalPrefLoopAlignment(
69 "x86-experimental-pref-loop-alignment", cl::init(4),
70 cl::desc(
71 "Sets the preferable loop alignment for experiments (as log2 bytes)"
72 "(the last x86-experimental-pref-loop-alignment bits"
73 " of the loop header PC will be 0)."),
74 cl::Hidden);
75
76// Added in 10.0.
77static cl::opt<bool> EnableOldKNLABI(
78 "x86-enable-old-knl-abi", cl::init(false),
79 cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of "
80 "one ZMM register on AVX512F, but not AVX512BW targets."),
81 cl::Hidden);
82
83static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
86 "SHIFT, LEA, etc."),
87 cl::Hidden);
88
89static cl::opt<bool> ExperimentalUnorderedISEL(
90 "x86-experimental-unordered-atomic-isel", cl::init(false),
91 cl::desc("Use LoadSDNode and StoreSDNode instead of "
92 "AtomicSDNode for unordered atomic loads and "
93 "stores respectively."),
94 cl::Hidden);
95
96/// Call this when the user attempts to do something unsupported, like
97/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
98/// report_fatal_error, so calling code should attempt to recover without
99/// crashing.
100static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
101 const char *Msg) {
102 MachineFunction &MF = DAG.getMachineFunction();
103 DAG.getContext()->diagnose(
104 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
105}
106
107X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
108 const X86Subtarget &STI)
109 : TargetLowering(TM), Subtarget(STI) {
110 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
111 X86ScalarSSEf64 = Subtarget.hasSSE2();
112 X86ScalarSSEf32 = Subtarget.hasSSE1();
113 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
114
115 // Set up the TargetLowering object.
116
117 // X86 is weird. It always uses i8 for shift amounts and setcc results.
118 setBooleanContents(ZeroOrOneBooleanContent);
119 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
120 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
121
122 // For 64-bit, since we have so many registers, use the ILP scheduler.
123 // For 32-bit, use the register pressure specific scheduling.
124 // For Atom, always use ILP scheduling.
125 if (Subtarget.isAtom())
126 setSchedulingPreference(Sched::ILP);
127 else if (Subtarget.is64Bit())
128 setSchedulingPreference(Sched::ILP);
129 else
130 setSchedulingPreference(Sched::RegPressure);
131 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
132 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
133
134 // Bypass expensive divides and use cheaper ones.
135 if (TM.getOptLevel() >= CodeGenOpt::Default) {
136 if (Subtarget.hasSlowDivide32())
137 addBypassSlowDiv(32, 8);
138 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
139 addBypassSlowDiv(64, 32);
140 }
141
142 if (Subtarget.isTargetWindowsMSVC() ||
143 Subtarget.isTargetWindowsItanium()) {
144 // Setup Windows compiler runtime calls.
145 setLibcallName(RTLIB::SDIV_I64, "_alldiv");
146 setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
147 setLibcallName(RTLIB::SREM_I64, "_allrem");
148 setLibcallName(RTLIB::UREM_I64, "_aullrem");
149 setLibcallName(RTLIB::MUL_I64, "_allmul");
150 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
151 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
152 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
153 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
154 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
155 }
156
157 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
158 // MSVCRT doesn't have powi; fall back to pow
159 setLibcallName(RTLIB::POWI_F32, nullptr);
160 setLibcallName(RTLIB::POWI_F64, nullptr);
161 }
162
163 if (Subtarget.isTargetDarwin()) {
164 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
165 setUseUnderscoreSetJmp(false);
166 setUseUnderscoreLongJmp(false);
167 } else if (Subtarget.isTargetWindowsGNU()) {
168 // MS runtime is weird: it exports _setjmp, but longjmp!
169 setUseUnderscoreSetJmp(true);
170 setUseUnderscoreLongJmp(false);
171 } else {
172 setUseUnderscoreSetJmp(true);
173 setUseUnderscoreLongJmp(true);
174 }
175
176 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
177 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
178 // FIXME: Should we be limiting the atomic size on other configs? Default is
179 // 1024.
180 if (!Subtarget.hasCmpxchg8b())
181 setMaxAtomicSizeInBitsSupported(32);
182
183 // Set up the register classes.
184 addRegisterClass(MVT::i8, &X86::GR8RegClass);
185 addRegisterClass(MVT::i16, &X86::GR16RegClass);
186 addRegisterClass(MVT::i32, &X86::GR32RegClass);
187 if (Subtarget.is64Bit())
188 addRegisterClass(MVT::i64, &X86::GR64RegClass);
189
190 for (MVT VT : MVT::integer_valuetypes())
191 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
192
193 // We don't accept any truncstore of integer registers.
194 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
195 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
196 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
197 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
198 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
199 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
200
201 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
202
203 // SETOEQ and SETUNE require checking two conditions.
204 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
205 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
206 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
207 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
208 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
209 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
210
211 // Integer absolute.
212 if (Subtarget.hasCMov()) {
213 setOperationAction(ISD::ABS , MVT::i16 , Custom);
214 setOperationAction(ISD::ABS , MVT::i32 , Custom);
215 }
216 setOperationAction(ISD::ABS , MVT::i64 , Custom);
217
218 // Funnel shifts.
219 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
220 setOperationAction(ShiftOp , MVT::i16 , Custom);
221 setOperationAction(ShiftOp , MVT::i32 , Custom);
222 if (Subtarget.is64Bit())
223 setOperationAction(ShiftOp , MVT::i64 , Custom);
224 }
225
226 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
227 // operation.
228 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
229 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
230 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
231
232 if (!Subtarget.useSoftFloat()) {
233 // We have an algorithm for SSE2->double, and we turn this into a
234 // 64-bit FILD followed by conditional FADD for other targets.
235 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
236 // We have an algorithm for SSE2, and we turn this into a 64-bit
237 // FILD or VCVTUSI2SS/SD for other targets.
238 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
239 } else {
240 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Expand);
241 }
242
243 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
244 // this operation.
245 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
246 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
247
248 if (!Subtarget.useSoftFloat()) {
249 // SSE has no i16 to fp conversion, only i32.
250 if (X86ScalarSSEf32) {
251 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
252 // f32 and f64 cases are Legal, f80 case is not
253 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
254 } else {
255 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
256 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
257 }
258 } else {
259 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
260 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Expand);
261 }
262
263 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
264 // this operation.
265 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
266 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
267
268 if (!Subtarget.useSoftFloat()) {
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
271 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
272 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
273
274 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
275 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
276 } else {
277 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
278 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
279 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
280 }
281
282 // Handle FP_TO_UINT by promoting the destination to a larger signed
283 // conversion.
284 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
285 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
286 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
287
288 if (!Subtarget.useSoftFloat()) {
289 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
290 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
291 }
292
293 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
294 if (!X86ScalarSSEf64) {
295 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
296 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
297 if (Subtarget.is64Bit()) {
298 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
299 // Without SSE, i64->f64 goes through memory.
300 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
301 }
302 } else if (!Subtarget.is64Bit())
303 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
304
305 // Scalar integer divide and remainder are lowered to use operations that
306 // produce two results, to match the available instructions. This exposes
307 // the two-result form to trivial CSE, which is able to combine x/y and x%y
308 // into a single instruction.
309 //
310 // Scalar integer multiply-high is also lowered to use two-result
311 // operations, to match the available instructions. However, plain multiply
312 // (low) operations are left as Legal, as there are single-result
313 // instructions for this in x86. Using the two-result multiply instructions
314 // when both high and low results are needed must be arranged by dagcombine.
315 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
316 setOperationAction(ISD::MULHS, VT, Expand);
317 setOperationAction(ISD::MULHU, VT, Expand);
318 setOperationAction(ISD::SDIV, VT, Expand);
319 setOperationAction(ISD::UDIV, VT, Expand);
320 setOperationAction(ISD::SREM, VT, Expand);
321 setOperationAction(ISD::UREM, VT, Expand);
322 }
323
324 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
325 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
326 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
327 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
328 setOperationAction(ISD::BR_CC, VT, Expand);
329 setOperationAction(ISD::SELECT_CC, VT, Expand);
330 }
331 if (Subtarget.is64Bit())
332 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
333 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
334 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
335 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
336
337 setOperationAction(ISD::FREM , MVT::f32 , Expand);
338 setOperationAction(ISD::FREM , MVT::f64 , Expand);
339 setOperationAction(ISD::FREM , MVT::f80 , Expand);
340 setOperationAction(ISD::FREM , MVT::f128 , Expand);
341 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
342
343 // Promote the i8 variants and force them on up to i32 which has a shorter
344 // encoding.
345 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
346 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
347 if (!Subtarget.hasBMI()) {
348 setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
349 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
350 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
351 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
352 if (Subtarget.is64Bit()) {
353 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
354 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
355 }
356 }
357
358 if (Subtarget.hasLZCNT()) {
359 // When promoting the i8 variants, force them to i32 for a shorter
360 // encoding.
361 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
362 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
363 } else {
364 setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
365 setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
366 setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
367 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
368 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
369 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
370 if (Subtarget.is64Bit()) {
371 setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
372 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
373 }
374 }
375
376 // Special handling for half-precision floating point conversions.
377 // If we don't have F16C support, then lower half float conversions
378 // into library calls.
379 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
380 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
381 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
382 }
383
384 // There's never any support for operations beyond MVT::f32.
385 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
386 setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
387 setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand);
388 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
389 setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
390 setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand);
391
392 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
393 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
394 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
395 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
396 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
397 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
398 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
399 setTruncStoreAction(MVT::f128, MVT::f16, Expand);
400
401 if (Subtarget.hasPOPCNT()) {
402 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
403 } else {
404 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
405 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
406 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
407 if (Subtarget.is64Bit())
408 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
409 else
410 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
411 }
412
413 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
414
415 if (!Subtarget.hasMOVBE())
416 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
417
418 // These should be promoted to a larger select which is supported.
419 setOperationAction(ISD::SELECT , MVT::i1 , Promote);
420 // X86 wants to expand cmov itself.
421 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
422 setOperationAction(ISD::SELECT, VT, Custom);
423 setOperationAction(ISD::SETCC, VT, Custom);
424 }
425 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
426 if (VT == MVT::i64 && !Subtarget.is64Bit())
427 continue;
428 setOperationAction(ISD::SELECT, VT, Custom);
429 setOperationAction(ISD::SETCC, VT, Custom);
430 }
431
432 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
433 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
434 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
435
436 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
437 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
438 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
439 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
440 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
441 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
442 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
443 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
444
445 // Darwin ABI issue.
446 for (auto VT : { MVT::i32, MVT::i64 }) {
447 if (VT == MVT::i64 && !Subtarget.is64Bit())
448 continue;
449 setOperationAction(ISD::ConstantPool , VT, Custom);
450 setOperationAction(ISD::JumpTable , VT, Custom);
451 setOperationAction(ISD::GlobalAddress , VT, Custom);
452 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
453 setOperationAction(ISD::ExternalSymbol , VT, Custom);
454 setOperationAction(ISD::BlockAddress , VT, Custom);
455 }
456
457 // 64-bit shl, sra, srl (iff 32-bit x86)
458 for (auto VT : { MVT::i32, MVT::i64 }) {
459 if (VT == MVT::i64 && !Subtarget.is64Bit())
460 continue;
461 setOperationAction(ISD::SHL_PARTS, VT, Custom);
462 setOperationAction(ISD::SRA_PARTS, VT, Custom);
463 setOperationAction(ISD::SRL_PARTS, VT, Custom);
464 }
465
466 if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
467 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
468
469 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
470
471 // Expand certain atomics
472 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
473 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
474 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
475 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
476 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
477 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
478 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
479 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
480 }
481
482 if (!Subtarget.is64Bit())
483 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
484
485 if (Subtarget.hasCmpxchg16b()) {
486 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
487 }
488
489 // FIXME - use subtarget debug flags
490 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
491 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
492 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
493 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
494 }
495
496 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
497 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
498
499 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
500 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
501
502 setOperationAction(ISD::TRAP, MVT::Other, Legal);
503 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
504
505 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
506 setOperationAction(ISD::VASTART , MVT::Other, Custom);
507 setOperationAction(ISD::VAEND , MVT::Other, Expand);
508 bool Is64Bit = Subtarget.is64Bit();
509 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
510 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
511
512 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
513 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
514
515 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
516
517 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
518 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
519 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
520
521 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
522 // f32 and f64 use SSE.
523 // Set up the FP register classes.
524 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
525 : &X86::FR32RegClass);
526 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
527 : &X86::FR64RegClass);
528
529 // Disable f32->f64 extload as we can only generate this in one instruction
530 // under optsize. So it's easier to pattern match (fpext (load)) for that
531 // case instead of needing to emit 2 instructions for extload in the
532 // non-optsize case.
533 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
534
535 for (auto VT : { MVT::f32, MVT::f64 }) {
536 // Use ANDPD to simulate FABS.
537 setOperationAction(ISD::FABS, VT, Custom);
538
539 // Use XORP to simulate FNEG.
540 setOperationAction(ISD::FNEG, VT, Custom);
541
542 // Use ANDPD and ORPD to simulate FCOPYSIGN.
543 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
544
545 // These might be better off as horizontal vector ops.
546 setOperationAction(ISD::FADD, VT, Custom);
547 setOperationAction(ISD::FSUB, VT, Custom);
548
549 // We don't support sin/cos/fmod
550 setOperationAction(ISD::FSIN , VT, Expand);
551 setOperationAction(ISD::FCOS , VT, Expand);
552 setOperationAction(ISD::FSINCOS, VT, Expand);
553 }
554
555 // Lower this to MOVMSK plus an AND.
556 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
557 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
558
559 } else if (!useSoftFloat() && X86ScalarSSEf32 && (UseX87 || Is64Bit)) {
560 // Use SSE for f32, x87 for f64.
561 // Set up the FP register classes.
562 addRegisterClass(MVT::f32, &X86::FR32RegClass);
563 if (UseX87)
564 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
565
566 // Use ANDPS to simulate FABS.
567 setOperationAction(ISD::FABS , MVT::f32, Custom);
568
569 // Use XORP to simulate FNEG.
570 setOperationAction(ISD::FNEG , MVT::f32, Custom);
571
572 if (UseX87)
573 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
574
575 // Use ANDPS and ORPS to simulate FCOPYSIGN.
576 if (UseX87)
577 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
578 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
579
580 // We don't support sin/cos/fmod
581 setOperationAction(ISD::FSIN , MVT::f32, Expand);
582 setOperationAction(ISD::FCOS , MVT::f32, Expand);
583 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
584
585 if (UseX87) {
586 // Always expand sin/cos functions even though x87 has an instruction.
587 setOperationAction(ISD::FSIN, MVT::f64, Expand);
588 setOperationAction(ISD::FCOS, MVT::f64, Expand);
589 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
590 }
591 } else if (UseX87) {
592 // f32 and f64 in x87.
593 // Set up the FP register classes.
594 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
595 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
596
597 for (auto VT : { MVT::f32, MVT::f64 }) {
598 setOperationAction(ISD::UNDEF, VT, Expand);
599 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
600
601 // Always expand sin/cos functions even though x87 has an instruction.
602 setOperationAction(ISD::FSIN , VT, Expand);
603 setOperationAction(ISD::FCOS , VT, Expand);
604 setOperationAction(ISD::FSINCOS, VT, Expand);
605 }
606 }
607
608 // Expand FP32 immediates into loads from the stack, save special cases.
609 if (isTypeLegal(MVT::f32)) {
610 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
611 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
612 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
613 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
614 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
615 } else // SSE immediates.
616 addLegalFPImmediate(APFloat(+0.0f)); // xorps
617 }
618 // Expand FP64 immediates into loads from the stack, save special cases.
619 if (isTypeLegal(MVT::f64)) {
620 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
621 addLegalFPImmediate(APFloat(+0.0)); // FLD0
622 addLegalFPImmediate(APFloat(+1.0)); // FLD1
623 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
624 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
625 } else // SSE immediates.
626 addLegalFPImmediate(APFloat(+0.0)); // xorpd
627 }
628
629 // We don't support FMA.
630 setOperationAction(ISD::FMA, MVT::f64, Expand);
631 setOperationAction(ISD::FMA, MVT::f32, Expand);
632
633 // f80 always uses X87.
634 if (UseX87) {
635 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
636 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
637 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
638 {
639 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
640 addLegalFPImmediate(TmpFlt); // FLD0
641 TmpFlt.changeSign();
642 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
643
644 bool ignored;
645 APFloat TmpFlt2(+1.0);
646 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
647 &ignored);
648 addLegalFPImmediate(TmpFlt2); // FLD1
649 TmpFlt2.changeSign();
650 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
651 }
652
653 // Always expand sin/cos functions even though x87 has an instruction.
654 setOperationAction(ISD::FSIN , MVT::f80, Expand);
655 setOperationAction(ISD::FCOS , MVT::f80, Expand);
656 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
657
658 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
659 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
660 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
661 setOperationAction(ISD::FRINT, MVT::f80, Expand);
662 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
663 setOperationAction(ISD::FMA, MVT::f80, Expand);
664 setOperationAction(ISD::LROUND, MVT::f80, Expand);
665 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
666 setOperationAction(ISD::LRINT, MVT::f80, Expand);
667 setOperationAction(ISD::LLRINT, MVT::f80, Expand);
668 }
669
670 // f128 uses xmm registers, but most operations require libcalls.
671 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
672 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
673 : &X86::VR128RegClass);
674
675 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
676
677 setOperationAction(ISD::FADD, MVT::f128, Custom);
678 setOperationAction(ISD::FSUB, MVT::f128, Custom);
679 setOperationAction(ISD::FDIV, MVT::f128, Custom);
680 setOperationAction(ISD::FMUL, MVT::f128, Custom);
681 setOperationAction(ISD::FMA, MVT::f128, Expand);
682
683 setOperationAction(ISD::FABS, MVT::f128, Custom);
684 setOperationAction(ISD::FNEG, MVT::f128, Custom);
685 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
686
687 setOperationAction(ISD::FSIN, MVT::f128, Expand);
688 setOperationAction(ISD::FCOS, MVT::f128, Expand);
689 setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
690 setOperationAction(ISD::FSQRT, MVT::f128, Expand);
691
692 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
693 // We need to custom handle any FP_ROUND with an f128 input, but
694 // LegalizeDAG uses the result type to know when to run a custom handler.
695 // So we have to list all legal floating point result types here.
696 if (isTypeLegal(MVT::f32)) {
697 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
698 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
699 }
700 if (isTypeLegal(MVT::f64)) {
701 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
702 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
703 }
704 if (isTypeLegal(MVT::f80)) {
705 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
706 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
707 }
708
709 setOperationAction(ISD::SETCC, MVT::f128, Custom);
710
711 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
712 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
713 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
714 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
715 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
716 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
717 }
718
719 // Always use a library call for pow.
720 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
721 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
722 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
723 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
724
725 setOperationAction(ISD::FLOG, MVT::f80, Expand);
726 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
727 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
728 setOperationAction(ISD::FEXP, MVT::f80, Expand);
729 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
730 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
731 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
732
733 // Some FP actions are always expanded for vector types.
734 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
735 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
736 setOperationAction(ISD::FSIN, VT, Expand);
737 setOperationAction(ISD::FSINCOS, VT, Expand);
738 setOperationAction(ISD::FCOS, VT, Expand);
739 setOperationAction(ISD::FREM, VT, Expand);
740 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
741 setOperationAction(ISD::FPOW, VT, Expand);
742 setOperationAction(ISD::FLOG, VT, Expand);
743 setOperationAction(ISD::FLOG2, VT, Expand);
744 setOperationAction(ISD::FLOG10, VT, Expand);
745 setOperationAction(ISD::FEXP, VT, Expand);
746 setOperationAction(ISD::FEXP2, VT, Expand);
747 }
748
749 // First set operation action for all vector types to either promote
750 // (for widening) or expand (for scalarization). Then we will selectively
751 // turn on ones that can be effectively codegen'd.
752 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
753 setOperationAction(ISD::SDIV, VT, Expand);
754 setOperationAction(ISD::UDIV, VT, Expand);
755 setOperationAction(ISD::SREM, VT, Expand);
756 setOperationAction(ISD::UREM, VT, Expand);
757 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
758 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
759 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
760 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
761 setOperationAction(ISD::FMA, VT, Expand);
762 setOperationAction(ISD::FFLOOR, VT, Expand);
763 setOperationAction(ISD::FCEIL, VT, Expand);
764 setOperationAction(ISD::FTRUNC, VT, Expand);
765 setOperationAction(ISD::FRINT, VT, Expand);
766 setOperationAction(ISD::FNEARBYINT, VT, Expand);
767 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
768 setOperationAction(ISD::MULHS, VT, Expand);
769 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
770 setOperationAction(ISD::MULHU, VT, Expand);
771 setOperationAction(ISD::SDIVREM, VT, Expand);
772 setOperationAction(ISD::UDIVREM, VT, Expand);
773 setOperationAction(ISD::CTPOP, VT, Expand);
774 setOperationAction(ISD::CTTZ, VT, Expand);
775 setOperationAction(ISD::CTLZ, VT, Expand);
776 setOperationAction(ISD::ROTL, VT, Expand);
777 setOperationAction(ISD::ROTR, VT, Expand);
778 setOperationAction(ISD::BSWAP, VT, Expand);
779 setOperationAction(ISD::SETCC, VT, Expand);
780 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
781 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
782 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
783 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
784 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
785 setOperationAction(ISD::TRUNCATE, VT, Expand);
786 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
787 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
788 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
789 setOperationAction(ISD::SELECT_CC, VT, Expand);
790 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
791 setTruncStoreAction(InnerVT, VT, Expand);
792
793 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
794 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
795
796 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
797 // types, we have to deal with them whether we ask for Expansion or not.
798 // Setting Expand causes its own optimisation problems though, so leave
799 // them legal.
800 if (VT.getVectorElementType() == MVT::i1)
801 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
802
803 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
804 // split/scalarized right now.
805 if (VT.getVectorElementType() == MVT::f16)
806 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
807 }
808 }
809
810 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
811 // with -msoft-float, disable use of MMX as well.
812 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
813 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
814 // No operations on x86mmx supported, everything uses intrinsics.
815 }
816
817 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
818 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
819 : &X86::VR128RegClass);
820
821 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
822 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
823 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
824 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
825 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
826 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
827 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
828 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
829 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
830
831 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
832 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
833
834 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Custom);
835 }
836
837 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
838 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
839 : &X86::VR128RegClass);
840
841 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
842 // registers cannot be used even for integer operations.
843 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
844 : &X86::VR128RegClass);
845 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
846 : &X86::VR128RegClass);
847 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
848 : &X86::VR128RegClass);
849 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
850 : &X86::VR128RegClass);
851
852 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
853 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
854 setOperationAction(ISD::SDIV, VT, Custom);
855 setOperationAction(ISD::SREM, VT, Custom);
856 setOperationAction(ISD::UDIV, VT, Custom);
857 setOperationAction(ISD::UREM, VT, Custom);
858 }
859
860 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
861 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
862 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
863
864 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
865 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
866 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
867 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
868 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
869 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
870 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
871 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
872 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
873 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
874 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
875 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
876 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
877
878 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
879 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
880 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
881 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
882 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
883 }
884
885 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
886 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
887 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
888 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
889 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
890 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
891 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
892 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
893 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
894 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
895 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
896 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
897
898 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
899 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
900 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
901
902 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
903 setOperationAction(ISD::SETCC, VT, Custom);
904 setOperationAction(ISD::CTPOP, VT, Custom);
905 setOperationAction(ISD::ABS, VT, Custom);
906
907 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
908 // setcc all the way to isel and prefer SETGT in some isel patterns.
909 setCondCodeAction(ISD::SETLT, VT, Custom);
910 setCondCodeAction(ISD::SETLE, VT, Custom);
911 }
912
913 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
914 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
915 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
916 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
917 setOperationAction(ISD::VSELECT, VT, Custom);
918 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
919 }
920
921 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
922 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
923 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
924 setOperationAction(ISD::VSELECT, VT, Custom);
925
926 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
927 continue;
928
929 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
930 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
931 }
932
933 // Custom lower v2i64 and v2f64 selects.
934 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
935 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
936 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
937 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
938 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
939
940 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
941 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
942
943 // Custom legalize these to avoid over promotion or custom promotion.
944 setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom);
945 setOperationAction(ISD::FP_TO_SINT, MVT::v4i8, Custom);
946 setOperationAction(ISD::FP_TO_SINT, MVT::v8i8, Custom);
947 setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom);
948 setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
949 setOperationAction(ISD::FP_TO_UINT, MVT::v2i8, Custom);
950 setOperationAction(ISD::FP_TO_UINT, MVT::v4i8, Custom);
951 setOperationAction(ISD::FP_TO_UINT, MVT::v8i8, Custom);
952 setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom);
953 setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
954
955 // By marking FP_TO_SINT v8i16 as Custom, we trick type legalization into
956 // promoting v8i8 FP_TO_UINT into FP_TO_SINT. When the v8i16 FP_TO_SINT is
957 // split again based on the input type, this will cause an AssertSExt i16 to
958 // be emitted instead of an AssertZExt. This will allow packssdw followed by
959 // packuswb to be used to truncate to v8i8. This is necessary since packusdw
960 // isn't available until sse4.1.
961 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
962
963 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
964 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
965
966 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
967
968 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
969 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
970
971 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
972 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
973
974 // We want to legalize this to an f64 load rather than an i64 load on
975 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
976 // store.
977 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
978 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
979 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
980 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
981 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
982 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
983
984 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
985 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
986 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
987 if (!Subtarget.hasAVX512())
988 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
989
990 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
991 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
992 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
993
994 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
995
996 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
997 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
998 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
999 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1000 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1001 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1002
1003 // In the customized shift lowering, the legal v4i32/v2i64 cases
1004 // in AVX2 will be recognized.
1005 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1006 setOperationAction(ISD::SRL, VT, Custom);
1007 setOperationAction(ISD::SHL, VT, Custom);
1008 setOperationAction(ISD::SRA, VT, Custom);
1009 }
1010
1011 setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
1012 setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
1013
1014 // With AVX512, expanding (and promoting the shifts) is better.
1015 if (!Subtarget.hasAVX512())
1016 setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
1017 }
1018
1019 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1020 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1021 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1022 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1023 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1024 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1025 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1026 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1027 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1028
1029 // These might be better off as horizontal vector ops.
1030 setOperationAction(ISD::ADD, MVT::i16, Custom);
1031 setOperationAction(ISD::ADD, MVT::i32, Custom);
1032 setOperationAction(ISD::SUB, MVT::i16, Custom);
1033 setOperationAction(ISD::SUB, MVT::i32, Custom);
1034 }
1035
1036 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1037 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1038 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1039 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1040 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1041 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1042 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1043 }
1044
1045 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1046 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1047 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1048 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1049 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1050 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1051 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1052 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1053
1054 // FIXME: Do we need to handle scalar-to-vector here?
1055 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1056
1057 // We directly match byte blends in the backend as they match the VSELECT
1058 // condition form.
1059 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1060
1061 // SSE41 brings specific instructions for doing vector sign extend even in
1062 // cases where we don't have SRA.
1063 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1064 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1065 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1066 }
1067
1068 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1069 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1070 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1071 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1072 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1073 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1074 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1075 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1076 }
1077
1078 // i8 vectors are custom because the source register and source
1079 // memory operand types are not the same width.
1080 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1081 }
1082
1083 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1084 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1085 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1086 setOperationAction(ISD::ROTL, VT, Custom);
1087
1088 // XOP can efficiently perform BITREVERSE with VPPERM.
1089 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1090 setOperationAction(ISD::BITREVERSE, VT, Custom);
1091
1092 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1093 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1094 setOperationAction(ISD::BITREVERSE, VT, Custom);
1095 }
1096
1097 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1098 bool HasInt256 = Subtarget.hasInt256();
1099
1100 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1101 : &X86::VR256RegClass);
1102 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1103 : &X86::VR256RegClass);
1104 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1105 : &X86::VR256RegClass);
1106 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1107 : &X86::VR256RegClass);
1108 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1109 : &X86::VR256RegClass);
1110 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1111 : &X86::VR256RegClass);
1112
1113 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1114 setOperationAction(ISD::FFLOOR, VT, Legal);
1115 setOperationAction(ISD::FCEIL, VT, Legal);
1116 setOperationAction(ISD::FTRUNC, VT, Legal);
1117 setOperationAction(ISD::FRINT, VT, Legal);
1118 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1119 setOperationAction(ISD::FNEG, VT, Custom);
1120 setOperationAction(ISD::FABS, VT, Custom);
1121 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1122 }
1123
1124 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1125 // even though v8i16 is a legal type.
1126 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1127 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1128 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1129
1130 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1131
1132 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Custom);
1133
1134 if (!Subtarget.hasAVX512())
1135 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1136
1137 // In the customized shift lowering, the legal v8i32/v4i64 cases
1138 // in AVX2 will be recognized.
1139 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1140 setOperationAction(ISD::SRL, VT, Custom);
1141 setOperationAction(ISD::SHL, VT, Custom);
1142 setOperationAction(ISD::SRA, VT, Custom);
1143 }
1144
1145 // These types need custom splitting if their input is a 128-bit vector.
1146 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1147 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1148 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1149 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1150
1151 setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
1152 setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
1153
1154 // With BWI, expanding (and promoting the shifts) is better.
1155 if (!Subtarget.hasBWI())
1156 setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
1157
1158 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1159 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1160 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1161 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1162 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1163 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1164
1165 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1166 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1167 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1168 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1169 }
1170
1171 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1172 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1173 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1174 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1175
1176 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1177 setOperationAction(ISD::SETCC, VT, Custom);
1178 setOperationAction(ISD::CTPOP, VT, Custom);
1179 setOperationAction(ISD::CTLZ, VT, Custom);
1180
1181 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1182 // setcc all the way to isel and prefer SETGT in some isel patterns.
1183 setCondCodeAction(ISD::SETLT, VT, Custom);
1184 setCondCodeAction(ISD::SETLE, VT, Custom);
1185 }
1186
1187 if (Subtarget.hasAnyFMA()) {
1188 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1189 MVT::v2f64, MVT::v4f64 })
1190 setOperationAction(ISD::FMA, VT, Legal);
1191 }
1192
1193 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1194 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1195 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1196 }
1197
1198 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1199 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1200 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1201 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1202
1203 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1204 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1205 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1206 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1207 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1208 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1209
1210 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1211 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1212 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1213 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1214 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1215
1216 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1217 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1218 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1219 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1220 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1221 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1222 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1223 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1224
1225 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1226 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1227 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1228 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1229 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1230 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1231 }
1232
1233 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1234 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1235 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1236 }
1237
1238 if (HasInt256) {
1239 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1240 // when we have a 256-bit-wide blend with immediate.
1241 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1242
1243 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1244 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1245 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1246 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1247 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1248 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1249 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1250 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1251 }
1252 }
1253
1254 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1255 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1256 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1257 setOperationAction(ISD::MSTORE, VT, Legal);
1258 }
1259
1260 // Extract subvector is special because the value type
1261 // (result) is 128-bit but the source is 256-bit wide.
1262 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1263 MVT::v4f32, MVT::v2f64 }) {
1264 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1265 }
1266
1267 // Custom lower several nodes for 256-bit types.
1268 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1269 MVT::v8f32, MVT::v4f64 }) {
1270 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1271 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1272 setOperationAction(ISD::VSELECT, VT, Custom);
1273 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1274 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1275 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1276 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1277 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1278 setOperationAction(ISD::STORE, VT, Custom);
1279 }
1280
1281 if (HasInt256) {
1282 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1283
1284 // Custom legalize 2x32 to get a little better code.
1285 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1286 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1287
1288 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1289 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1290 setOperationAction(ISD::MGATHER, VT, Custom);
1291 }
1292 }
1293
1294 // This block controls legalization of the mask vector sizes that are
1295 // available with AVX512. 512-bit vectors are in a separate block controlled
1296 // by useAVX512Regs.
1297 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1298 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1299 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1300 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1301 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1302 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1303
1304 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1305 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1306 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1307
1308 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1309 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1310 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1311 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1312 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1313 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1314
1315 // There is no byte sized k-register load or store without AVX512DQ.
1316 if (!Subtarget.hasDQI()) {
1317 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1318 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1319 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1320 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1321
1322 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1323 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1324 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1325 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1326 }
1327
1328 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1329 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1330 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1331 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1332 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1333 }
1334
1335 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1336 setOperationAction(ISD::ADD, VT, Custom);
1337 setOperationAction(ISD::SUB, VT, Custom);
1338 setOperationAction(ISD::MUL, VT, Custom);
1339 setOperationAction(ISD::SETCC, VT, Custom);
1340 setOperationAction(ISD::SELECT, VT, Custom);
1341 setOperationAction(ISD::TRUNCATE, VT, Custom);
1342 setOperationAction(ISD::UADDSAT, VT, Custom);
1343 setOperationAction(ISD::SADDSAT, VT, Custom);
1344 setOperationAction(ISD::USUBSAT, VT, Custom);
1345 setOperationAction(ISD::SSUBSAT, VT, Custom);
1346
1347 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1348 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1349 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1350 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1351 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1352 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1353 setOperationAction(ISD::VSELECT, VT, Expand);
1354 }
1355
1356 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1357 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1358 }
1359
1360 // This block controls legalization for 512-bit operations with 32/64 bit
1361 // elements. 512-bits can be disabled based on prefer-vector-width and
1362 // required-vector-width function attributes.
1363 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1364 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1365 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1366 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1367 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1368
1369 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1370 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1371 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1372 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1373 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1374 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1375 }
1376
1377 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1378 setOperationAction(ISD::FNEG, VT, Custom);
1379 setOperationAction(ISD::FABS, VT, Custom);
1380 setOperationAction(ISD::FMA, VT, Legal);
1381 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1382 }
1383
1384 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1385 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
1386 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
1387 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
1388 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1389 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
1390 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
1391 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
1392 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1393 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1394
1395 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f32, Custom);
1396
1397 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1398 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1399 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1400 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1401 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1402
1403 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1404 // to 512-bit rather than use the AVX2 instructions so that we can use
1405 // k-masks.
1406 if (!Subtarget.hasVLX()) {
1407 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1408 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1409 setOperationAction(ISD::MLOAD, VT, Custom);
1410 setOperationAction(ISD::MSTORE, VT, Custom);
1411 }
1412 }
1413
1414 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1415 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1416 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1417 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1418 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1419 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1420 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1421 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1422
1423 // Need to custom widen this if we don't have AVX512BW.
1424 setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
1425 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
1426 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
1427
1428 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1429 setOperationAction(ISD::FFLOOR, VT, Legal);
1430 setOperationAction(ISD::FCEIL, VT, Legal);
1431 setOperationAction(ISD::FTRUNC, VT, Legal);
1432 setOperationAction(ISD::FRINT, VT, Legal);
1433 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1434
1435 setOperationAction(ISD::SELECT, VT, Custom);
1436 }
1437
1438 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1439 for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) {
1440 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1441 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1442 }
1443
1444 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1445 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1446 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1447 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1448
1449 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1450 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1451
1452 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1453 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1454
1455 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1456 setOperationAction(ISD::SMAX, VT, Legal);
1457 setOperationAction(ISD::UMAX, VT, Legal);
1458 setOperationAction(ISD::SMIN, VT, Legal);
1459 setOperationAction(ISD::UMIN, VT, Legal);
1460 setOperationAction(ISD::ABS, VT, Legal);
1461 setOperationAction(ISD::SRL, VT, Custom);
1462 setOperationAction(ISD::SHL, VT, Custom);
1463 setOperationAction(ISD::SRA, VT, Custom);
1464 setOperationAction(ISD::CTPOP, VT, Custom);
1465 setOperationAction(ISD::ROTL, VT, Custom);
1466 setOperationAction(ISD::ROTR, VT, Custom);
1467 setOperationAction(ISD::SETCC, VT, Custom);
1468 setOperationAction(ISD::SELECT, VT, Custom);
1469
1470 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1471 // setcc all the way to isel and prefer SETGT in some isel patterns.
1472 setCondCodeAction(ISD::SETLT, VT, Custom);
1473 setCondCodeAction(ISD::SETLE, VT, Custom);
1474 }
1475
1476 if (Subtarget.hasDQI()) {
1477 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1478 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1479 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1480 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1481
1482 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1483 }
1484
1485 if (Subtarget.hasCDI()) {
1486 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1487 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1488 setOperationAction(ISD::CTLZ, VT, Legal);
1489 }
1490 } // Subtarget.hasCDI()
1491
1492 if (Subtarget.hasVPOPCNTDQ()) {
1493 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1494 setOperationAction(ISD::CTPOP, VT, Legal);
1495 }
1496
1497 // Extract subvector is special because the value type
1498 // (result) is 256-bit but the source is 512-bit wide.
1499 // 128-bit was made Legal under AVX1.
1500 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1501 MVT::v8f32, MVT::v4f64 })
1502 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1503
1504 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1505 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1506 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1507 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1508 setOperationAction(ISD::VSELECT, VT, Custom);
1509 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1510 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1511 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1512 setOperationAction(ISD::MLOAD, VT, Legal);
1513 setOperationAction(ISD::MSTORE, VT, Legal);
1514 setOperationAction(ISD::MGATHER, VT, Custom);
1515 setOperationAction(ISD::MSCATTER, VT, Custom);
1516 }
1517 if (!Subtarget.hasBWI()) {
1518 // Need to custom split v32i16/v64i8 bitcasts.
1519 setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
1520 setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
1521
1522 // Better to split these into two 256-bit ops.
1523 setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom);
1524 setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom);
1525 }
1526
1527 if (Subtarget.hasVBMI2()) {
1528 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1529 setOperationAction(ISD::FSHL, VT, Custom);
1530 setOperationAction(ISD::FSHR, VT, Custom);
1531 }
1532 }
1533 } // has AVX-512
1534
1535 // This block controls legalization for operations that don't have
1536 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1537 // narrower widths.
1538 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1539 // These operations are handled on non-VLX by artificially widening in
1540 // isel patterns.
1541 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1542
1543 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1544 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1545 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1546 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1547 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1548
1549 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1550 setOperationAction(ISD::SMAX, VT, Legal);
1551 setOperationAction(ISD::UMAX, VT, Legal);
1552 setOperationAction(ISD::SMIN, VT, Legal);
1553 setOperationAction(ISD::UMIN, VT, Legal);
1554 setOperationAction(ISD::ABS, VT, Legal);
1555 }
1556
1557 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1558 setOperationAction(ISD::ROTL, VT, Custom);
1559 setOperationAction(ISD::ROTR, VT, Custom);
1560 }
1561
1562 // Custom legalize 2x32 to get a little better code.
1563 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1564 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1565
1566 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1567 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1568 setOperationAction(ISD::MSCATTER, VT, Custom);
1569
1570 if (Subtarget.hasDQI()) {
1571 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1572 setOperationAction(ISD::SINT_TO_FP, VT, Legal);
1573 setOperationAction(ISD::UINT_TO_FP, VT, Legal);
1574 setOperationAction(ISD::FP_TO_SINT, VT, Legal);
1575 setOperationAction(ISD::FP_TO_UINT, VT, Legal);
1576
1577 setOperationAction(ISD::MUL, VT, Legal);
1578 }
1579 }
1580
1581 if (Subtarget.hasCDI()) {
1582 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1583 setOperationAction(ISD::CTLZ, VT, Legal);
1584 }
1585 } // Subtarget.hasCDI()
1586
1587 if (Subtarget.hasVPOPCNTDQ()) {
1588 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1589 setOperationAction(ISD::CTPOP, VT, Legal);
1590 }
1591 }
1592
1593 // This block controls legalization of v32i1/v64i1, which are available with
1594 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1595 // useBWIRegs.
1596 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1597 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1598 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1599
1600 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1601 setOperationAction(ISD::ADD, VT, Custom);
1602 setOperationAction(ISD::SUB, VT, Custom);
1603 setOperationAction(ISD::MUL, VT, Custom);
1604 setOperationAction(ISD::VSELECT, VT, Expand);
1605 setOperationAction(ISD::UADDSAT, VT, Custom);
1606 setOperationAction(ISD::SADDSAT, VT, Custom);
1607 setOperationAction(ISD::USUBSAT, VT, Custom);
1608 setOperationAction(ISD::SSUBSAT, VT, Custom);
1609
1610 setOperationAction(ISD::TRUNCATE, VT, Custom);
1611 setOperationAction(ISD::SETCC, VT, Custom);
1612 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1613 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1614 setOperationAction(ISD::SELECT, VT, Custom);
1615 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1616 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1617 }
1618
1619 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1620 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1621 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1622 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1623 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1624 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1625
1626 // Extends from v32i1 masks to 256-bit vectors.
1627 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1628 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1629 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1630 }
1631
1632 // This block controls legalization for v32i16 and v64i8. 512-bits can be
1633 // disabled based on prefer-vector-width and required-vector-width function
1634 // attributes.
1635 if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
1636 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1637 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1638
1639 // Extends from v64i1 masks to 512-bit vectors.
1640 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1641 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1642 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1643
1644 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1645 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1646 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1647 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1648 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1649 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1650 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1651 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1652 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1653 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1654 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1655 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1656 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1657 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1658 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1659 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1660 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1661 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1662 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1663 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1664 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1665 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1666 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1667
1668 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1669 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1670
1671 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1672
1673 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1674 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1675 setOperationAction(ISD::VSELECT, VT, Custom);
1676 setOperationAction(ISD::ABS, VT, Legal);
1677 setOperationAction(ISD::SRL, VT, Custom);
1678 setOperationAction(ISD::SHL, VT, Custom);
1679 setOperationAction(ISD::SRA, VT, Custom);
1680 setOperationAction(ISD::MLOAD, VT, Legal);
1681 setOperationAction(ISD::MSTORE, VT, Legal);
1682 setOperationAction(ISD::CTPOP, VT, Custom);
1683 setOperationAction(ISD::CTLZ, VT, Custom);
1684 setOperationAction(ISD::SMAX, VT, Legal);
1685 setOperationAction(ISD::UMAX, VT, Legal);
1686 setOperationAction(ISD::SMIN, VT, Legal);
1687 setOperationAction(ISD::UMIN, VT, Legal);
1688 setOperationAction(ISD::SETCC, VT, Custom);
1689 setOperationAction(ISD::UADDSAT, VT, Legal);
1690 setOperationAction(ISD::SADDSAT, VT, Legal);
1691 setOperationAction(ISD::USUBSAT, VT, Legal);
1692 setOperationAction(ISD::SSUBSAT, VT, Legal);
1693 setOperationAction(ISD::SELECT, VT, Custom);
1694
1695 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1696 // setcc all the way to isel and prefer SETGT in some isel patterns.
1697 setCondCodeAction(ISD::SETLT, VT, Custom);
1698 setCondCodeAction(ISD::SETLE, VT, Custom);
1699 }
1700
1701 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1702 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1703 }
1704
1705 if (Subtarget.hasBITALG()) {
1706 for (auto VT : { MVT::v64i8, MVT::v32i16 })
1707 setOperationAction(ISD::CTPOP, VT, Legal);
1708 }
1709
1710 if (Subtarget.hasVBMI2()) {
1711 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1712 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1713 }
1714 }
1715
1716 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1717 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1718 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1719 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1720 }
1721
1722 // These operations are handled on non-VLX by artificially widening in
1723 // isel patterns.
1724 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1725
1726 if (Subtarget.hasBITALG()) {
1727 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1728 setOperationAction(ISD::CTPOP, VT, Legal);
1729 }
1730 }
1731
1732 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1733 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1734 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1735 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1736 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1737 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1738
1739 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1740 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1741 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1742 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1743 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1744
1745 if (Subtarget.hasDQI()) {
1746 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1747 // v2f32 UINT_TO_FP is already custom under SSE2.
1748 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1749      assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1750             "Unexpected operation action!");
1751 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1752 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1753 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1754 }
1755
1756 if (Subtarget.hasBWI()) {
1757 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1758 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1759 }
1760
1761 if (Subtarget.hasVBMI2()) {
1762 // TODO: Make these legal even without VLX?
1763 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1764 MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1765 setOperationAction(ISD::FSHL, VT, Custom);
1766 setOperationAction(ISD::FSHR, VT, Custom);
1767 }
1768 }
1769
1770 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
1771 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
1772 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1773 }
1774
1775 // We want to custom lower some of our intrinsics.
1776 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1777 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1778 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1779 if (!Subtarget.is64Bit()) {
1780 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1781 }
1782
1783 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1784 // handle type legalization for these operations here.
1785 //
1786 // FIXME: We really should do custom legalization for addition and
1787 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1788 // than generic legalization for 64-bit multiplication-with-overflow, though.
1789 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1790 if (VT == MVT::i64 && !Subtarget.is64Bit())
1791 continue;
1792 // Add/Sub/Mul with overflow operations are custom lowered.
1793 setOperationAction(ISD::SADDO, VT, Custom);
1794 setOperationAction(ISD::UADDO, VT, Custom);
1795 setOperationAction(ISD::SSUBO, VT, Custom);
1796 setOperationAction(ISD::USUBO, VT, Custom);
1797 setOperationAction(ISD::SMULO, VT, Custom);
1798 setOperationAction(ISD::UMULO, VT, Custom);
1799
1800 // Support carry in as value rather than glue.
1801 setOperationAction(ISD::ADDCARRY, VT, Custom);
1802 setOperationAction(ISD::SUBCARRY, VT, Custom);
1803 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1804 }
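  // A minimal standalone sketch (illustrative helpers, not part of this file)
  // of what the custom SADDO/UADDO lowering above ultimately has to compute:
  // the sum plus a separate overflow flag, which maps directly onto x86's
  // ADD followed by SETO / SETC.
  #include <cstdint>
  static bool saddo32(int32_t A, int32_t B, int32_t &Sum) {
    return __builtin_add_overflow(A, B, &Sum); // true iff the signed add overflowed
  }
  static bool uaddo32(uint32_t A, uint32_t B, uint32_t &Sum) {
    return __builtin_add_overflow(A, B, &Sum); // true iff the carry flag would be set
  }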
1805
1806 if (!Subtarget.is64Bit()) {
1807 // These libcalls are not available in 32-bit.
1808 setLibcallName(RTLIB::SHL_I128, nullptr);
1809 setLibcallName(RTLIB::SRL_I128, nullptr);
1810 setLibcallName(RTLIB::SRA_I128, nullptr);
1811 setLibcallName(RTLIB::MUL_I128, nullptr);
1812 }
1813
1814 // Combine sin / cos into _sincos_stret if it is available.
1815 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1816 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1817 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1818 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1819 }
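  // Hedged sketch of the combine above: one call yields both results instead
  // of separate sinf/cosf libcalls. The struct and helper names are
  // illustrative and do not reflect the actual _sincos_stret ABI.
  #include <cmath>
  struct SinCosF { float Sin, Cos; };
  static SinCosF sincosCombined(float X) {
    return {std::sin(X), std::cos(X)}; // a real lowering emits a single libcall
  }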
1820
1821 if (Subtarget.isTargetWin64()) {
1822 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1823 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1824 setOperationAction(ISD::SREM, MVT::i128, Custom);
1825 setOperationAction(ISD::UREM, MVT::i128, Custom);
1826 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1827 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1828 }
1829
1830 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1831 // is. We should promote the value to 64-bits to solve this.
1832 // This is what the CRT headers do - `fmodf` is an inline header
1833 // function casting to f64 and calling `fmod`.
1834 if (Subtarget.is32Bit() &&
1835 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
1836 for (ISD::NodeType Op :
1837 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1838 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1839 if (isOperationExpand(Op, MVT::f32))
1840 setOperationAction(Op, MVT::f32, Promote);
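  // A minimal standalone sketch of the promotion described above, mirroring
  // what the MSVC CRT's inline fmodf does; the helper name is illustrative.
  #include <cmath>
  static float fmodfPromoted(float X, float Y) {
    // Promote f32 -> f64, use the available fmod(f64), truncate back to f32.
    return static_cast<float>(std::fmod(static_cast<double>(X),
                                        static_cast<double>(Y)));
  }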
1841
1842 // We have target-specific dag combine patterns for the following nodes:
1843 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1844 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
1845 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1846 setTargetDAGCombine(ISD::CONCAT_VECTORS);
1847 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1848 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
1849 setTargetDAGCombine(ISD::BITCAST);
1850 setTargetDAGCombine(ISD::VSELECT);
1851 setTargetDAGCombine(ISD::SELECT);
1852 setTargetDAGCombine(ISD::SHL);
1853 setTargetDAGCombine(ISD::SRA);
1854 setTargetDAGCombine(ISD::SRL);
1855 setTargetDAGCombine(ISD::OR);
1856 setTargetDAGCombine(ISD::AND);
1857 setTargetDAGCombine(ISD::ADD);
1858 setTargetDAGCombine(ISD::FADD);
1859 setTargetDAGCombine(ISD::FSUB);
1860 setTargetDAGCombine(ISD::FNEG);
1861 setTargetDAGCombine(ISD::FMA);
1862 setTargetDAGCombine(ISD::FMINNUM);
1863 setTargetDAGCombine(ISD::FMAXNUM);
1864 setTargetDAGCombine(ISD::SUB);
1865 setTargetDAGCombine(ISD::LOAD);
1866 setTargetDAGCombine(ISD::MLOAD);
1867 setTargetDAGCombine(ISD::STORE);
1868 setTargetDAGCombine(ISD::MSTORE);
1869 setTargetDAGCombine(ISD::TRUNCATE);
1870 setTargetDAGCombine(ISD::ZERO_EXTEND);
1871 setTargetDAGCombine(ISD::ANY_EXTEND);
1872 setTargetDAGCombine(ISD::SIGN_EXTEND);
1873 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1874 setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
1875 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1876 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1877 setTargetDAGCombine(ISD::SINT_TO_FP);
1878 setTargetDAGCombine(ISD::UINT_TO_FP);
1879 setTargetDAGCombine(ISD::SETCC);
1880 setTargetDAGCombine(ISD::MUL);
1881 setTargetDAGCombine(ISD::XOR);
1882 setTargetDAGCombine(ISD::MSCATTER);
1883 setTargetDAGCombine(ISD::MGATHER);
1884
1885 computeRegisterProperties(Subtarget.getRegisterInfo());
1886
1887 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1888 MaxStoresPerMemsetOptSize = 8;
1889 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1890 MaxStoresPerMemcpyOptSize = 4;
1891 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1892 MaxStoresPerMemmoveOptSize = 4;
1893
1894 // TODO: These control memcmp expansion in CGP and could be raised higher, but
1895 // that needs to be benchmarked and balanced with the potential use of vector
1896 // load/store types (PR33329, PR33914).
1897 MaxLoadsPerMemcmp = 2;
1898 MaxLoadsPerMemcmpOptSize = 2;
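  // Hedged illustration of a two-load memcmp expansion for a 16-byte equality
  // compare (plain C++, not the IR that CodeGenPrepare actually emits):
  #include <cstdint>
  #include <cstring>
  static bool equal16Bytes(const void *A, const void *B) {
    uint64_t A0, A1, B0, B1;
    std::memcpy(&A0, A, 8);
    std::memcpy(&A1, static_cast<const char *>(A) + 8, 8);
    std::memcpy(&B0, B, 8);
    std::memcpy(&B1, static_cast<const char *>(B) + 8, 8);
    return ((A0 ^ B0) | (A1 ^ B1)) == 0; // two loads per side, one combined test
  }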
1899
1900 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1901 setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
1902
1903 // An out-of-order CPU can speculatively execute past a predictable branch,
1904 // but a conditional move could be stalled by an expensive earlier operation.
1905 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1906 EnableExtLdPromotion = true;
1907 setPrefFunctionAlignment(Align(16));
1908
1909 verifyIntrinsicTables();
1910}
1911
1912// This has so far only been implemented for 64-bit MachO.
1913bool X86TargetLowering::useLoadStackGuardNode() const {
1914 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1915}
1916
1917bool X86TargetLowering::useStackGuardXorFP() const {
1918 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
1919 return Subtarget.getTargetTriple().isOSMSVCRT();
1920}
1921
1922SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
1923 const SDLoc &DL) const {
1924 EVT PtrTy = getPointerTy(DAG.getDataLayout());
1925 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
1926 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
1927 return SDValue(Node, 0);
1928}
1929
1930TargetLoweringBase::LegalizeTypeAction
1931X86TargetLowering::getPreferredVectorAction(MVT VT) const {
1932 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1933 return TypeSplitVector;
1934
1935 if (VT.getVectorNumElements() != 1 &&
1936 VT.getVectorElementType() != MVT::i1)
1937 return TypeWidenVector;
1938
1939 return TargetLoweringBase::getPreferredVectorAction(VT);
1940}
1941
1942MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1943 CallingConv::ID CC,
1944 EVT VT) const {
1945 // v32i1 vectors should be promoted to v32i8 to match avx2.
1946 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1947 return MVT::v32i8;
1948 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
1949 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
1950 Subtarget.hasAVX512() &&
1951 (!isPowerOf2_32(VT.getVectorNumElements()) ||
1952 (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
1953 (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
1954 return MVT::i8;
1955 // FIXME: Should we just make these types legal and custom split operations?
1956 if ((VT == MVT::v32i16 || VT == MVT::v64i8) &&
1957 Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI)
1958 return MVT::v16i32;
1959 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1960}
1961
1962unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1963 CallingConv::ID CC,
1964 EVT VT) const {
1965 // v32i1 vectors should be promoted to v32i8 to match avx2.
1966 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1967 return 1;
1968 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
1969 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
1970 Subtarget.hasAVX512() &&
1971 (!isPowerOf2_32(VT.getVectorNumElements()) ||
1972 (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
1973 (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
1974 return VT.getVectorNumElements();
1975 // FIXME: Should we just make these types legal and custom split operations?
1976 if ((VT == MVT::v32i16 || VT == MVT::v64i8) &&
1977 Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI)
1978 return 1;
1979 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1980}
1981
1982unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
1983 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1984 unsigned &NumIntermediates, MVT &RegisterVT) const {
1985 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
1986 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
1987 Subtarget.hasAVX512() &&
1988 (!isPowerOf2_32(VT.getVectorNumElements()) ||
1989 (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
1990 (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) {
1991 RegisterVT = MVT::i8;
1992 IntermediateVT = MVT::i1;
1993 NumIntermediates = VT.getVectorNumElements();
1994 return NumIntermediates;
1995 }
1996
1997 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
1998 NumIntermediates, RegisterVT);
1999}
2000
2001EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2002 LLVMContext& Context,
2003 EVT VT) const {
2004 if (!VT.isVector())
2005 return MVT::i8;
2006
2007 if (Subtarget.hasAVX512()) {
2008 const unsigned NumElts = VT.getVectorNumElements();
2009
2010 // Figure out what this type will be legalized to.
2011 EVT LegalVT = VT;
2012 while (getTypeAction(Context, LegalVT) != TypeLegal)
2013 LegalVT = getTypeToTransformTo(Context, LegalVT);
2014
2015 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2016 if (LegalVT.getSimpleVT().is512BitVector())
2017 return EVT::getVectorVT(Context, MVT::i1, NumElts);
2018
2019 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2020 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2021 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2022 // vXi16/vXi8.
2023 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2024 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2025 return EVT::getVectorVT(Context, MVT::i1, NumElts);
2026 }
2027 }
2028
2029 return VT.changeVectorElementTypeToInteger();
2030}
2031
2032/// Helper for getByValTypeAlignment to determine
2033/// the desired ByVal argument alignment.
2034static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
2035 if (MaxAlign == 16)
2036 return;
2037 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2038 if (VTy->getBitWidth() == 128)
2039 MaxAlign = 16;
2040 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2041 unsigned EltAlign = 0;
2042 getMaxByValAlign(ATy->getElementType(), EltAlign);
2043 if (EltAlign > MaxAlign)
2044 MaxAlign = EltAlign;
2045 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2046 for (auto *EltTy : STy->elements()) {
2047 unsigned EltAlign = 0;
2048 getMaxByValAlign(EltTy, EltAlign);
2049 if (EltAlign > MaxAlign)
2050 MaxAlign = EltAlign;
2051 if (MaxAlign == 16)
2052 break;
2053 }
2054 }
2055}
2056
2057/// Return the desired alignment for ByVal aggregate
2058/// function arguments in the caller parameter area. For X86, aggregates
2059/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2060/// are at 4-byte boundaries.
2061unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
2062 const DataLayout &DL) const {
2063 if (Subtarget.is64Bit()) {
2064 // Max of 8 and alignment of type.
2065 unsigned TyAlign = DL.getABITypeAlignment(Ty);
2066 if (TyAlign > 8)
2067 return TyAlign;
2068 return 8;
2069 }
2070
2071 unsigned Align = 4;
2072 if (Subtarget.hasSSE1())
2073 getMaxByValAlign(Ty, Align);
2074 return Align;
2075}
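// Condensed sketch of the rule implemented above; ContainsSSEVector stands in
// for the recursive getMaxByValAlign walk and is purely illustrative.
static unsigned byValAlignSketch(bool Is64Bit, unsigned ABITypeAlign,
                                 bool ContainsSSEVector) {
  if (Is64Bit)
    return ABITypeAlign > 8 ? ABITypeAlign : 8; // max of 8 and the type's ABI alignment
  return ContainsSSEVector ? 16 : 4;            // 16-byte for SSE aggregates, else 4
}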
2076
2077/// Returns the target specific optimal type for load
2078/// and store operations as a result of memset, memcpy, and memmove
2079 /// lowering. If DstAlign is zero, that means it's safe because any destination
2080 /// alignment can satisfy the constraint. Similarly, if SrcAlign is zero it
2081 /// means there isn't a need to check it against an alignment requirement,
2082/// probably because the source does not need to be loaded. If 'IsMemset' is
2083/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
2084/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
2085/// source is constant so it does not need to be loaded.
2086/// It returns EVT::Other if the type should be determined using generic
2087/// target-independent logic.
2088/// For vector ops we check that the overall size isn't larger than our
2089/// preferred vector width.
2090EVT X86TargetLowering::getOptimalMemOpType(
2091 uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
2092 bool ZeroMemset, bool MemcpyStrSrc,
2093 const AttributeList &FuncAttributes) const {
2094 if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
2095 if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
2096 ((DstAlign == 0 || DstAlign >= 16) &&
2097 (SrcAlign == 0 || SrcAlign >= 16)))) {
2098 // FIXME: Check if unaligned 64-byte accesses are slow.
2099 if (Size >= 64 && Subtarget.hasAVX512() &&
2100 (Subtarget.getPreferVectorWidth() >= 512)) {
2101 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2102 }
2103 // FIXME: Check if unaligned 32-byte accesses are slow.
2104 if (Size >= 32 && Subtarget.hasAVX() &&
2105 (Subtarget.getPreferVectorWidth() >= 256)) {
2106 // Although this isn't a well-supported type for AVX1, we'll let
2107 // legalization and shuffle lowering produce the optimal codegen. If we
2108 // choose an optimal type with a vector element larger than a byte,
2109 // getMemsetStores() may create an intermediate splat (using an integer
2110 // multiply) before we splat as a vector.
2111 return MVT::v32i8;
2112 }
2113 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2114 return MVT::v16i8;
2115 // TODO: Can SSE1 handle a byte vector?
2116 // If we have SSE1 registers we should be able to use them.
2117 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2118 (Subtarget.getPreferVectorWidth() >= 128))
2119 return MVT::v4f32;
2120 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
2121 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2122 // Do not use f64 to lower memcpy if source is string constant. It's
2123 // better to use i32 to avoid the loads.
2124 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2125 // The gymnastics of splatting a byte value into an XMM register and then
2126 // only using 8-byte stores (because this is a CPU with slow unaligned
2127 // 16-byte accesses) makes that a loser.
2128 return MVT::f64;
2129 }
2130 }
2131 // This is a compromise. If we reach here, unaligned accesses may be slow on
2132 // this target. However, creating smaller, aligned accesses could be even
2133 // slower and would certainly be a lot more code.
2134 if (Subtarget.is64Bit() && Size >= 8)
2135 return MVT::i64;
2136 return MVT::i32;
2137}
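// Worked examples of the selection above, assuming NoImplicitFloat is not set
// and unaligned 16-byte access is fast on the subtarget:
//   Size >= 64, AVX-512 + BWI, preferred width >= 512  -> MVT::v64i8
//   Size >= 32, AVX, preferred width >= 256            -> MVT::v32i8
//   Size >= 16, SSE2, preferred width >= 128           -> MVT::v16i8
//   otherwise, 64-bit target and Size >= 8             -> MVT::i64, else MVT::i32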
2138
2139bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2140 if (VT == MVT::f32)
2141 return X86ScalarSSEf32;
2142 else if (VT == MVT::f64)
2143 return X86ScalarSSEf64;
2144 return true;
2145}
2146
2147bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2148 EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
2149 bool *Fast) const {
2150 if (Fast) {
2151 switch (VT.getSizeInBits()) {
2152 default:
2153 // 8-byte and under are always assumed to be fast.
2154 *Fast = true;
2155 break;
2156 case 128:
2157 *Fast = !Subtarget.isUnalignedMem16Slow();
2158 break;
2159 case 256:
2160 *Fast = !Subtarget.isUnalignedMem32Slow();
2161 break;
2162 // TODO: What about AVX-512 (512-bit) accesses?
2163 }
2164 }
2165 // NonTemporal vector memory ops must be aligned.
2166 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2167     // NT loads can only be vector aligned, so if it's less aligned than the
2168 // minimum vector size (which we can split the vector down to), we might as
2169 // well use a regular unaligned vector load.
2170 // We don't have any NT loads pre-SSE41.
2171 if (!!(Flags & MachineMemOperand::MOLoad))
2172 return (Align < 16 || !Subtarget.hasSSE41());
2173 return false;
2174 }
2175 // Misaligned accesses of any size are always allowed.
2176 return true;
2177}
2178
2179/// Return the entry encoding for a jump table in the
2180/// current function. The returned value is a member of the
2181/// MachineJumpTableInfo::JTEntryKind enum.
2182unsigned X86TargetLowering::getJumpTableEncoding() const {
2183 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2184 // symbol.
2185 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2186 return MachineJumpTableInfo::EK_Custom32;
2187
2188 // Otherwise, use the normal jump table encoding heuristics.
2189 return TargetLowering::getJumpTableEncoding();
2190}
2191
2192bool X86TargetLowering::useSoftFloat() const {
2193 return Subtarget.useSoftFloat();
2194}
2195
2196void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2197 ArgListTy &Args) const {
2198
2199 // Only relabel X86-32 for C / Stdcall CCs.
2200 if (Subtarget.is64Bit())
2201 return;
2202 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2203 return;
2204 unsigned ParamRegs = 0;
2205 if (auto *M = MF->getFunction().getParent())
2206 ParamRegs = M->getNumberRegisterParameters();
2207
2208   // Mark the first N integer arguments as being passed in registers.
2209 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
2210 Type *T = Args[Idx].Ty;
2211 if (T->isIntOrPtrTy())
2212 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2213 unsigned numRegs = 1;
2214 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2215 numRegs = 2;
2216 if (ParamRegs < numRegs)
2217 return;
2218 ParamRegs -= numRegs;
2219 Args[Idx].IsInReg = true;
2220 }
2221 }
2222}
2223
2224const MCExpr *
2225X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2226 const MachineBasicBlock *MBB,
2227 unsigned uid,MCContext &Ctx) const{
2228   assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2229 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2230 // entries.
2231 return MCSymbolRefExpr::create(MBB->getSymbol(),
2232 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2233}
2234
2235/// Returns relocation base for the given PIC jumptable.
2236SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2237 SelectionDAG &DAG) const {
2238 if (!Subtarget.is64Bit())
2239 // This doesn't have SDLoc associated with it, but is not really the
2240 // same as a Register.
2241 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2242 getPointerTy(DAG.getDataLayout()));
2243 return Table;
2244}
2245
2246/// This returns the relocation base for the given PIC jumptable,
2247/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2248const MCExpr *X86TargetLowering::
2249getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2250 MCContext &Ctx) const {
2251 // X86-64 uses RIP relative addressing based on the jump table label.
2252 if (Subtarget.isPICStyleRIPRel())
2253 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2254
2255 // Otherwise, the reference is relative to the PIC base.
2256 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2257}
2258
2259std::pair<const TargetRegisterClass *, uint8_t>
2260X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2261 MVT VT) const {
2262 const TargetRegisterClass *RRC = nullptr;
2263 uint8_t Cost = 1;
2264 switch (VT.SimpleTy) {
2265 default:
2266 return TargetLowering::findRepresentativeClass(TRI, VT);
2267 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2268 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2269 break;
2270 case MVT::x86mmx:
2271 RRC = &X86::VR64RegClass;
2272 break;
2273 case MVT::f32: case MVT::f64:
2274 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2275 case MVT::v4f32: case MVT::v2f64:
2276 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2277 case MVT::v8f32: case MVT::v4f64:
2278 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2279 case MVT::v16f32: case MVT::v8f64:
2280 RRC = &X86::VR128XRegClass;
2281 break;
2282 }
2283 return std::make_pair(RRC, Cost);
2284}
2285
2286unsigned X86TargetLowering::getAddressSpace() const {
2287 if (Subtarget.is64Bit())
2288 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2289 return 256;
2290}
2291
2292static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2293 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2294 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2295}
2296
2297static Constant* SegmentOffset(IRBuilder<> &IRB,
2298 unsigned Offset, unsigned AddressSpace) {
2299 return ConstantExpr::getIntToPtr(
2300 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2301 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2302}
2303
2304Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2305 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2306 // tcbhead_t; use it instead of the usual global variable (see
2307 // sysdeps/{i386,x86_64}/nptl/tls.h)
2308 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2309 if (Subtarget.isTargetFuchsia()) {
2310 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2311 return SegmentOffset(IRB, 0x10, getAddressSpace());
2312 } else {
2313 // %fs:0x28, unless we're using a Kernel code model, in which case
2314       // it's %gs:0x28. %gs:0x14 on i386.
2315 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2316 return SegmentOffset(IRB, Offset, getAddressSpace());
2317 }
2318 }
2319
2320 return TargetLowering::getIRStackGuard(IRB);
2321}
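// Hedged, glibc/x86-64-only sketch of what the segment-relative slot above
// refers to: the stack guard sits at %fs:0x28 in tcbhead_t (i386 uses
// %gs:0x14). The helper name is illustrative.
#include <cstdint>
static uintptr_t readGlibcStackGuard() {
  uintptr_t Guard;
  asm volatile("movq %%fs:0x28, %0" : "=r"(Guard));
  return Guard;
}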
2322
2323void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2324   // The MSVC CRT provides functionality for stack protection.
2325 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2326 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2327 // MSVC CRT has a global variable holding security cookie.
2328 M.getOrInsertGlobal("__security_cookie",
2329 Type::getInt8PtrTy(M.getContext()));
2330
2331 // MSVC CRT has a function to validate security cookie.
2332 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
2333 "__security_check_cookie", Type::getVoidTy(M.getContext()),
2334 Type::getInt8PtrTy(M.getContext()));
2335 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
2336 F->setCallingConv(CallingConv::X86_FastCall);
2337 F->addAttribute(1, Attribute::AttrKind::InReg);
2338 }
2339 return;
2340 }
2341 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2342 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2343 return;
2344 TargetLowering::insertSSPDeclarations(M);
2345}
2346
2347Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2348 // MSVC CRT has a global variable holding security cookie.
2349 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2350 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2351 return M.getGlobalVariable("__security_cookie");
2352 }
2353 return TargetLowering::getSDagStackGuard(M);
2354}
2355
2356Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2357 // MSVC CRT has a function to validate security cookie.
2358 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2359 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2360 return M.getFunction("__security_check_cookie");
2361 }
2362 return TargetLowering::getSSPStackGuardCheck(M);
2363}
2364
2365Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2366 if (Subtarget.getTargetTriple().isOSContiki())
2367 return getDefaultSafeStackPointerLocation(IRB, false);
2368
2369 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2370 // definition of TLS_SLOT_SAFESTACK in
2371 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2372 if (Subtarget.isTargetAndroid()) {
2373     // %fs:0x48, unless we're using a Kernel code model, in which case it's
2374     // %gs:0x48. %gs:0x24 on i386.
2375 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2376 return SegmentOffset(IRB, Offset, getAddressSpace());
2377 }
2378
2379 // Fuchsia is similar.
2380 if (Subtarget.isTargetFuchsia()) {
2381 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2382 return SegmentOffset(IRB, 0x18, getAddressSpace());
2383 }
2384
2385 return TargetLowering::getSafeStackPointerLocation(IRB);
2386}
2387
2388bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2389 unsigned DestAS) const {
2390   assert(SrcAS != DestAS && "Expected different address spaces!");
2391
2392 return SrcAS < 256 && DestAS < 256;
2393}
2394
2395//===----------------------------------------------------------------------===//
2396// Return Value Calling Convention Implementation
2397//===----------------------------------------------------------------------===//
2398
2399bool X86TargetLowering::CanLowerReturn(
2400 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2401 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2402 SmallVector<CCValAssign, 16> RVLocs;
2403 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2404 return CCInfo.CheckReturn(Outs, RetCC_X86);
2405}
2406
2407const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2408 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2409 return ScratchRegs;
2410}
2411
2412 /// Lowers mask values (v*i1) to the local register values.
2413/// \returns DAG node after lowering to register type
2414static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2415 const SDLoc &Dl, SelectionDAG &DAG) {
2416 EVT ValVT = ValArg.getValueType();
2417
2418 if (ValVT == MVT::v1i1)
2419 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2420 DAG.getIntPtrConstant(0, Dl));
2421
2422 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2423 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2424 // Two stage lowering might be required
2425 // bitcast: v8i1 -> i8 / v16i1 -> i16
2426 // anyextend: i8 -> i32 / i16 -> i32
2427 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2428 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2429 if (ValLoc == MVT::i32)
2430 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2431 return ValToCopy;
2432 }
2433
2434 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2435 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2436 // One stage lowering is required
2437 // bitcast: v32i1 -> i32 / v64i1 -> i64
2438 return DAG.getBitcast(ValLoc, ValArg);
2439 }
2440
2441 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2442}
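// Standalone illustration (plain integer code, not DAG nodes) of the two-stage
// v8i1 path above: pack the i1 lanes into an i8, then widen that i8 to i32.
#include <cstdint>
static uint32_t packV8i1ToI32(const bool Lanes[8]) {
  uint8_t Packed = 0;                 // "bitcast": v8i1 -> i8
  for (int I = 0; I < 8; ++I)
    if (Lanes[I])
      Packed |= uint8_t(1u << I);
  return Packed;                      // "any_extend": i8 -> i32
}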
2443
2444 /// Breaks a v64i1 value into two registers and adds the new nodes to the DAG.
2445static void Passv64i1ArgInRegs(
2446 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
2447 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, CCValAssign &VA,
2448 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2449   assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2450   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2451   assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2452   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2453          "The value should reside in two registers");
2454
2455 // Before splitting the value we cast it to i64
2456 Arg = DAG.getBitcast(MVT::i64, Arg);
2457
2458 // Splitting the value into two i32 types
2459 SDValue Lo, Hi;
2460 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2461 DAG.getConstant(0, Dl, MVT::i32));
2462 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2463 DAG.getConstant(1, Dl, MVT::i32));
2464
2465 // Attach the two i32 types into corresponding registers
2466 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2467 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2468}
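// A minimal standalone sketch (not part of the lowering code above, and using
// plain integers rather than SDValues) of the split Passv64i1ArgInRegs performs
// on the underlying 64-bit mask; the variable names are illustrative only:
//   uint64_t Mask = ...;                              // bitcast of the v64i1 value
//   uint32_t Lo = static_cast<uint32_t>(Mask);        // goes into VA's register
//   uint32_t Hi = static_cast<uint32_t>(Mask >> 32);  // goes into NextVA's register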
2469
2470SDValue
2471X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2472 bool isVarArg,
2473 const SmallVectorImpl<ISD::OutputArg> &Outs,
2474 const SmallVectorImpl<SDValue> &OutVals,
2475 const SDLoc &dl, SelectionDAG &DAG) const {
2476 MachineFunction &MF = DAG.getMachineFunction();
2477 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2478
2479 // In some cases we need to disable registers from the default CSR list.
2480 // For example, when they are used for argument passing.
2481 bool ShouldDisableCalleeSavedRegister =
2482 CallConv == CallingConv::X86_RegCall ||
2483 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2484
2485 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2486 report_fatal_error("X86 interrupts may not return any value");
2487
2488 SmallVector<CCValAssign, 16> RVLocs;
2489 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2490 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2491
2492 SDValue Flag;
2493 SmallVector<SDValue, 6> RetOps;
2494 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2495 // Operand #1 = Bytes To Pop
2496 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2497 MVT::i32));
2498
2499 // Copy the result values into the output registers.
2500 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2501 ++I, ++OutsIndex) {
2502 CCValAssign &VA = RVLocs[I];
2503 assert(VA.isRegLoc() && "Can only return in registers!");
2504
2505 // Add the register to the CalleeSaveDisableRegs list.
2506 if (ShouldDisableCalleeSavedRegister)
2507 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2508
2509 SDValue ValToCopy = OutVals[OutsIndex];
2510 EVT ValVT = ValToCopy.getValueType();
2511
2512 // Promote values to the appropriate types.
2513 if (VA.getLocInfo() == CCValAssign::SExt)
2514 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2515 else if (VA.getLocInfo() == CCValAssign::ZExt)
2516 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2517 else if (VA.getLocInfo() == CCValAssign::AExt) {
2518 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2519 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2520 else
2521 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2522 }
2523 else if (VA.getLocInfo() == CCValAssign::BCvt)
2524 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2525
2526 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2527 "Unexpected FP-extend for return value.");
2528
2529 // If this is x86-64, and we disabled SSE, we can't return FP values,
2530 // or SSE or MMX vectors.
2531 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2532 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2533 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2534 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2535 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2536 } else if (ValVT == MVT::f64 &&
2537 (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2538 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2539 // llvm-gcc has never done it right and no one has noticed, so this
2540 // should be OK for now.
2541 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2542 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2543 }
2544
2545 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2546 // the RET instruction and handled by the FP Stackifier.
2547 if (VA.getLocReg() == X86::FP0 ||
2548 VA.getLocReg() == X86::FP1) {
2549 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2550 // change the value to the FP stack register class.
2551 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2552 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2553 RetOps.push_back(ValToCopy);
2554 // Don't emit a copytoreg.
2555 continue;
2556 }
2557
2558 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2559 // which is returned in RAX / RDX.
2560 if (Subtarget.is64Bit()) {
2561 if (ValVT == MVT::x86mmx) {
2562 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2563 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2564 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2565 ValToCopy);
2566 // If we don't have SSE2 available, convert to v4f32 so the generated
2567 // register is legal.
2568 if (!Subtarget.hasSSE2())
2569 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2570 }
2571 }
2572 }
2573
2574 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2575
2576 if (VA.needsCustom()) {
2577 assert(VA.getValVT() == MVT::v64i1 &&
2578 "Currently the only custom case is when we split v64i1 to 2 regs");
2579
2580 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RegsToPass, VA, RVLocs[++I],
2581 Subtarget);
2582
2583 assert(2 == RegsToPass.size() &&
2584 "Expecting two registers after Pass64BitArgInRegs");
2585
2586 // Add the second register to the CalleeSaveDisableRegs list.
2587 if (ShouldDisableCalleeSavedRegister)
2588 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2589 } else {
2590 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2591 }
2592
2593 // Add nodes to the DAG and add the values into the RetOps list
2594 for (auto &Reg : RegsToPass) {
2595 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2596 Flag = Chain.getValue(1);
2597 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2598 }
2599 }
2600
2601 // Swift calling convention does not require we copy the sret argument
2602 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2603
2604 // All x86 ABIs require that for returning structs by value we copy
2605 // the sret argument into %rax/%eax (depending on ABI) for the return.
2606 // We saved the argument into a virtual register in the entry block,
2607 // so now we copy the value out and into %rax/%eax.
2608 //
2609 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2610 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2611 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2612 // either case FuncInfo->setSRetReturnReg() will have been called.
2613 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2614 // When we have both sret and another return value, we should use the
2615 // original Chain stored in RetOps[0], instead of the current Chain updated
2616 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2617
2618 // For the case of sret and another return value, we have
2619 // Chain_0 at the function entry
2620 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2621 // If we use Chain_1 in getCopyFromReg, we will have
2622 // Val = getCopyFromReg(Chain_1)
2623 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2624
2625 // getCopyToReg(Chain_0) will be glued together with
2626 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2627 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2628 // Data dependency from Unit B to Unit A due to usage of Val in
2629 // getCopyToReg(Chain_1, Val)
2630 // Chain dependency from Unit A to Unit B
2631
2632 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2633 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2634 getPointerTy(MF.getDataLayout()));
2635
2636 unsigned RetValReg
2637 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2638 X86::RAX : X86::EAX;
2639 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2640 Flag = Chain.getValue(1);
2641
2642 // RAX/EAX now acts like a return value.
2643 RetOps.push_back(
2644 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2645
2646 // Add the returned register to the CalleeSaveDisableRegs list.
2647 if (ShouldDisableCalleeSavedRegister)
2648 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2649 }
2650
2651 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2652 const MCPhysReg *I =
2653 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2654 if (I) {
2655 for (; *I; ++I) {
2656 if (X86::GR64RegClass.contains(*I))
2657 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2658 else
2659 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2660 }
2661 }
2662
2663 RetOps[0] = Chain; // Update chain.
2664
2665 // Add the flag if we have it.
2666 if (Flag.getNode())
2667 RetOps.push_back(Flag);
2668
2669 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2670 if (CallConv == CallingConv::X86_INTR)
2671 opcode = X86ISD::IRET;
2672 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2673}
2674
2675bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2676 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2677 return false;
2678
2679 SDValue TCChain = Chain;
2680 SDNode *Copy = *N->use_begin();
2681 if (Copy->getOpcode() == ISD::CopyToReg) {
2682 // If the copy has a glue operand, we conservatively assume it isn't safe to
2683 // perform a tail call.
2684 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2685 return false;
2686 TCChain = Copy->getOperand(0);
2687 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2688 return false;
2689
2690 bool HasRet = false;
2691 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2692 UI != UE; ++UI) {
2693 if (UI->getOpcode() != X86ISD::RET_FLAG)
2694 return false;
2695 // If we are returning more than one value, we can definitely
2696 // not make a tail call; see PR19530.
2697 if (UI->getNumOperands() > 4)
2698 return false;
2699 if (UI->getNumOperands() == 4 &&
2700 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2701 return false;
2702 HasRet = true;
2703 }
2704
2705 if (!HasRet)
2706 return false;
2707
2708 Chain = TCChain;
2709 return true;
2710}
2711
2712EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2713 ISD::NodeType ExtendKind) const {
2714 MVT ReturnMVT = MVT::i32;
2715
2716 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2717 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2718 // The ABI does not require i1, i8 or i16 to be extended.
2719 //
2720 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2721 // always extending i8/i16 return values, so keep doing that for now.
2722 // (PR26665).
2723 ReturnMVT = MVT::i8;
2724 }
2725
2726 EVT MinVT = getRegisterType(Context, ReturnMVT);
2727 return VT.bitsLT(MinVT) ? MinVT : VT;
2728}
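// A hedged example of the rule above (illustrative values only): with
// ReturnMVT == i8, an i1 return value satisfies bitsLT(i8) and is widened to
// i8, while an i32 return value already meets the minimum and stays i32.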
2729
2730/// Reads two 32 bit registers and creates a 64 bit mask value.
2731/// \param VA The current 32 bit value that needs to be assigned.
2732/// \param NextVA The next 32 bit value that needs to be assigned.
2733/// \param Root The parent DAG node.
2734/// \param [in,out] InFlag Represents SDValue in the parent DAG node for
2735/// glue purposes. In case the DAG is already using a
2736/// physical register instead of a virtual one, we should glue
2737/// our new SDValue to the InFlag SDValue.
2738/// \return A new SDValue of size 64 bits.
2739static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2740 SDValue &Root, SelectionDAG &DAG,
2741 const SDLoc &Dl, const X86Subtarget &Subtarget,
2742 SDValue *InFlag = nullptr) {
2743 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2744 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2745 assert(VA.getValVT() == MVT::v64i1 &&
2746 "Expecting first location of 64 bit width type");
2747 assert(NextVA.getValVT() == VA.getValVT() &&
2748 "The locations should have the same type");
2749 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2750 "The values should reside in two registers");
2751
2752 SDValue Lo, Hi;
2753 SDValue ArgValueLo, ArgValueHi;
2754
2755 MachineFunction &MF = DAG.getMachineFunction();
2756 const TargetRegisterClass *RC = &X86::GR32RegClass;
2757
2758 // Read a 32 bit value from the registers.
2759 if (nullptr == InFlag) {
2760 // When no physical register is present,
2761 // create an intermediate virtual register.
2762 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2763 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2764 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2765 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2766 } else {
2767 // When a physical register is available read the value from it and glue
2768 // the reads together.
2769 ArgValueLo =
2770 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2771 *InFlag = ArgValueLo.getValue(2);
2772 ArgValueHi =
2773 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2774 *InFlag = ArgValueHi.getValue(2);
2775 }
2776
2777 // Convert the i32 type into v32i1 type.
2778 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2779
2780 // Convert the i32 type into v32i1 type.
2781 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2782
2783 // Concatenate the two values together.
2784 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2785}
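// For contrast, a minimal scalar sketch (illustrative only, not LLVM API) of
// what getv64i1Argument reconstructs: the two 32-bit register reads are
// combined back into one 64-bit mask, low half first:
//   uint32_t Lo = ...;  // value read from VA.getLocReg()
//   uint32_t Hi = ...;  // value read from NextVA.getLocReg()
//   uint64_t Mask = (static_cast<uint64_t>(Hi) << 32) | Lo;  // ~ CONCAT_VECTORS of two v32i1 halves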
2786
2787/// The function will lower a register of various sizes (8/16/32/64)
2788/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2789/// \returns a DAG node containing the operand after lowering to mask type.
2790static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2791 const EVT &ValLoc, const SDLoc &Dl,
2792 SelectionDAG &DAG) {
2793 SDValue ValReturned = ValArg;
2794
2795 if (ValVT == MVT::v1i1)
2796 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2797
2798 if (ValVT == MVT::v64i1) {
2799 // On a 32 bit machine, this case is handled by getv64i1Argument.
2800 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2801 // On a 64 bit machine, there is no need to truncate the value, only bitcast it.
2802 } else {
2803 MVT maskLen;
2804 switch (ValVT.getSimpleVT().SimpleTy) {
2805 case MVT::v8i1:
2806 maskLen = MVT::i8;
2807 break;
2808 case MVT::v16i1:
2809 maskLen = MVT::i16;
2810 break;
2811 case MVT::v32i1:
2812 maskLen = MVT::i32;
2813 break;
2814 default:
2815 llvm_unreachable("Expecting a vector of i1 types");
2816 }
2817
2818 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2819 }
2820 return DAG.getBitcast(ValVT, ValReturned);
2821}
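// Illustrative mapping for one case of lowerRegToMasks (a sketch, not part of
// this file): on a little-endian x86 target, an i32 location holding a v16i1
// value is first truncated to i16 and then bitcast, so only the low 16 bits
// survive and bit 0 becomes element 0:
//   i32 0x0000ABCD --TRUNCATE--> i16 0xABCD
//   i16 0xABCD --bitcast--> v16i1 <1,0,1,1, 0,0,1,1, 1,1,0,1, 0,1,0,1>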
2822
2823/// Lower the result values of a call into the
2824/// appropriate copies out of the corresponding physical registers.
2825///
2826SDValue X86TargetLowering::LowerCallResult(
2827 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2828 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2829 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2830 uint32_t *RegMask) const {
2831
2832 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2833 // Assign locations to each value returned by this call.
2834 SmallVector<CCValAssign, 16> RVLocs;
2835 bool Is64Bit = Subtarget.is64Bit();
2836 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2837 *DAG.getContext());
2838 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2839
2840 // Copy all of the result registers out of their specified physreg.
2841 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2842 ++I, ++InsIndex) {
2843 CCValAssign &VA = RVLocs[I];
2844 EVT CopyVT = VA.getLocVT();
2845
2846 // In some calling conventions we need to remove the used registers
2847 // from the register mask.
2848 if (RegMask) {
2849 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2850 SubRegs.isValid(); ++SubRegs)
2851 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2852 }
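// Worked example of the bit clearing above (the register number is
// illustrative only): for a physical register whose encoding is 37, the mask
// word is RegMask[37 / 32] == RegMask[1] and the cleared bit is 37 % 32 == 5,
// i.e. RegMask[1] &= ~(1u << 5).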
2853
2854 // If this is x86-64, and we disabled SSE, we can't return FP values
2855 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2856 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2857 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2858 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2859 } else if (CopyVT == MVT::f64 &&
2860 (Is64Bit && !Subtarget.hasSSE2())) {
2861 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2862 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2863 }
2864
2865 // If we prefer to use the value in xmm registers, copy it out as f80 and
2866 // use a truncate to move it from fp stack reg to xmm reg.
2867 bool RoundAfterCopy = false;
2868 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2869 isScalarFPTypeInSSEReg(VA.getValVT())) {
2870 if (!Subtarget.hasX87())
2871 report_fatal_error("X87 register return with X87 disabled");
2872 CopyVT = MVT::f80;
2873 RoundAfterCopy = (CopyVT != VA.getLocVT());
2874 }
2875
2876 SDValue Val;
2877 if (VA.needsCustom()) {
2878 assert(VA.getValVT() == MVT::v64i1 &&
2879 "Currently the only custom case is when we split v64i1 to 2 regs");
2880 Val =
2881 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2882 } else {
2883 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2884 .getValue(1);
2885 Val = Chain.getValue(0);
2886 InFlag = Chain.getValue(2);
2887 }
2888
2889 if (RoundAfterCopy)
2890 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2891 // This truncation won't change the value.
2892 DAG.getIntPtrConstant(1, dl));
2893
2894 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2895 if (VA.getValVT().isVector() &&
2896 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2897 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2898 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2899 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2900 } else
2901 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2902 }
2903
2904 InVals.push_back(Val);
2905 }
2906
2907 return Chain;
2908}
2909
2910//===----------------------------------------------------------------------===//
2911// C & StdCall & Fast Calling Convention implementation
2912//===----------------------------------------------------------------------===//
2913// The StdCall calling convention seems to be standard for many Windows API
2914// routines. It differs from the C calling convention just a little: the
2915// callee should clean up the stack, not the caller. Symbols should also be
2916// decorated in some fancy way :) It doesn't support any vector arguments.
2917// For info on the fast calling convention, see the Fast Calling Convention
2918// (tail call) implementation in LowerX86_32FastCCCallTo.
2919
2920/// CallIsStructReturn - Determines whether a call uses struct return
2921/// semantics.
2922enum StructReturnType {
2923 NotStructReturn,
2924 RegStructReturn,
2925 StackStructReturn
2926};
2927static StructReturnType
2928callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
2929 if (Outs.empty())
2930 return NotStructReturn;
2931
2932 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2933 if (!Flags.isSRet())
2934 return NotStructReturn;
2935 if (Flags.isInReg() || IsMCU)
2936 return RegStructReturn;
2937 return StackStructReturn;
2938}
2939
2940/// Determines whether a function uses struct return semantics.
2941static StructReturnType
2942argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
2943 if (Ins.empty())
2944 return NotStructReturn;
2945
2946 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2947 if (!Flags.isSRet())
2948 return NotStructReturn;
2949 if (Flags.isInReg() || IsMCU)
2950 return RegStructReturn;
2951 return StackStructReturn;
2952}
2953
2954/// Make a copy of an aggregate at address specified by "Src" to address
2955/// "Dst" with size and alignment information specified by the specific
2956/// parameter attribute. The copy will be passed as a byval function parameter.
2957static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2958 SDValue Chain, ISD::ArgFlagsTy Flags,
2959 SelectionDAG &DAG, const SDLoc &dl) {
2960 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2961
2962 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2963 /*isVolatile*/false, /*AlwaysInline=*/true,
2964 /*isTailCall*/false,
2965 MachinePointerInfo(), MachinePointerInfo());
2966}
2967
2968/// Return true if the calling convention is one that we can guarantee TCO for.
2969static bool canGuaranteeTCO(CallingConv::ID CC) {
2970 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2971 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2972 CC == CallingConv::HHVM || CC == CallingConv::Tail);
2973}
2974
2975/// Return true if we might ever do TCO for calls with this calling convention.
2976static bool mayTailCallThisCC(CallingConv::ID CC) {
2977 switch (CC) {
2978 // C calling conventions:
2979 case CallingConv::C:
2980 case CallingConv::Win64:
2981 case CallingConv::X86_64_SysV:
2982 // Callee pop conventions:
2983 case CallingConv::X86_ThisCall:
2984 case CallingConv::X86_StdCall:
2985 case CallingConv::X86_VectorCall:
2986 case CallingConv::X86_FastCall:
2987 // Swift:
2988 case CallingConv::Swift:
2989 return true;
2990 default:
2991 return canGuaranteeTCO(CC);
2992 }
2993}
2994
2995/// Return true if the function is being made into a tailcall target by
2996/// changing its ABI.
2997static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2998 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail;
2999}
3000
3001bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3002 auto Attr =
3003 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
3004 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
3005 return false;
3006
3007 ImmutableCallSite CS(CI);
3008 CallingConv::ID CalleeCC = CS.getCallingConv();
3009 if (!mayTailCallThisCC(CalleeCC))
3010 return false;
3011
3012 return true;
3013}
3014
3015SDValue
3016X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3017 const SmallVectorImpl<ISD::InputArg> &Ins,
3018 const SDLoc &dl, SelectionDAG &DAG,
3019 const CCValAssign &VA,
3020 MachineFrameInfo &MFI, unsigned i) const {
3021 // Create the nodes corresponding to a load from this parameter slot.
3022 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3023 bool AlwaysUseMutable = shouldGuaranteeTCO(
3024 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3025 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3026 EVT ValVT;
3027 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3028
3029 // If value is passed by pointer we have address passed instead of the value
3030 // itself. No need to extend if the mask value and location share the same
3031 // absolute size.
3032 bool ExtendedInMem =
3033 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3034 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3035
3036 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3037 ValVT = VA.getLocVT();
3038 else
3039 ValVT = VA.getValVT();
3040
3041 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3042 // changed with more analysis.
3043 // In case of tail call optimization, mark all arguments mutable, since they
3044 // could be overwritten by the lowering of arguments in case of a tail call.
3045 if (Flags.isByVal()) {
3046 unsigned Bytes = Flags.getByValSize();
3047 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3048
3049 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3050 // can be improved with deeper analysis.
3051 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3052 /*isAliased=*/true);
3053 return DAG.getFrameIndex(FI, PtrVT);
3054 }
3055
3056 // This is an argument in memory. We might be able to perform copy elision.
3057 // If the argument is passed directly in memory without any extension, then we
3058 // can perform copy elision. Large vector types, for example, may be passed
3059 // indirectly by pointer.
3060 if (Flags.isCopyElisionCandidate() &&
3061 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) {
3062 EVT ArgVT = Ins[i].ArgVT;
3063 SDValue PartAddr;
3064 if (Ins[i].PartOffset == 0) {
3065 // If this is a one-part value or the first part of a multi-part value,
3066 // create a stack object for the entire argument value type and return a
3067 // load from our portion of it. This assumes that if the first part of an
3068 // argument is in memory, the rest will also be in memory.
3069 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3070 /*IsImmutable=*/false);
3071 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3072 return DAG.getLoad(
3073 ValVT, dl, Chain, PartAddr,
3074 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3075 } else {
3076 // This is not the first piece of an argument in memory. See if there is
3077 // already a fixed stack object including this offset. If so, assume it
3078 // was created by the PartOffset == 0 branch above and create a load from
3079 // the appropriate offset into it.
3080 int64_t PartBegin = VA.getLocMemOffset();
3081 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3082 int FI = MFI.getObjectIndexBegin();
3083 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3084 int64_t ObjBegin = MFI.getObjectOffset(FI);
3085 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3086 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3087 break;
3088 }
3089 if (MFI.isFixedObjectIndex(FI)) {
3090 SDValue Addr =
3091 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3092 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3093 return DAG.getLoad(
3094 ValVT, dl, Chain, Addr,
3095 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3096 Ins[i].PartOffset));
3097 }
3098 }
3099 }
3100
3101 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3102 VA.getLocMemOffset(), isImmutable);
3103
3104 // Set SExt or ZExt flag.
3105 if (VA.getLocInfo() == CCValAssign::ZExt) {
3106 MFI.setObjectZExt(FI, true);
3107 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3108 MFI.setObjectSExt(FI, true);
3109 }
3110
3111 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3112 SDValue Val = DAG.getLoad(
3113 ValVT, dl, Chain, FIN,
3114 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3115 return ExtendedInMem
3116 ? (VA.getValVT().isVector()
3117 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3118 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3119 : Val;
3120}
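// A hedged numeric example of the fixed-object search above (offsets are
// illustrative only): if the second part of an argument has PartBegin == 8 and
// PartEnd == 16, a previously created fixed object with ObjBegin == 0 and
// ObjEnd == 16 satisfies ObjBegin <= PartBegin && PartEnd <= ObjEnd, so that
// part is loaded from the existing object at offset Ins[i].PartOffset instead
// of creating a new stack slot.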
3121
3122// FIXME: Get this from tablegen.
3123static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3124 const X86Subtarget &Subtarget) {
3125 assert(Subtarget.is64Bit());
3126
3127 if (Subtarget.isCallingConvWin64(CallConv)) {
3128 static const MCPhysReg GPR64ArgRegsWin64[] = {
3129 X86::RCX, X86::RDX, X86::R8, X86::R9
3130 };
3131 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3132 }
3133
3134 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3135 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3136 };
3137 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3138}
3139
3140// FIXME: Get this from tablegen.
3141static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3142 CallingConv::ID CallConv,
3143 const X86Subtarget &Subtarget) {
3144 assert(Subtarget.is64Bit());
3145 if (Subtarget.isCallingConvWin64(CallConv)) {
3146 // The XMM registers which might contain var arg parameters are shadowed
3147 // in their paired GPR. So we only need to save the GPR to their home
3148 // slots.
3149 // TODO: __vectorcall will change this.
3150 return None;
3151 }
3152
3153 const Function &F = MF.getFunction();
3154 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
3155 bool isSoftFloat = Subtarget.useSoftFloat();
3156 assert(!(isSoftFloat && NoImplicitFloatOps) &&
3157 "SSE register cannot be used when SSE is disabled!");
3158 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
3159 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3160 // registers.
3161 return None;
3162
3163 static const MCPhysReg XMMArgRegs64Bit[] = {
3164 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3165 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3166 };
3167 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3168}
3169
3170#ifndef NDEBUG
3171static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3172 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
3173 [](const CCValAssign &A, const CCValAssign &B) -> bool {
3174 return A.getValNo() < B.getValNo();
3175 });
3176}
3177#endif
3178
3179SDValue X86TargetLowering::LowerFormalArguments(
3180 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3181 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3182 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3183 MachineFunction &MF = DAG.getMachineFunction();
3184 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3185 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3186
3187 const Function &F = MF.getFunction();
3188 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3189 F.getName() == "main")
3190 FuncInfo->setForceFramePointer(true);
3191
3192 MachineFrameInfo &MFI = MF.getFrameInfo();
3193 bool Is64Bit = Subtarget.is64Bit();
3194 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3195
3196 assert(
3197 !(isVarArg && canGuaranteeTCO(CallConv)) &&
3198 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3199
3200 // Assign locations to all of the incoming arguments.
3201 SmallVector<CCValAssign, 16> ArgLocs;
3202 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3203
3204 // Allocate shadow area for Win64.
3205 if (IsWin64)
3206 CCInfo.AllocateStack(32, 8);
3207
3208 CCInfo.AnalyzeArguments(Ins, CC_X86);
3209
3210 // In vectorcall calling convention a second pass is required for the HVA
3211 // types.
3212 if (CallingConv::X86_VectorCall == CallConv) {
3213 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3214 }
3215
3216 // The next loop assumes that the locations are in the same order as the
3217 // input arguments.
3218 assert(isSortedByValueNo(ArgLocs) &&
3219 "Argument Location list must be sorted before lowering");
3220
3221 SDValue ArgValue;
3222 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3223 ++I, ++InsIndex) {
3224 assert(InsIndex < Ins.size() && "Invalid Ins index");
3225 CCValAssign &VA = ArgLocs[I];
3226
3227 if (VA.isRegLoc()) {
3228 EVT RegVT = VA.getLocVT();
3229 if (VA.needsCustom()) {
3230 assert(
3231 VA.getValVT() == MVT::v64i1 &&
3232 "Currently the only custom case is when we split v64i1 to 2 regs");
3233
3234 // In the regcall calling convention, v64i1 values compiled for a
3235 // 32 bit arch are split up into two registers.
3236 ArgValue =
3237 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3238 } else {
3239 const TargetRegisterClass *RC;
3240 if (RegVT == MVT::i8)
3241 RC = &X86::GR8RegClass;
3242 else if (RegVT == MVT::i16)
3243 RC = &X86::GR16RegClass;
3244 else if (RegVT == MVT::i32)
3245 RC = &X86::GR32RegClass;
3246 else if (Is64Bit && RegVT == MVT::i64)
3247 RC = &X86::GR64RegClass;
3248 else if (RegVT == MVT::f32)
3249 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3250 else if (RegVT == MVT::f64)
3251 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3252 else if (RegVT == MVT::f80)
3253 RC = &X86::RFP80RegClass;
3254 else if (RegVT == MVT::f128)
3255 RC = &X86::VR128RegClass;
3256 else if (RegVT.is512BitVector())
3257 RC = &X86::VR512RegClass;
3258 else if (RegVT.is256BitVector())
3259 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3260 else if (RegVT.is128BitVector())
3261 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3262 else if (RegVT == MVT::x86mmx)
3263 RC = &X86::VR64RegClass;
3264 else if (RegVT == MVT::v1i1)
3265 RC = &X86::VK1RegClass;
3266 else if (RegVT == MVT::v8i1)
3267 RC = &X86::VK8RegClass;
3268 else if (RegVT == MVT::v16i1)
3269 RC = &X86::VK16RegClass;
3270 else if (RegVT == MVT::v32i1)
3271 RC = &X86::VK32RegClass;
3272 else if (RegVT == MVT::v64i1)
3273 RC = &X86::VK64RegClass;
3274 else
3275 llvm_unreachable("Unknown argument type!");
3276
3277 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3278 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3279 }
3280
3281 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3282 // bits. Insert an assert[sz]ext to capture this, then truncate to the
3283 // right size.
3284 if (VA.getLocInfo() == CCValAssign::SExt)
3285 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3286 DAG.getValueType(VA.getValVT()));
3287 else if (VA.getLocInfo() == CCValAssign::ZExt)
3288 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3289 DAG.getValueType(VA.getValVT()));
3290 else if (VA.getLocInfo() == CCValAssign::BCvt)
3291 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3292
3293 if (VA.isExtInLoc()) {
3294 // Handle MMX values passed in XMM regs.
3295 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3296 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3297 else if (VA.getValVT().isVector() &&
3298 VA.getValVT().getScalarType() == MVT::i1 &&
3299 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3300 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3301 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3302 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3303 } else
3304 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3305 }
3306 } else {
3307 assert(VA.isMemLoc());
3308 ArgValue =
3309 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3310 }
3311
3312 // If value is passed via pointer - do a load.
3313 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
3314 ArgValue =
3315 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3316
3317 InVals.push_back(ArgValue);
3318 }
3319
3320 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3321 // Swift calling convention does not require we copy the sret argument
3322 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3323 if (CallConv == CallingConv::Swift)
3324 continue;
3325
3326 // All x86 ABIs require that for returning structs by value we copy the
3327 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3328 // the argument into a virtual register so that we can access it from the
3329 // return points.
3330 if (Ins[I].Flags.isSRet()) {
3331 unsigned Reg = FuncInfo->getSRetReturnReg();
3332 if (!Reg) {
3333 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3334 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3335 FuncInfo->setSRetReturnReg(Reg);
3336 }
3337 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3338 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3339 break;
3340 }
3341 }
3342
3343 unsigned StackSize = CCInfo.getNextStackOffset();
3344 // Align stack specially for tail calls.
3345 if (shouldGuaranteeTCO(CallConv,
3346 MF.getTarget().Options.GuaranteedTailCallOpt))
3347 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3348
3349 // If the function takes variable number of arguments, make a frame index for
3350 // the start of the first vararg value... for expansion of llvm.va_start. We
3351 // can skip this if there are no va_start calls.
3352 if (MFI.hasVAStart() &&
3353 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3354 CallConv != CallingConv::X86_ThisCall))) {
3355 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3356 }
3357
3358 // Figure out if XMM registers are in use.
3359 assert(!(Subtarget.useSoftFloat() &&
3360 F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
3361 "SSE register cannot be used when SSE is disabled!");
3362
3363 // 64-bit calling conventions support varargs and register parameters, so we
3364 // have to do extra work to spill them in the prologue.
3365 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3366 // Find the first unallocated argument registers.
3367 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3368 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3369 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3370 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3371 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3372 "SSE register cannot be used when SSE is disabled!");
3373
3374 // Gather all the live in physical registers.
3375 SmallVector<SDValue, 6> LiveGPRs;
3376 SmallVector<SDValue, 8> LiveXMMRegs;
3377 SDValue ALVal;
3378 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3379 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3380 LiveGPRs.push_back(
3381 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3382 }
3383 if (!ArgXMMs.empty()) {
3384 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3385 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3386 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3387 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3388 LiveXMMRegs.push_back(
3389 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3390 }
3391 }
3392
3393 if (IsWin64) {
3394 // Get to the caller-allocated home save location. Add 8 to account
3395 // for the return address.
3396 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3397 FuncInfo->setRegSaveFrameIndex(
3398 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3399 // Fixup to set vararg frame on shadow area (4 x i64).
3400 if (NumIntRegs < 4)
3401 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3402 } else {
3403 // For X86-64, if there are vararg parameters that are passed via
3404 // registers, then we must store them to their spots on the stack so
3405 // they may be loaded by dereferencing the result of va_next.
3406 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3407 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3408 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3409 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3410 }
3411
3412 // Store the integer parameter registers.
3413 SmallVector<SDValue, 8> MemOps;
3414 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3415 getPointerTy(DAG.getDataLayout()));
3416 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3417 for (SDValue Val : LiveGPRs) {
3418 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3419 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3420 SDValue Store =
3421 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3422 MachinePointerInfo::getFixedStack(
3423 DAG.getMachineFunction(),
3424 FuncInfo->getRegSaveFrameIndex(), Offset));
3425 MemOps.push_back(Store);
3426 Offset += 8;
3427 }
3428
3429 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3430 // Now store the XMM (fp + vector) parameter registers.
3431 SmallVector<SDValue, 12> SaveXMMOps;
3432 SaveXMMOps.push_back(Chain);
3433 SaveXMMOps.push_back(ALVal);
3434 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3435 FuncInfo->getRegSaveFrameIndex(), dl));
3436 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3437 FuncInfo->getVarArgsFPOffset(), dl));
3438 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3439 LiveXMMRegs.end());
3440 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3441 MVT::Other, SaveXMMOps));
3442 }
3443
3444 if (!MemOps.empty())
3445 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3446 }
3447
3448 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3449 // Find the largest legal vector type.
3450 MVT VecVT = MVT::Other;
3451 // FIXME: Only some x86_32 calling conventions support AVX512.
3452 if (Subtarget.useAVX512Regs() &&
3453 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3454 CallConv == CallingConv::Intel_OCL_BI)))
3455 VecVT = MVT::v16f32;
3456 else if (Subtarget.hasAVX())
3457 VecVT = MVT::v8f32;
3458 else if (Subtarget.hasSSE2())
3459 VecVT = MVT::v4f32;
3460
3461 // We forward some GPRs and some vector types.
3462 SmallVector<MVT, 2> RegParmTypes;
3463 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3464 RegParmTypes.push_back(IntVT);
3465 if (VecVT != MVT::Other)
3466 RegParmTypes.push_back(VecVT);
3467
3468 // Compute the set of forwarded registers. The rest are scratch.
3469 SmallVectorImpl<ForwardedRegister> &Forwards =
3470 FuncInfo->getForwardedMustTailRegParms();
3471 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3472
3473 // Conservatively forward AL on x86_64, since it might be used for varargs.
3474 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3475 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3476 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3477 }
3478
3479 // Copy all forwards from physical to virtual registers.
3480 for (ForwardedRegister &FR : Forwards) {
3481 // FIXME: Can we use a less constrained schedule?
3482 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT);
3483 FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT));
3484 Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal);
3485 }
3486 }
3487
3488 // Some CCs need callee pop.
3489 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3490 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3491 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3492 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3493 // X86 interrupts must pop the error code (and the alignment padding) if
3494 // present.
3495 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3496 } else {
3497 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3498 // If this is an sret function, the return should pop the hidden pointer.
3499 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3500 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3501 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3502 FuncInfo->setBytesToPopOnReturn(4);
3503 }
3504
3505 if (!Is64Bit) {
3506 // RegSaveFrameIndex is X86-64 only.
3507 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3508 if (CallConv == CallingConv::X86_FastCall ||
3509 CallConv == CallingConv::X86_ThisCall)
3510 // fastcc functions can't have varargs.
3511 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3512 }
3513
3514 FuncInfo->setArgumentStackSize(StackSize);
3515
3516 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3517 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3518 if (Personality == EHPersonality::CoreCLR) {
3519 assert(Is64Bit);
3520 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3521 // that we'd prefer this slot be allocated towards the bottom of the frame
3522 // (i.e. near the stack pointer after allocating the frame). Every
3523 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3524 // offset from the bottom of this and each funclet's frame must be the
3525 // same, so the size of funclets' (mostly empty) frames is dictated by
3526 // how far this slot is from the bottom (since they allocate just enough
3527 // space to accommodate holding this slot at the correct offset).
3528 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3529 EHInfo->PSPSymFrameIdx = PSPSymFI;
3530 }
3531 }
3532
3533 if (CallConv == CallingConv::X86_RegCall ||
3534 F.hasFnAttribute("no_caller_saved_registers")) {
3535 MachineRegisterInfo &MRI = MF.getRegInfo();
3536 for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
3537 MRI.disableCalleeSavedRegister(Pair.first);
3538 }
3539
3540 return Chain;
3541}
3542
3543SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3544 SDValue Arg, const SDLoc &dl,
3545 SelectionDAG &DAG,
3546 const CCValAssign &VA,
3547 ISD::ArgFlagsTy Flags) const {
3548 unsigned LocMemOffset = VA.getLocMemOffset();
3549 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3550 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3551 StackPtr, PtrOff);
3552 if (Flags.isByVal())
3553 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3554
3555 return DAG.getStore(
3556 Chain, dl, Arg, PtrOff,
3557 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3558}
3559
3560/// Emit a load of the return address if tail call
3561/// optimization is performed and it is required.
3562SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3563 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3564 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3565 // Adjust the Return address stack slot.
3566 EVT VT = getPointerTy(DAG.getDataLayout());
3567 OutRetAddr = getReturnAddressFrameIndex(DAG);
3568
3569 // Load the "old" Return address.
3570 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3571 return SDValue(OutRetAddr.getNode(), 1);
3572}
3573
3574/// Emit a store of the return address if tail call
3575/// optimization is performed and it is required (FPDiff!=0).
3576static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3577 SDValue Chain, SDValue RetAddrFrIdx,
3578 EVT PtrVT, unsigned SlotSize,
3579 int FPDiff, const SDLoc &dl) {
3580 // Store the return address to the appropriate stack slot.
3581 if (!FPDiff) return Chain;
3582 // Calculate the new stack slot for the return address.
3583 int NewReturnAddrFI =
3584 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3585 false);
3586 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3587 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3588 MachinePointerInfo::getFixedStack(
3589 DAG.getMachineFunction(), NewReturnAddrFI));
3590 return Chain;
3591}
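The fixed offset used above, FPDiff - SlotSize, comes from FPDiff as computed in LowerCall (in the guaranteed-TCO case, roughly the stack-argument bytes the caller itself was entered with minus the bytes this call needs). A small worked sketch with hypothetical numbers, assuming a 64-bit target (SlotSize == 8); the values are illustrative, not taken from a real compilation:

  int FPDiff  = 16 - 32;                  // caller entered with 16 bytes of stack args,
                                          // callee needs 32  ->  FPDiff == -16
  int NewSlot = FPDiff - /*SlotSize*/ 8;  // fixed-object offset -24: the return address
                                          // is re-stored 16 bytes further down, making
                                          // room for the larger outgoing argument area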
3592
3593/// Returns a vector_shuffle mask for a movs{s|d}, movd
3594/// operation of the specified width.
3595static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3596 SDValue V2) {
3597 unsigned NumElems = VT.getVectorNumElements();
3598 SmallVector<int, 8> Mask;
3599 Mask.push_back(NumElems);
3600 for (unsigned i = 1; i != NumElems; ++i)
3601 Mask.push_back(i);
3602 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3603}
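As a concrete illustration of the mask getMOVL builds (shuffle indices of NumElems or more select from the second operand, V2), for a 4-element vector the loop produces:

  // NumElems == 4  ->  Mask = {4, 1, 2, 3}
  // lane 0 is taken from V2 and lanes 1..3 stay from V1 -- the MOVSS/MOVSD
  // pattern of inserting the low element of V2 into V1.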
3604
3605SDValue
3606X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3607 SmallVectorImpl<SDValue> &InVals) const {
3608 SelectionDAG &DAG = CLI.DAG;
3609 SDLoc &dl = CLI.DL;
3610 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3611 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3612 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3613 SDValue Chain = CLI.Chain;
3614 SDValue Callee = CLI.Callee;
3615 CallingConv::ID CallConv = CLI.CallConv;
3616 bool &isTailCall = CLI.IsTailCall;
3617 bool isVarArg = CLI.IsVarArg;
3618
3619 MachineFunction &MF = DAG.getMachineFunction();
3620 bool Is64Bit = Subtarget.is64Bit();
3621 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3622 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3623 bool IsSibcall = false;
3624 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
3625 CallConv == CallingConv::Tail;
3626 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3627 auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
3628 const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
3629 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3630 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3631 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3632 const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
3633 bool HasNoCfCheck =
3634 (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
3635 const Module *M = MF.getMMI().getModule();
3636 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
3637
3638 MachineFunction::CallSiteInfo CSInfo;
3639
3640 if (CallConv == CallingConv::X86_INTR)
3641 report_fatal_error("X86 interrupts may not be called directly");
3642
3643 if (Attr.getValueAsString() == "true")
3644 isTailCall = false;
3645
3646 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
3647 // If we are using a GOT, disable tail calls to external symbols with
3648 // default visibility. Tail calling such a symbol requires using a GOT
3649 // relocation, which forces early binding of the symbol. This breaks code
3650 // that requires lazy function symbol resolution. Using musttail or
3651 // GuaranteedTailCallOpt will override this.
3652 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3653 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3654 G->getGlobal()->hasDefaultVisibility()))
3655 isTailCall = false;
3656 }
3657
3658 bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
3659 if (IsMustTail) {
3660 // Force this to be a tail call. The verifier rules are enough to ensure
3661 // that we can lower this successfully without moving the return address
3662 // around.
3663 isTailCall = true;
3664 } else if (isTailCall) {
3665 // Check if it's really possible to do a tail call.
3666 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3667 isVarArg, SR != NotStructReturn,
3668 MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3669 Outs, OutVals, Ins, DAG);
3670
3671 // Sibcalls are automatically detected tailcalls which do not require
3672 // ABI changes.
3673 if (!IsGuaranteeTCO && isTailCall)
3674 IsSibcall = true;
3675
3676 if (isTailCall)
3677 ++NumTailCalls;
3678 }
3679
3680 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3681 "Var args not supported with calling convention fastcc, ghc or hipe");
3682
3683 // Analyze operands of the call, assigning locations to each operand.
3684 SmallVector<CCValAssign, 16> ArgLocs;
3685 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3686
3687 // Allocate shadow area for Win64.
3688 if (IsWin64)
3689 CCInfo.AllocateStack(32, 8);
3690
3691 CCInfo.AnalyzeArguments(Outs, CC_X86);
3692
3693 // In vectorcall calling convention a second pass is required for the HVA
3694 // types.
3695 if (CallingConv::X86_VectorCall == CallConv) {
3696 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3697 }
3698
3699 // Get a count of how many bytes are to be pushed on the stack.
3700 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3701 if (IsSibcall)
3702 // This is a sibcall. The memory operands are already available in the
3703 // stack frame of the caller's own caller.
3704 NumBytes = 0;
3705 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
3706 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3707
3708 int FPDiff = 0;
3709 if (isTailCall && !IsSibcall && !IsMustTail) {
3710 // Lower arguments at fp - stackoffset + fpdiff.
3711 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3712
3713 FPDiff = NumBytesCallerPushed - NumBytes;
3714
3715 // Set the delta of movement of the returnaddr stackslot.
3716 // But only update it if the new delta is below the one recorded so far.
3717 if (FPDiff < X86Info->getTCReturnAddrDelta())
3718 X86Info->setTCReturnAddrDelta(FPDiff);
3719 }
3720
3721 unsigned NumBytesToPush = NumBytes;
3722 unsigned NumBytesToPop = NumBytes;
3723
3724 // If we have an inalloca argument, all stack space has already been allocated
3725 // for us and is right at the top of the stack. We don't support multiple
3726 // arguments passed in memory when using inalloca.
3727 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3728 NumBytesToPush = 0;
3729 if (!ArgLocs.back().isMemLoc())
3730 report_fatal_error("cannot use inalloca attribute on a register "
3731 "parameter");
3732 if (ArgLocs.back().getLocMemOffset() != 0)
3733 report_fatal_error("any parameter with the inalloca attribute must be "
3734 "the only memory argument");
3735 }
3736
3737 if (!IsSibcall)
3738 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3739 NumBytes - NumBytesToPush, dl);
3740
3741 SDValue RetAddrFrIdx;
3742 // Load return address for tail calls.
3743 if (isTailCall && FPDiff)
3744 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3745 Is64Bit, FPDiff, dl);
3746
3747 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3748 SmallVector<SDValue, 8> MemOpChains;
3749 SDValue StackPtr;
3750
3751 // The next loop assumes that the locations are in the same order of the
3752 // input arguments.
3753 assert(isSortedByValueNo(ArgLocs) &&
3754 "Argument Location list must be sorted before lowering");
3755
3756 // Walk the register/memloc assignments, inserting copies/loads. In the case
3757 // of tail call optimization, arguments are handled later.
3758 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3759 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3760 ++I, ++OutIndex) {
3761 assert(OutIndex < Outs.size() && "Invalid Out index");
3762 // Skip inalloca arguments, they have already been written.
3763 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3764 if (Flags.isInAlloca())
3765 continue;
3766
3767 CCValAssign &VA = ArgLocs[I];
3768 EVT RegVT = VA.getLocVT();
3769 SDValue Arg = OutVals[OutIndex];
3770 bool isByVal = Flags.isByVal();
3771
3772 // Promote the value if needed.
3773 switch (VA.getLocInfo()) {
3774 default: llvm_unreachable("Unknown loc info!");
3775 case CCValAssign::Full: break;
3776 case CCValAssign::SExt:
3777 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3778 break;
3779 case CCValAssign::ZExt:
3780 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3781 break;
3782 case CCValAssign::AExt:
3783 if (Arg.getValueType().isVector() &&
3784 Arg.getValueType().getVectorElementType() == MVT::i1)
3785 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3786 else if (RegVT.is128BitVector()) {
3787 // Special case: passing MMX values in XMM registers.
3788 Arg = DAG.getBitcast(MVT::i64, Arg);
3789 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3790 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3791 } else
3792 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3793 break;
3794 case CCValAssign::BCvt:
3795 Arg = DAG.getBitcast(RegVT, Arg);
3796 break;
3797 case CCValAssign::Indirect: {
3798 if (isByVal) {
3799 // Memcpy the argument to a temporary stack slot to prevent
3800 // the caller from seeing any modifications the callee may make
3801 // as guaranteed by the `byval` attribute.
3802 int FrameIdx = MF.getFrameInfo().CreateStackObject(
3803 Flags.getByValSize(), std::max(16, (int)Flags.getByValAlign()),
3804 false);
3805 SDValue StackSlot =
3806 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
3807 Chain =
3808 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
3809 // From now on treat this as a regular pointer
3810 Arg = StackSlot;
3811 isByVal = false;
3812 } else {
3813 // Store the argument.
3814 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3815 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3816 Chain = DAG.getStore(
3817 Chain, dl, Arg, SpillSlot,
3818 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3819 Arg = SpillSlot;
3820 }
3821 break;
3822 }
3823 }
3824
3825 if (VA.needsCustom()) {
3826 assert(VA.getValVT() == MVT::v64i1 &&
3827 "Currently the only custom case is when we split v64i1 to 2 regs");
3828 // Split v64i1 value into two registers
3829 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
3830 } else if (VA.isRegLoc()) {
3831 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3832 const TargetOptions &Options = DAG.getTarget().Options;
3833 if (Options.EnableDebugEntryValues)
3834 CSInfo.emplace_back(VA.getLocReg(), I);
3835 if (isVarArg && IsWin64) {
3836 // Win64 ABI requires argument XMM reg to be copied to the corresponding
3837 // shadow reg if callee is a varargs function.
3838 unsigned ShadowReg = 0;
3839 switch (VA.getLocReg()) {
3840 case X86::XMM0: ShadowReg = X86::RCX; break;
3841 case X86::XMM1: ShadowReg = X86::RDX; break;
3842 case X86::XMM2: ShadowReg = X86::R8; break;
3843 case X86::XMM3: ShadowReg = X86::R9; break;
3844 }
3845 if (ShadowReg)
3846 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3847 }
3848 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3849 assert(VA.isMemLoc());
3850 if (!StackPtr.getNode())
3851 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3852 getPointerTy(DAG.getDataLayout()));
3853 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3854 dl, DAG, VA, Flags));
3855 }
3856 }
3857
3858 if (!MemOpChains.empty())
3859 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3860
3861 if (Subtarget.isPICStyleGOT()) {
3862 // ELF / PIC requires the GOT pointer to be in the EBX register before
3863 // function calls made via the PLT.
3864 if (!isTailCall) {
3865 RegsToPass.push_back(std::make_pair(
3866 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3867 getPointerTy(DAG.getDataLayout()))));
3868 } else {
3869 // If we are tail calling and generating PIC/GOT style code load the
3870 // address of the callee into ECX. The value in ecx is used as target of
3871 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3872 // for tail calls on PIC/GOT architectures. Normally we would just put the
3873 // address of GOT into ebx and then call target@PLT. But for tail calls
3874 // ebx would be restored (since ebx is callee saved) before jumping to the
3875 // target@PLT.
3876
3877 // Note: The actual moving to ECX is done further down.
3878 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3879 if (G && !G->getGlobal()->hasLocalLinkage() &&
3880 G->getGlobal()->hasDefaultVisibility())
3881 Callee = LowerGlobalAddress(Callee, DAG);
3882 else if (isa<ExternalSymbolSDNode>(Callee))
3883 Callee = LowerExternalSymbol(Callee, DAG);
3884 }
3885 }
3886
3887 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3888 // From AMD64 ABI document:
3889 // For calls that may call functions that use varargs or stdargs
3890 // (prototype-less calls or calls to functions containing ellipsis (...) in
3891 // the declaration) %al is used as a hidden argument to specify the number
3892 // of SSE registers used. The contents of %al do not need to match exactly
3893 // the number of registers, but must be an upper bound on the number of SSE
3894 // registers used, and must be in the range 0 - 8 inclusive.
3895
3896 // Count the number of XMM registers allocated.
3897 static const MCPhysReg XMMArgRegs[] = {
3898 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3899 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3900 };
3901 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3902 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3903 && "SSE registers cannot be used when SSE is disabled");
3904
3905 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3906 DAG.getConstant(NumXMMRegs, dl,
3907 MVT::i8)));
3908 }
3909
3910 if (isVarArg && IsMustTail) {
3911 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3912 for (const auto &F : Forwards) {
3913 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3914 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3915 }
3916 }
3917
3918 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3919 // don't need this because the eligibility check rejects calls that require
3920 // shuffling arguments passed in memory.
3921 if (!IsSibcall && isTailCall) {
3922 // Force all the incoming stack arguments to be loaded from the stack
3923 // before any new outgoing arguments are stored to the stack, because the
3924 // outgoing stack slots may alias the incoming argument stack slots, and
3925 // the alias isn't otherwise explicit. This is slightly more conservative
3926 // than necessary, because it means that each store effectively depends
3927 // on every argument instead of just those arguments it would clobber.
3928 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3929
3930 SmallVector<SDValue, 8> MemOpChains2;
3931 SDValue FIN;
3932 int FI = 0;
3933 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3934 ++I, ++OutsIndex) {
3935 CCValAssign &VA = ArgLocs[I];
3936
3937 if (VA.isRegLoc()) {
3938 if (VA.needsCustom()) {
3939 assert((CallConv == CallingConv::X86_RegCall) &&
3940 "Expecting custom case only in regcall calling convention");
3941 // This means that we are in special case where one argument was
3942 // passed through two register locations - Skip the next location
3943 ++I;
3944 }
3945
3946 continue;
3947 }
3948
3949 assert(VA.isMemLoc());
3950 SDValue Arg = OutVals[OutsIndex];
3951 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3952 // Skip inalloca arguments. They don't require any work.
3953 if (Flags.isInAlloca())
3954 continue;
3955 // Create frame index.
3956 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3957 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3958 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3959 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3960
3961 if (Flags.isByVal()) {
3962 // Copy relative to framepointer.
3963 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3964 if (!StackPtr.getNode())
3965 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3966 getPointerTy(DAG.getDataLayout()));
3967 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3968 StackPtr, Source);
3969
3970 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3971 ArgChain,
3972 Flags, DAG, dl));
3973 } else {
3974 // Store relative to framepointer.
3975 MemOpChains2.push_back(DAG.getStore(
3976 ArgChain, dl, Arg, FIN,
3977 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3978 }
3979 }
3980
3981 if (!MemOpChains2.empty())
3982 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3983
3984 // Store the return address to the appropriate stack slot.
3985 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3986 getPointerTy(DAG.getDataLayout()),
3987 RegInfo->getSlotSize(), FPDiff, dl);
3988 }
3989
3990 // Build a sequence of copy-to-reg nodes chained together with token chain
3991 // and flag operands which copy the outgoing args into registers.
3992 SDValue InFlag;
3993 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3994 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3995 RegsToPass[i].second, InFlag);
3996 InFlag = Chain.getValue(1);
3997 }
3998
3999 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4000 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4001 // In the 64-bit large code model, we have to make all calls
4002 // through a register, since the call instruction's 32-bit
4003 // pc-relative offset may not be large enough to hold the whole
4004 // address.
4005 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4006 Callee->getOpcode() == ISD::ExternalSymbol) {
4007 // Lower direct calls to global addresses and external symbols. Setting
4008 // ForCall to true here has the effect of removing WrapperRIP when possible
4009 // to allow direct calls to be selected without first materializing the
4010 // address into a register.
4011 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4012 } else if (Subtarget.isTarget64BitILP32() &&
4013 Callee->getValueType(0) == MVT::i32) {
4014 // Zero-extend the 32-bit Callee address to 64 bits, as required by the x32 ABI.
4015 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4016 }
4017
4018 // Returns a chain & a flag for retval copy to use.
4019 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4020 SmallVector<SDValue, 8> Ops;
4021
4022 if (!IsSibcall && isTailCall) {
4023 Chain = DAG.getCALLSEQ_END(Chain,
4024 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4025 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
4026 InFlag = Chain.getValue(1);
4027 }
4028
4029 Ops.push_back(Chain);
4030 Ops.push_back(Callee);
4031
4032 if (isTailCall)
4033 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
4034
4035 // Add argument registers to the end of the list so that they are known live
4036 // into the call.
4037 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4038 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4039 RegsToPass[i].second.getValueType()));
4040
4041 // Add a register mask operand representing the call-preserved registers.
4042 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
4043 // set X86_INTR calling convention because it has the same CSR mask
4044 // (same preserved registers).
4045 const uint32_t *Mask = RegInfo->getCallPreservedMask(
4046 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
4047 assert(Mask && "Missing call preserved mask for calling convention");
4048
4049 // If this is an invoke in a 32-bit function using a funclet-based
4050 // personality, assume the function clobbers all registers. If an exception
4051 // is thrown, the runtime will not restore CSRs.
4052 // FIXME: Model this more precisely so that we can register allocate across
4053 // the normal edge and spill and fill across the exceptional edge.
4054 if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
4055 const Function &CallerFn = MF.getFunction();
4056 EHPersonality Pers =
4057 CallerFn.hasPersonalityFn()
4058 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4059 : EHPersonality::Unknown;
4060 if (isFuncletEHPersonality(Pers))
4061 Mask = RegInfo->getNoPreservedMask();
4062 }
4063
4064 // Define a new register mask from the existing mask.
4065 uint32_t *RegMask = nullptr;
4066
4067 // In some calling conventions we need to remove the used physical registers
4068 // from the reg mask.
4069 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
4070 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4071
4072 // Allocate a new Reg Mask and copy Mask.
4073 RegMask = MF.allocateRegMask();
4074 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4075 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4076
4077 // Make sure all sub registers of the argument registers are reset
4078 // in the RegMask.
4079 for (auto const &RegPair : RegsToPass)
4080 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4081 SubRegs.isValid(); ++SubRegs)
4082 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4083
4084 // Create the RegMask Operand according to our updated mask.
4085 Ops.push_back(DAG.getRegisterMask(RegMask));
4086 } else {
4087 // Create the RegMask Operand according to the static mask.
4088 Ops.push_back(DAG.getRegisterMask(Mask));
4089 }
4090
4091 if (InFlag.getNode())
4092 Ops.push_back(InFlag);
4093
4094 if (isTailCall) {
4095 // We used to do:
4096 //// If this is the first return lowered for this function, add the regs
4097 //// to the liveout set for the function.
4098 // This isn't right, although it's probably harmless on x86; liveouts
4099 // should be computed from returns not tail calls. Consider a void
4100 // function making a tail call to a function returning int.
4101 MF.getFrameInfo().setHasTailCall();
4102 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4103 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4104 return Ret;
4105 }
4106
4107 if (HasNoCfCheck && IsCFProtectionSupported) {
4108 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4109 } else {
4110 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4111 }
4112 InFlag = Chain.getValue(1);
4113 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4114
4115 // Save heapallocsite metadata.
4116 if (CLI.CS)
4117 if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite"))
4118 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4119
4120 // Create the CALLSEQ_END node.
4121 unsigned NumBytesForCalleeToPop;
4122 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4123 DAG.getTarget().Options.GuaranteedTailCallOpt))
4124 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
4125 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
4126 !Subtarget.getTargetTriple().isOSMSVCRT() &&
4127 SR == StackStructReturn)
4128 // If this is a call to a struct-return function, the callee
4129 // pops the hidden struct pointer, so we have to push it back.
4130 // This is common for Darwin/X86, Linux & Mingw32 targets.
4131 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
4132 NumBytesForCalleeToPop = 4;
4133 else
4134 NumBytesForCalleeToPop = 0; // Callee pops nothing.
4135
4136 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
4137 // No need to reset the stack after the call if the call doesn't return. To
4138 // make the MI verifier happy, we'll pretend the callee does it for us.
4139 NumBytesForCalleeToPop = NumBytes;
4140 }
4141
4142 // Returns a flag for retval copy to use.
4143 if (!IsSibcall) {
4144 Chain = DAG.getCALLSEQ_END(Chain,
4145 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4146 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
4147 true),
4148 InFlag, dl);
4149 InFlag = Chain.getValue(1);
4150 }
4151
4152 // Handle result values, copying them out of physregs into vregs that we
4153 // return.
4154 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4155 InVals, RegMask);
4156}
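One detail in LowerCall above that is easy to miss is the register-mask update for RegCall / no_caller_saved_registers call sites: the mask is a packed bit vector in which register N counts as preserved when bit (N % 32) of 32-bit word (N / 32) is set, so clearing that bit marks the register (and each of its sub-registers) as clobbered by the call. A minimal standalone sketch of that bit manipulation, using a made-up register number rather than a real X86 encoding:

  #include <cstdint>

  // Clear the "preserved" bit for register Reg in a packed call-preserved mask.
  static void markClobbered(uint32_t *RegMask, unsigned Reg) {
    RegMask[Reg / 32] &= ~(1u << (Reg % 32));
  }

  // e.g. markClobbered(RegMask, 70) clears bit 6 of RegMask[2].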
4157
4158//===----------------------------------------------------------------------===//
4159// Fast Calling Convention (tail call) implementation
4160//===----------------------------------------------------------------------===//
4161
4162 // Like stdcall, the callee cleans up the arguments; the convention differs in
4163 // that ECX is reserved for storing the tail-called function's address. Only 2
4164 // registers are free for argument passing (inreg). Tail call optimization is performed
4165// provided:
4166// * tailcallopt is enabled
4167// * caller/callee are fastcc
4168// On X86_64 architecture with GOT-style position independent code only local
4169// (within module) calls are supported at the moment.
4170 // To keep the stack aligned according to the platform ABI, the function
4171 // GetAlignedArgumentStackSize ensures that the argument delta is always a
4172 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld, for example.)
4173 // If a tail-called callee has more arguments than the caller, the caller
4174 // needs to make sure that there is room to move the RETADDR to. This is
4175// achieved by reserving an area the size of the argument delta right after the
4176// original RETADDR, but before the saved framepointer or the spilled registers
4177// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
4178// stack layout:
4179// arg1
4180// arg2
4181// RETADDR
4182// [ new RETADDR
4183// move area ]
4184// (possible EBP)
4185// ESI
4186// EDI
4187// local1 ..
4188
4189/// Make the stack size aligned, e.g. to 16n + 12, to satisfy a 16-byte alignment
4190/// requirement.
4191unsigned
4192X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
4193 SelectionDAG &DAG) const {
4194 const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment());
4195 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
4196 assert(StackSize % SlotSize == 0 &&
4197 "StackSize must be a multiple of SlotSize");
4198 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
4199}
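Plugging numbers into the formula above makes the "16n + 12" doc comment concrete. A hedged worked example, assuming a 16-byte stack alignment (the usual case, though not the only possible one):

  // 32-bit, SlotSize == 4:  StackSize = 20  ->  alignTo(24, 16) - 4 = 28 = 16*1 + 12
  // 64-bit, SlotSize == 8:  StackSize = 40  ->  alignTo(48, 16) - 8 = 40 = 16*2 + 8
  // In both cases the result is congruent to -SlotSize modulo the alignment,
  // which is consistent with the stack being 16-byte aligned again once the
  // pushed return address is included.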
4200
4201/// Return true if the given stack call argument is already available in the
4202/// same position (relatively) of the caller's incoming argument stack.
4203static
4204bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4205 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4206 const X86InstrInfo *TII, const CCValAssign &VA) {
4207 unsigned Bytes = Arg.getValueSizeInBits() / 8;
4208
4209 for (;;) {
4210 // Look through nodes that don't alter the bits of the incoming value.
4211 unsigned Op = Arg.getOpcode();
4212 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4213 Arg = Arg.getOperand(0);
4214 continue;
4215 }
4216 if (Op == ISD::TRUNCATE) {
4217 const SDValue &TruncInput = Arg.getOperand(0);
4218 if (TruncInput.getOpcode() == ISD::AssertZext &&
4219 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4220 Arg.getValueType()) {
4221 Arg = TruncInput.getOperand(0);
4222 continue;
4223 }
4224 }
4225 break;
4226 }
4227
4228 int FI = INT_MAX;
4229 if (Arg.getOpcode() == ISD::CopyFromReg) {
4230 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4231 if (!Register::isVirtualRegister(VR))
4232 return false;
4233 MachineInstr *Def = MRI->getVRegDef(VR);
4234 if (!Def)
4235 return false;
4236 if (!Flags.isByVal()) {
4237 if (!TII->isLoadFromStackSlot(*Def, FI))
4238 return false;
4239 } else {
4240 unsigned Opcode = Def->getOpcode();
4241 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4242 Opcode == X86::LEA64_32r) &&
4243 Def->getOperand(1).isFI()) {
4244 FI = Def->getOperand(1).getIndex();
4245 Bytes = Flags.getByValSize();
4246 } else
4247 return false;
4248 }
4249 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4250 if (Flags.isByVal())
4251 // ByVal argument is passed in as a pointer but it's now being
4252 // dereferenced. e.g.
4253 // define @foo(%struct.X* %A) {
4254 // tail call @bar(%struct.X* byval %A)
4255 // }
4256 return false;
4257 SDValue Ptr = Ld->getBasePtr();
4258 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4259 if (!FINode)
4260 return false;
4261 FI = FINode->getIndex();
4262 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4263 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4264 FI = FINode->getIndex();
4265 Bytes = Flags.getByValSize();
4266 } else
4267 return false;
4268
4269 assert(FI != INT_MAX);
4270 if (!MFI.isFixedObjectIndex(FI))
4271 return false;
4272
4273 if (Offset != MFI.getObjectOffset(FI))
4274 return false;
4275
4276 // If this is not byval, check that the argument stack object is immutable.
4277 // inalloca and argument copy elision can create mutable argument stack
4278 // objects. Byval objects can be mutated, but a byval call intends to pass the
4279 // mutated memory.
4280 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4281 return false;
4282
4283 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4284 // If the argument location is wider than the argument type, check that any
4285 // extension flags match.
4286 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4287 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4288 return false;
4289 }
4290 }
4291
4292 return Bytes == MFI.getObjectSize(FI);
4293}
4294
4295/// Check whether the call is eligible for tail call optimization. Targets
4296/// that want to do tail call optimization should implement this function.
4297bool X86TargetLowering::IsEligibleForTailCallOptimization(
4298 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4299 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4300 const SmallVectorImpl<ISD::OutputArg> &Outs,
4301 const SmallVectorImpl<SDValue> &OutVals,
4302 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4303 if (!mayTailCallThisCC(CalleeCC))
4304 return false;
4305
4306 // If -tailcallopt is specified, make fastcc functions tail-callable.
4307 MachineFunction &MF = DAG.getMachineFunction();
4308 const Function &CallerF = MF.getFunction();
4309
4310 // If the function return type is x86_fp80 and the callee return type is not,
4311 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4312 // perform a tailcall optimization here.
4313 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4314 return false;
4315
4316 CallingConv::ID CallerCC = CallerF.getCallingConv();
4317 bool CCMatch = CallerCC == CalleeCC;
4318 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4319 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4320 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
4321 CalleeCC == CallingConv::Tail;
4322
4323 // Win64 functions have extra shadow space for argument homing. Don't do the
4324 // sibcall if the caller and callee have mismatched expectations for this
4325 // space.
4326 if (IsCalleeWin64 != IsCallerWin64)
4327 return false;
4328
4329 if (IsGuaranteeTCO) {
4330 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4331 return true;
4332 return false;
4333 }
4334
4335 // Look for obvious safe cases to perform tail call optimization that do not
4336 // require ABI changes. This is what gcc calls sibcall.
4337
4338 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4339 // emit a special epilogue.
4340 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4341 if (RegInfo->needsStackRealignment(MF))
4342 return false;
4343
4344 // Also avoid sibcall optimization if either caller or callee uses struct
4345 // return semantics.
4346 if (isCalleeStructRet || isCallerStructRet)
4347 return false;
4348
4349 // Do not sibcall optimize vararg calls unless all arguments are passed via
4350 // registers.
4351 LLVMContext &C = *DAG.getContext();
4352 if (isVarArg && !Outs.empty()) {
4353 // Optimizing for varargs on Win64 is unlikely to be safe without
4354 // additional testing.
4355 if (IsCalleeWin64 || IsCallerWin64)
4356 return false;
4357
4358 SmallVector<CCValAssign, 16> ArgLocs;
4359 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4360
4361 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4362 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4363 if (!ArgLocs[i].isRegLoc())
4364 return false;
4365 }
4366
4367 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4368 // stack. Therefore, if it's not used by the call it is not safe to optimize
4369 // this into a sibcall.
4370 bool Unused = false;
4371 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4372 if (!Ins[i].Used) {
4373 Unused = true;
4374 break;
4375 }
4376 }
4377 if (Unused) {
4378 SmallVector<CCValAssign, 16> RVLocs;
4379 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4380 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4381 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4382 CCValAssign &VA = RVLocs[i];
4383 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4384 return false;
4385 }
4386 }
4387
4388 // Check that the call results are passed in the same way.
4389 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4390 RetCC_X86, RetCC_X86))
4391 return false;
4392 // The callee has to preserve all registers the caller needs to preserve.
4393 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4394 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4395 if (!CCMatch) {
4396 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4397 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4398 return false;
4399 }
4400
4401 unsigned StackArgsSize = 0;
4402
4403 // If the callee takes no arguments then go on to check the results of the
4404 // call.
4405 if (!Outs.empty()) {
4406 // Check if stack adjustment is needed. For now, do not do this if any
4407 // argument is passed on the stack.
4408 SmallVector<CCValAssign, 16> ArgLocs;
4409 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4410
4411 // Allocate shadow area for Win64
4412 if (IsCalleeWin64)
4413 CCInfo.AllocateStack(32, 8);
4414
4415 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4416 StackArgsSize = CCInfo.getNextStackOffset();
4417
4418 if (CCInfo.getNextStackOffset()) {
4419 // Check if the arguments are already laid out in the right way as
4420 // the caller's fixed stack objects.
4421 MachineFrameInfo &MFI = MF.getFrameInfo();
4422 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4423 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4424 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4425 CCValAssign &VA = ArgLocs[i];
4426 SDValue Arg = OutVals[i];
4427 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4428 if (VA.getLocInfo() == CCValAssign::Indirect)
4429 return false;
4430 if (!VA.isRegLoc()) {
4431 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4432 MFI, MRI, TII, VA))
4433 return false;
4434 }
4435 }
4436 }
4437
4438 bool PositionIndependent = isPositionIndependent();
4439 // If the tailcall address may be in a register, then make sure it's
4440 // possible to register allocate for it. In 32-bit, the call address can
4441 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4442 // callee-saved registers are restored. These happen to be the same
4443 // registers used to pass 'inreg' arguments so watch out for those.
4444 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4445 !isa<ExternalSymbolSDNode>(Callee)) ||
4446 PositionIndependent)) {
4447 unsigned NumInRegs = 0;
4448 // In PIC we need an extra register to formulate the address computation
4449 // for the callee.
4450 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4451
4452 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4453 CCValAssign &VA = ArgLocs[i];
4454 if (!VA.isRegLoc())
4455 continue;
4456 Register Reg = VA.getLocReg();
4457 switch (Reg) {
4458 default: break;
4459 case X86::EAX: case X86::EDX: case X86::ECX:
4460 if (++NumInRegs == MaxInRegs)
4461 return false;
4462 break;
4463 }
4464 }
4465 }
4466
4467 const MachineRegisterInfo &MRI = MF.getRegInfo();
4468 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4469 return false;
4470 }
4471
4472 bool CalleeWillPop =
4473 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4474 MF.getTarget().Options.GuaranteedTailCallOpt);
4475
4476 if (unsigned BytesToPop =
4477 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4478 // If we have bytes to pop, the callee must pop them.
4479 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4480 if (!CalleePopMatches)
4481 return false;
4482 } else if (CalleeWillPop && StackArgsSize > 0) {
4483 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4484 return false;
4485 }
4486
4487 return true;
4488}
4489
4490FastISel *
4491X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4492 const TargetLibraryInfo *libInfo) const {
4493 return X86::createFastISel(funcInfo, libInfo);
4494}
4495
4496//===----------------------------------------------------------------------===//
4497// Other Lowering Hooks
4498//===----------------------------------------------------------------------===//
4499
4500static bool MayFoldLoad(SDValue Op) {
4501 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4502}
4503
4504static bool MayFoldIntoStore(SDValue Op) {
4505 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4506}
4507
4508static bool MayFoldIntoZeroExtend(SDValue Op) {
4509 if (Op.hasOneUse()) {
4510 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4511 return (ISD::ZERO_EXTEND == Opcode);
4512 }
4513 return false;
4514}
4515
4516static bool isTargetShuffle(unsigned Opcode) {
4517 switch(Opcode) {
4518 default: return false;
4519 case X86ISD::BLENDI:
4520 case X86ISD::PSHUFB:
4521 case X86ISD::PSHUFD:
4522 case X86ISD::PSHUFHW:
4523 case X86ISD::PSHUFLW:
4524 case X86ISD::SHUFP:
4525 case X86ISD::INSERTPS:
4526 case X86ISD::EXTRQI:
4527 case X86ISD::INSERTQI:
4528 case X86ISD::PALIGNR:
4529 case X86ISD::VSHLDQ:
4530 case X86ISD::VSRLDQ:
4531 case X86ISD::MOVLHPS:
4532 case X86ISD::MOVHLPS:
4533 case X86ISD::MOVSHDUP:
4534 case X86ISD::MOVSLDUP:
4535 case X86ISD::MOVDDUP:
4536 case X86ISD::MOVSS:
4537 case X86ISD::MOVSD:
4538 case X86ISD::UNPCKL:
4539 case X86ISD::UNPCKH:
4540 case X86ISD::VBROADCAST:
4541 case X86ISD::VPERMILPI:
4542 case X86ISD::VPERMILPV:
4543 case X86ISD::VPERM2X128:
4544 case X86ISD::SHUF128:
4545 case X86ISD::VPERMIL2:
4546 case X86ISD::VPERMI:
4547 case X86ISD::VPPERM:
4548 case X86ISD::VPERMV:
4549 case X86ISD::VPERMV3:
4550 case X86ISD::VZEXT_MOVL:
4551 return true;
4552 }
4553}
4554
4555static bool isTargetShuffleVariableMask(unsigned Opcode) {
4556 switch (Opcode) {
4557 default: return false;
4558 // Target Shuffles.
4559 case X86ISD::PSHUFB:
4560 case X86ISD::VPERMILPV:
4561 case X86ISD::VPERMIL2:
4562 case X86ISD::VPPERM:
4563 case X86ISD::VPERMV:
4564 case X86ISD::VPERMV3:
4565 return true;
4566 // 'Faux' Target Shuffles.
4567 case ISD::OR:
4568 case ISD::AND:
4569 case X86ISD::ANDNP:
4570 return true;
4571 }
4572}
4573
4574SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4575 MachineFunction &MF = DAG.getMachineFunction();
4576 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4577 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4578 int ReturnAddrIndex = FuncInfo->getRAIndex();
4579
4580 if (ReturnAddrIndex == 0) {
4581 // Set up a frame object for the return address.
4582 unsigned SlotSize = RegInfo->getSlotSize();
4583 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4584 -(int64_t)SlotSize,
4585 false);
4586 FuncInfo->setRAIndex(ReturnAddrIndex);
4587 }
4588
4589 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4590}
4591
4592bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4593 bool hasSymbolicDisplacement) {
4594 // Offset should fit into 32 bit immediate field.
4595 if (!isInt<32>(Offset))
4596 return false;
4597
4598 // If we don't have a symbolic displacement - we don't have any extra
4599 // restrictions.
4600 if (!hasSymbolicDisplacement)
4601 return true;
4602
4603 // FIXME: Some tweaks might be needed for medium code model.
4604 if (M != CodeModel::Small && M != CodeModel::Kernel)
4605 return false;
4606
4607 // For the small code model we assume that the last object lives within 16MB
4608 // of the end of the 31-bit boundary. We may also accept fairly large negative
4609 // constants, knowing that all objects are in the positive half of the address space.
4610 if (M == CodeModel::Small && Offset < 16*1024*1024)
4611 return true;
4612
4613 // For the kernel code model we know that all objects reside in the negative
4614 // half of the 32-bit address space. We may not accept negative offsets, since
4615 // they may take the address just out of range, but we may accept fairly large positive ones.
4616 if (M == CodeModel::Kernel && Offset >= 0)
4617 return true;
4618
4619 return false;
4620}
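A few illustrative inputs for the predicate above (chosen for exposition; they are not drawn from a real compilation):

  // Small code model, with a symbolic displacement:
  //   Offset = 16*1024*1024 - 1  -> accepted  (strictly below the 16MB bound)
  //   Offset = 16*1024*1024      -> rejected
  // Kernel code model, with a symbolic displacement:
  //   Offset = 0                 -> accepted
  //   Offset = -1                -> rejected  (negative offsets are refused)
  // Without a symbolic displacement, any offset that fits in a signed 32-bit
  // immediate is accepted.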
4621
4622/// Determines whether the callee is required to pop its own arguments.
4623/// Callee pop is necessary to support tail calls.
4624bool X86::isCalleePop(CallingConv::ID CallingConv,
4625 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4626 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4627 // can guarantee TCO.
4628 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4629 return true;
4630
4631 switch (CallingConv) {
4632 default:
4633 return false;
4634 case CallingConv::X86_StdCall:
4635 case CallingConv::X86_FastCall:
4636 case CallingConv::X86_ThisCall:
4637 case CallingConv::X86_VectorCall:
4638 return !is64Bit;
4639 }
4640}
4641
4642/// Return true if the condition is a signed comparison operation.
4643static bool isX86CCSigned(unsigned X86CC) {
4644 switch (X86CC) {
4645 default:
4646 llvm_unreachable("Invalid integer condition!");
4647 case X86::COND_E:
4648 case X86::COND_NE:
4649 case X86::COND_B:
4650 case X86::COND_A:
4651 case X86::COND_BE:
4652 case X86::COND_AE:
4653 return false;
4654 case X86::COND_G:
4655 case X86::COND_GE:
4656 case X86::COND_L:
4657 case X86::COND_LE:
4658 return true;
4659 }
4660}
4661
4662static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4663 switch (SetCCOpcode) {
4664 default: llvm_unreachable("Invalid integer condition!");
4665 case ISD::SETEQ: return X86::COND_E;
4666 case ISD::SETGT: return X86::COND_G;
4667 case ISD::SETGE: return X86::COND_GE;
4668 case ISD::SETLT: return X86::COND_L;
4669 case ISD::SETLE: return X86::COND_LE;
4670 case ISD::SETNE: return X86::COND_NE;
4671 case ISD::SETULT: return X86::COND_B;
4672 case ISD::SETUGT: return X86::COND_A;
4673 case ISD::SETULE: return X86::COND_BE;
4674 case ISD::SETUGE: return X86::COND_AE;
4675 }
4676}
4677
4678/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4679/// condition code, returning the condition code and the LHS/RHS of the
4680/// comparison to make.
4681static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4682 bool isFP, SDValue &LHS, SDValue &RHS,
4683 SelectionDAG &DAG) {
4684 if (!isFP) {
4685 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4686 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4687 // X > -1 -> X == 0, jump !sign.
4688 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4689 return X86::COND_NS;
4690 }
4691 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4692 // X < 0 -> X == 0, jump on sign.
4693 return X86::COND_S;
4694 }
4695 if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
4696 // X >= 0 -> X == 0, jump on !sign.
4697 return X86::COND_NS;
4698 }
4699 if (SetCCOpcode == ISD::SETLT && RHSC->getAPIntValue() == 1) {
4700 // X < 1 -> X <= 0
4701 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4702 return X86::COND_LE;
4703 }
4704 }
4705
4706 return TranslateIntegerX86CC(SetCCOpcode);
4707 }
4708
4709 // First determine if it is required or is profitable to flip the operands.
4710
4711 // If LHS is a foldable load, but RHS is not, flip the condition.
4712 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4713 !ISD::isNON_EXTLoad(RHS.getNode())) {
4714 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4715 std::swap(LHS, RHS);
4716 }
4717
4718 switch (SetCCOpcode) {
4719 default: break;
4720 case ISD::SETOLT:
4721 case ISD::SETOLE:
4722 case ISD::SETUGT:
4723 case ISD::SETUGE:
4724 std::swap(LHS, RHS);
4725 break;
4726 }
4727
4728 // On a floating point condition, the flags are set as follows:
4729 // ZF PF CF op
4730 // 0 | 0 | 0 | X > Y
4731 // 0 | 0 | 1 | X < Y
4732 // 1 | 0 | 0 | X == Y
4733 // 1 | 1 | 1 | unordered
4734 switch (SetCCOpcode) {
4735 default: llvm_unreachable("Condcode should be pre-legalized away");
4736 case ISD::SETUEQ:
4737 case ISD::SETEQ: return X86::COND_E;
4738 case ISD::SETOLT: // flipped
4739 case ISD::SETOGT:
4740 case ISD::SETGT: return X86::COND_A;
4741 case ISD::SETOLE: // flipped
4742 case ISD::SETOGE:
4743 case ISD::SETGE: return X86::COND_AE;
4744 case ISD::SETUGT: // flipped
4745 case ISD::SETULT:
4746 case ISD::SETLT: return X86::COND_B;
4747 case ISD::SETUGE: // flipped
4748 case ISD::SETULE:
4749 case ISD::SETLE: return X86::COND_BE;
4750 case ISD::SETONE:
4751 case ISD::SETNE: return X86::COND_NE;
4752 case ISD::SETUO: return X86::COND_P;
4753 case ISD::SETO: return X86::COND_NP;
4754 case ISD::SETOEQ:
4755 case ISD::SETUNE: return X86::COND_INVALID;
4756 }
4757}
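An informal trace (not analyzer output) of how the operand flip and the table above interact for one condition code:

  // SETOLT(LHS, RHS):
  //   1. SETOLT is in the flip list        ->  swap(LHS, RHS)
  //   2. table entry SETOLT ("flipped")    ->  X86::COND_A
  //   3. COND_A on the swapped compare is CF == 0 && ZF == 0, i.e. the new LHS
  //      (the old RHS) is strictly greater with an ordered result -- exactly
  //      the original "ordered less-than".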
4758
4759/// Is there a floating point cmov for the specific X86 condition code?
4760/// The current x86 ISA includes the following FP cmov instructions:
4761/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4762static bool hasFPCMov(unsigned X86CC) {
4763 switch (X86CC) {
4764 default:
4765 return false;
4766 case X86::COND_B:
4767 case X86::COND_BE:
4768 case X86::COND_E:
4769 case X86::COND_P:
4770 case X86::COND_A:
4771 case X86::COND_AE:
4772 case X86::COND_NE:
4773 case X86::COND_NP:
4774 return true;
4775 }
4776}
4777
4778
4779bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4780 const CallInst &I,
4781 MachineFunction &MF,
4782 unsigned Intrinsic) const {
4783
4784 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4785 if (!IntrData)
4786 return false;
4787
4788 Info.flags = MachineMemOperand::MONone;
4789 Info.offset = 0;
4790
4791 switch (IntrData->Type) {
4792 case TRUNCATE_TO_MEM_VI8:
4793 case TRUNCATE_TO_MEM_VI16:
4794 case TRUNCATE_TO_MEM_VI32: {
4795 Info.opc = ISD::INTRINSIC_VOID;
4796 Info.ptrVal = I.getArgOperand(0);
4797 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4798 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4799 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4800 ScalarVT = MVT::i8;
4801 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4802 ScalarVT = MVT::i16;
4803 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4804 ScalarVT = MVT::i32;
4805
4806 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4807 Info.align = Align::None();
4808 Info.flags |= MachineMemOperand::MOStore;
4809 break;
4810 }
4811 case GATHER:
4812 case GATHER_AVX2: {
4813 Info.opc = ISD::INTRINSIC_W_CHAIN;
4814 Info.ptrVal = nullptr;
4815 MVT DataVT = MVT::getVT(I.getType());
4816 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
4817 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
4818 IndexVT.getVectorNumElements());
4819 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
4820 Info.align = Align::None();
4821 Info.flags |= MachineMemOperand::MOLoad;
4822 break;
4823 }
4824 case SCATTER: {
4825 Info.opc = ISD::INTRINSIC_VOID;
4826 Info.ptrVal = nullptr;
4827 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
4828 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
4829 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
4830 IndexVT.getVectorNumElements());
4831 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
4832 Info.align = Align::None();
4833 Info.flags |= MachineMemOperand::MOStore;
4834 break;
4835 }
4836 default:
4837 return false;
4838 }
4839
4840 return true;
4841}
4842
4843/// Returns true if the target can instruction select the
4844/// specified FP immediate natively. If false, the legalizer will
4845/// materialize the FP immediate as a load from a constant pool.
4846bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
4847 bool ForCodeSize) const {
4848 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4849 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4850 return true;
4851 }
4852 return false;
4853}
4854
4855bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4856 ISD::LoadExtType ExtTy,
4857 EVT NewVT) const {
4858 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
4859
4860 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4861 // relocation must target a movq or addq instruction: don't let the load shrink.
4862 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4863 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4864 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4865 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4866
4867 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
4868 // those uses are extracted directly into a store, then the extract + store
4869 // can be store-folded. Therefore, it's probably not worth splitting the load.
4870 EVT VT = Load->getValueType(0);
4871 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
4872 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
4873 // Skip uses of the chain value. Result 0 of the node is the load value.
4874 if (UI.getUse().getResNo() != 0)
4875 continue;
4876
4877 // If this use is not an extract + store, it's probably worth splitting.
4878 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
4879 UI->use_begin()->getOpcode() != ISD::STORE)
4880 return true;
4881 }
4882 // All non-chain uses are extract + store.
4883 return false;
4884 }
4885
4886 return true;
4887}
4888
4889/// Returns true if it is beneficial to convert a load of a constant
4890/// to just the constant itself.
4891bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4892 Type *Ty) const {
4893 assert(Ty->isIntegerTy());
4894
4895 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4896 if (BitSize == 0 || BitSize > 64)
4897 return false;
4898 return true;
4899}
4900
4901bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
4902 // If we are using XMM registers in the ABI and the condition of the select is
4903 // a floating-point compare and we have blendv or conditional move, then it is
4904 // cheaper to select instead of doing a cross-register move and creating a
4905 // load that depends on the compare result.
4906 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
4907 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
4908}
4909
4910bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
4911 // TODO: It might be a win to ease or lift this restriction, but the generic
4912 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
4913 if (VT.isVector() && Subtarget.hasAVX512())
4914 return false;
4915
4916 return true;
4917}
4918
4919bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
4920 SDValue C) const {
4921 // TODO: We handle scalars using custom code, but generic combining could make
4922 // that unnecessary.
4923 APInt MulC;
4924 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
4925 return false;
4926
4927 // Find the type this will be legalized to. Otherwise we might prematurely
4928 // convert this to shl+add/sub and then still have to type legalize those ops.
4929 // Another choice would be to defer the decision for illegal types until
4930 // after type legalization. But constant splat vectors of i64 can't make it
4931 // through type legalization on 32-bit targets so we would need to special
4932 // case vXi64.
4933 while (getTypeAction(Context, VT) != TypeLegal)
4934 VT = getTypeToTransformTo(Context, VT);
4935
4936 // If vector multiply is legal, assume that's faster than shl + add/sub.
4937 // TODO: Multiply is a complex op with higher latency and lower throughput in
4938 // most implementations, so this check could be loosened based on type
4939 // and/or a CPU attribute.
4940 if (isOperationLegal(ISD::MUL, VT))
4941 return false;
4942
4943 // shl+add, shl+sub, shl+add+neg
4944 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
4945 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
4946}
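// Editorial sketch, not part of X86ISelLowering.cpp: the final check above
// accepts multipliers that sit one step away from a power of two, so the
// multiply can be rewritten as a shift plus a single add/sub/neg. A plain
// 64-bit model of that predicate (helper name invented):
#include <cstdint>

static bool isShlAddSubDecomposable(int64_t MulC) {
  auto isPow2 = [](int64_t V) { return V > 0 && (V & (V - 1)) == 0; };
  return isPow2(MulC + 1) ||   // x * 7  -> (x << 3) - x
         isPow2(MulC - 1) ||   // x * 9  -> (x << 3) + x
         isPow2(1 - MulC) ||   // x * -7 -> x - (x << 3)
         isPow2(-(MulC + 1));  // x * -9 -> -((x << 3) + x)
}
// e.g. isShlAddSubDecomposable(9) and (-7) hold, while (10) does not.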
4947
4948bool X86TargetLowering::shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
4949 bool IsSigned) const {
4950 // f80 UINT_TO_FP is more efficient using Strict code if FCMOV is available.
4951 return !IsSigned && FpVT == MVT::f80 && Subtarget.hasCMov();
4952}
4953
4954bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
4955 unsigned Index) const {
4956 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4957 return false;
4958
4959 // Mask vectors support all subregister combinations and operations that
4960 // extract half of vector.
4961 if (ResVT.getVectorElementType() == MVT::i1)
4962 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
4963 (Index == ResVT.getVectorNumElements()));
4964
4965 return (Index % ResVT.getVectorNumElements()) == 0;
4966}
4967
4968bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
4969 unsigned Opc = VecOp.getOpcode();
4970
4971 // Assume target opcodes can't be scalarized.
4972 // TODO - do we have any exceptions?
4973 if (Opc >= ISD::BUILTIN_OP_END)
4974 return false;
4975
4976 // If the vector op is not supported, try to convert to scalar.
4977 EVT VecVT = VecOp.getValueType();
4978 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
4979 return true;
4980
4981 // If the vector op is supported, but the scalar op is not, the transform may
4982 // not be worthwhile.
4983 EVT ScalarVT = VecVT.getScalarType();
4984 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
4985}
4986
4987bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const {
4988 // TODO: Allow vectors?
4989 if (VT.isVector())
4990 return false;
4991 return VT.isSimple() || !isOperationExpand(Opcode, VT);
4992}
4993
4994bool X86TargetLowering::isCheapToSpeculateCttz() const {
4995 // Speculate cttz only if we can directly use TZCNT.
4996 return Subtarget.hasBMI();
4997}
4998
4999bool X86TargetLowering::isCheapToSpeculateCtlz() const {
5000 // Speculate ctlz only if we can directly use LZCNT.
5001 return Subtarget.hasLZCNT();
5002}
5003
5004bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5005 const SelectionDAG &DAG,
5006 const MachineMemOperand &MMO) const {
5007 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5008 BitcastVT.getVectorElementType() == MVT::i1)
5009 return false;
5010
5011 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5012 return false;
5013
5014 // If both types are legal vectors, it's always ok to convert them.
5015 if (LoadVT.isVector() && BitcastVT.isVector() &&
5016 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5017 return true;
5018
5019 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5020}
5021
5022bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5023 const SelectionDAG &DAG) const {
5024 // Do not merge to float value size (128 bits) if no implicit
5025 // float attribute is set.
5026 bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
5027 Attribute::NoImplicitFloat);
5028
5029 if (NoFloat) {
5030 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
5031 return (MemVT.getSizeInBits() <= MaxIntSize);
5032 }
5033 // Make sure we don't merge greater than our preferred vector
5034 // width.
5035 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
5036 return false;
5037 return true;
5038}
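// Editorial sketch, not part of X86ISelLowering.cpp: plain restatement of the
// store-merge width policy above. The booleans and parameter names stand in
// for the NoImplicitFloat attribute and the Subtarget queries.
static bool mayMergeStoresOfWidth(unsigned MemSizeInBits, bool NoImplicitFloat,
                                  bool Is64Bit, unsigned PreferVectorWidth) {
  if (NoImplicitFloat)                          // integer registers only
    return MemSizeInBits <= (Is64Bit ? 64u : 32u);
  return MemSizeInBits <= PreferVectorWidth;    // e.g. capped at 256 by
                                                // -mprefer-vector-width=256
}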
5039
5040bool X86TargetLowering::isCtlzFast() const {
5041 return Subtarget.hasFastLZCNT();
5042}
5043
5044bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
5045 const Instruction &AndI) const {
5046 return true;
5047}
5048
5049bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
5050 EVT VT = Y.getValueType();
5051
5052 if (VT.isVector())
5053 return false;
5054
5055 if (!Subtarget.hasBMI())
5056 return false;
5057
5058 // There are only 32-bit and 64-bit forms for 'andn'.
5059 if (VT != MVT::i32 && VT != MVT::i64)
5060 return false;
5061
5062 return !isa<ConstantSDNode>(Y);
5063}
5064
5065bool X86TargetLowering::hasAndNot(SDValue Y) const {
5066 EVT VT = Y.getValueType();
5067
5068 if (!VT.isVector())
5069 return hasAndNotCompare(Y);
5070
5071 // Vector.
5072
5073 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
5074 return false;
5075
5076 if (VT == MVT::v4i32)
5077 return true;
5078
5079 return Subtarget.hasSSE2();
5080}
5081
5082bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
5083 return X.getValueType().isScalarInteger(); // 'bt'
5084}
5085
5086bool X86TargetLowering::
5087 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5088 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
5089 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
5090 SelectionDAG &DAG) const {
5091 // Does baseline recommend not to perform the fold by default?
5092 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5093 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
5094 return false;
5095 // For scalars this transform is always beneficial.
5096 if (X.getValueType().isScalarInteger())
5097 return true;
5098 // If all the shift amounts are identical, then transform is beneficial even
5099 // with rudimentary SSE2 shifts.
5100 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
5101 return true;
5102 // If we have AVX2 with its powerful shift operations, then it's also good.
5103 if (Subtarget.hasAVX2())
5104 return true;
5105 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
5106 return NewShiftOpcode == ISD::SHL;
5107}
5108
5109bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
5110 const SDNode *N, CombineLevel Level) const {
5111 assert(((N->getOpcode() == ISD::SHL &&
5112 N->getOperand(0).getOpcode() == ISD::SRL) ||
5113 (N->getOpcode() == ISD::SRL &&
5114 N->getOperand(0).getOpcode() == ISD::SHL)) &&
5115 "Expected shift-shift mask");
5116 EVT VT = N->getValueType(0);
5117 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
5118 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
5119 // Only fold if the shift values are equal - so it folds to AND.
5120 // TODO - we should fold if either is a non-uniform vector but we don't do
5121 // the fold for non-splats yet.
5122 return N->getOperand(1) == N->getOperand(0).getOperand(1);
5123 }
5124 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
5125}
5126
5127bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
5128 EVT VT = Y.getValueType();
5129
5130 // For vectors, we don't have a preference, but we probably want a mask.
5131 if (VT.isVector())
5132 return false;
5133
5134 // 64-bit shifts on 32-bit targets produce really bad bloated code.
5135 if (VT == MVT::i64 && !Subtarget.is64Bit())
5136 return false;
5137
5138 return true;
5139}
5140
5141bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
5142 SDNode *N) const {
5143 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
5144 !Subtarget.isOSWindows())
5145 return false;
5146 return true;
5147}
5148
5149bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
5150 // Any legal vector type can be splatted more efficiently than
5151 // loading/spilling from memory.
5152 return isTypeLegal(VT);
5153}
5154
5155MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
5156 MVT VT = MVT::getIntegerVT(NumBits);
5157 if (isTypeLegal(VT))
5158 return VT;
5159
5160 // PMOVMSKB can handle this.
5161 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
5162 return MVT::v16i8;
5163
5164 // VPMOVMSKB can handle this.
5165 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
5166 return MVT::v32i8;
5167
5168 // TODO: Allow 64-bit type for 32-bit target.
5169 // TODO: 512-bit types should be allowed, but make sure that those
5170 // cases are handled in combineVectorSizedSetCCEquality().
5171
5172 return MVT::INVALID_SIMPLE_VALUE_TYPE;
5173}
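// Editorial sketch, not part of X86ISelLowering.cpp: a rough standalone model
// of the width selection above. Small blocks are compared in a GPR when the
// matching integer type is legal; 128/256-bit blocks are compared as byte
// vectors via PCMPEQB/VPCMPEQB + (V)PMOVMSKB. The string return value and the
// boolean parameters are invented for illustration.
#include <string>

static std::string pickMemcmpCompareType(unsigned NumBits, bool Is64Bit,
                                         bool HasSSE2, bool HasAVX2) {
  if (NumBits == 8 || NumBits == 16 || NumBits == 32 ||
      (NumBits == 64 && Is64Bit))
    return "i" + std::to_string(NumBits);   // single scalar cmp
  if (NumBits == 128 && HasSSE2)
    return "v16i8";                         // pcmpeqb + pmovmskb
  if (NumBits == 256 && HasAVX2)
    return "v32i8";                         // vpcmpeqb + vpmovmskb
  return "invalid";                         // caller expands some other way
}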
5174
5175/// Val is the undef sentinel value or equal to the specified value.
5176static bool isUndefOrEqual(int Val, int CmpVal) {
5177 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
5178}
5179
5180/// Val is either the undef or zero sentinel value.
5181static bool isUndefOrZero(int Val) {
5182 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
5183}
5184
5185/// Return true if every element in Mask, beginning from position Pos and ending
5186/// in Pos+Size is the undef sentinel value.
5187static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
5188 return llvm::all_of(Mask.slice(Pos, Size),
5189 [](int M) { return M == SM_SentinelUndef; });
5190}
5191
5192/// Return true if the mask creates a vector whose lower half is undefined.
5193static bool isUndefLowerHalf(ArrayRef<int> Mask) {
5194 unsigned NumElts = Mask.size();
5195 return isUndefInRange(Mask, 0, NumElts / 2);
5196}
5197
5198/// Return true if the mask creates a vector whose upper half is undefined.
5199static bool isUndefUpperHalf(ArrayRef<int> Mask) {
5200 unsigned NumElts = Mask.size();
5201 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
5202}
5203
5204 /// Return true if Val falls within the specified range [Low, Hi).
5205static bool isInRange(int Val, int Low, int Hi) {
5206 return (Val >= Low && Val < Hi);
5207}
5208
5209/// Return true if the value of any element in Mask falls within the specified
5210 /// range [Low, Hi).
5211static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
5212 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
5213}
5214
5215/// Return true if Val is undef or if its value falls within the
5216 /// specified range [Low, Hi).
5217static bool isUndefOrInRange(int Val, int Low, int Hi) {
5218 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
5219}
5220
5221/// Return true if every element in Mask is undef or if its value
5222 /// falls within the specified range [Low, Hi).
5223static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5224 return llvm::all_of(
5225 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
5226}
5227
5228/// Return true if Val is undef, zero or if its value falls within the
5229 /// specified range [Low, Hi).
5230static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
5231 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
5232}
5233
5234/// Return true if every element in Mask is undef, zero or if its value
5235 /// falls within the specified range [Low, Hi).
5236static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5237 return llvm::all_of(
5238 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
5239}
5240
5241/// Return true if every element in Mask, beginning
5242/// from position Pos and ending in Pos + Size, falls within the specified
5243/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
5244static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
5245 unsigned Size, int Low, int Step = 1) {
5246 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5247 if (!isUndefOrEqual(Mask[i], Low))
5248 return false;
5249 return true;
5250}
5251
5252/// Return true if every element in Mask, beginning
5253/// from position Pos and ending in Pos+Size, falls within the specified
5254 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or is zero.
5255static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5256 unsigned Size, int Low,
5257 int Step = 1) {
5258 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5259 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
5260 return false;
5261 return true;
5262}
5263
5264/// Return true if every element in Mask, beginning
5265/// from position Pos and ending in Pos+Size is undef or is zero.
5266static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5267 unsigned Size) {
5268 return llvm::all_of(Mask.slice(Pos, Size),
5269 [](int M) { return isUndefOrZero(M); });
5270}
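// Editorial sketch, not part of X86ISelLowering.cpp: the helpers above all
// operate on shuffle masks where -1 marks an undef lane (SM_SentinelUndef)
// and -2 a lane known to be zero (SM_SentinelZero). A compact standalone
// version of the two most common predicates on plain ints:
#include <algorithm>
#include <vector>

namespace range_sketch {
constexpr int kUndef = -1; // stands in for SM_SentinelUndef
constexpr int kZero = -2;  // stands in for SM_SentinelZero

inline bool isUndefOrInRange(int M, int Low, int Hi) {
  return M == kUndef || (M >= Low && M < Hi); // half-open [Low, Hi)
}

inline bool allUndefOrInRange(const std::vector<int> &Mask, int Low, int Hi) {
  return std::all_of(Mask.begin(), Mask.end(),
                     [&](int M) { return isUndefOrInRange(M, Low, Hi); });
}
} // namespace range_sketch
// e.g. allUndefOrInRange({0, -1, 3, 2}, 0, 4) is true; an element of 4 or of
// kZero (-2) would make it false.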
5271
5272/// Helper function to test whether a shuffle mask could be
5273/// simplified by widening the elements being shuffled.
5274///
5275/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
5276/// leaves it in an unspecified state.
5277///
5278/// NOTE: This must handle normal vector shuffle masks and *target* vector
5279/// shuffle masks. The latter have the special property of a '-2' representing
5280/// a zero-ed lane of a vector.
5281static bool canWidenShuffleElements(ArrayRef<int> Mask,
5282 SmallVectorImpl<int> &WidenedMask) {
5283 WidenedMask.assign(Mask.size() / 2, 0);
5284 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
5285 int M0 = Mask[i];
5286 int M1 = Mask[i + 1];
5287
5288 // If both elements are undef, it's trivial.
5289 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
5290 WidenedMask[i / 2] = SM_SentinelUndef;
5291 continue;
5292 }
5293
5294 // Check for an undef mask and a mask value properly aligned to fit with
5295 // a pair of values. If we find such a case, use the non-undef mask's value.
5296 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
5297 WidenedMask[i / 2] = M1 / 2;
5298 continue;
5299 }
5300 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
5301 WidenedMask[i / 2] = M0 / 2;
5302 continue;
5303 }
5304
5305 // When zeroing, we need to spread the zeroing across both lanes to widen.
5306 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
5307 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
5308 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
5309 WidenedMask[i / 2] = SM_SentinelZero;
5310 continue;
5311 }
5312 return false;
5313 }
5314
5315 // Finally check if the two mask values are adjacent and aligned with
5316 // a pair.
5317 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
5318 WidenedMask[i / 2] = M0 / 2;
5319 continue;
5320 }
5321
5322 // Otherwise we can't safely widen the elements used in this shuffle.
5323 return false;
5324 }
5325 assert(WidenedMask.size() == Mask.size() / 2 &&
5326 "Incorrect size of mask after widening the elements!");
5327
5328 return true;
5329}
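// Editorial sketch, not part of X86ISelLowering.cpp: a standalone model of the
// widening rule above on plain int masks (-1 = undef, -2 = zero). Each pair of
// lanes must be all-undef, all zeroable, or an aligned adjacent pair (2k, 2k+1).
#include <cstddef>
#include <vector>

namespace widen_sketch {
constexpr int kUndef = -1, kZero = -2;

inline bool widenMask(const std::vector<int> &Mask, std::vector<int> &Wide) {
  Wide.assign(Mask.size() / 2, 0);
  for (std::size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int M0 = Mask[i], M1 = Mask[i + 1];
    if (M0 == kUndef && M1 == kUndef)             { Wide[i / 2] = kUndef; continue; }
    if (M0 == kUndef && M1 >= 0 && (M1 % 2) == 1) { Wide[i / 2] = M1 / 2; continue; }
    if (M1 == kUndef && M0 >= 0 && (M0 % 2) == 0) { Wide[i / 2] = M0 / 2; continue; }
    if (M0 == kZero || M1 == kZero) {
      if (M0 < 0 && M1 < 0) { Wide[i / 2] = kZero; continue; } // both undef/zero
      return false; // zero lane paired with a real element: cannot widen
    }
    if (M0 >= 0 && (M0 % 2) == 0 && M1 == M0 + 1) { Wide[i / 2] = M0 / 2; continue; }
    return false; // misaligned pair
  }
  return true;
}
} // namespace widen_sketch
// e.g. {0, 1, 6, 7} widens to {0, 3}; {-1, 5, 2, 3} widens to {2, 1};
// a mask starting {1, 2, ...} does not widen because the pair is misaligned.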
5330
5331static bool canWidenShuffleElements(ArrayRef<int> Mask,
5332 const APInt &Zeroable,
5333 bool V2IsZero,
5334 SmallVectorImpl<int> &WidenedMask) {
5335 // Create an alternative mask with info about zeroable elements.
5336 // Here we do not set undef elements as zeroable.
5337 SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
5338 if (V2IsZero) {
5339 assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
5340 for (int i = 0, Size = Mask.size(); i != Size; ++i)
5341 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
5342 ZeroableMask[i] = SM_SentinelZero;
5343 }
5344 return canWidenShuffleElements(ZeroableMask, WidenedMask);
5345}
5346
5347static bool canWidenShuffleElements(ArrayRef<int> Mask) {
5348 SmallVector<int, 32> WidenedMask;
5349 return canWidenShuffleElements(Mask, WidenedMask);
5350}
5351
5352/// Returns true if Elt is a constant zero or a floating point constant +0.0.
5353bool X86::isZeroNode(SDValue Elt) {
5354 return isNullConstant(Elt) || isNullFPConstant(Elt);
5355}
5356
5357// Build a vector of constants.
5358// Use an UNDEF node if MaskElt == -1.
5359 // Split 64-bit constants in 32-bit mode.
5360static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
5361 const SDLoc &dl, bool IsMask = false) {
5362
5363 SmallVector<SDValue, 32> Ops;
5364 bool Split = false;
5365
5366 MVT ConstVecVT = VT;
5367 unsigned NumElts = VT.getVectorNumElements();
5368 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5369 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5370 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5371 Split = true;
5372 }
5373
5374 MVT EltVT = ConstVecVT.getVectorElementType();
5375 for (unsigned i = 0; i < NumElts; ++i) {
5376 bool IsUndef = Values[i] < 0 && IsMask;
5377 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
5378 DAG.getConstant(Values[i], dl, EltVT);
5379 Ops.push_back(OpNode);
5380 if (Split)
5381 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
5382 DAG.getConstant(0, dl, EltVT));
5383 }
5384 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5385 if (Split)
5386 ConstsNode = DAG.getBitcast(VT, ConstsNode);
5387 return ConstsNode;
5388}
5389
5390static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
5391 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5392 assert(Bits.size() == Undefs.getBitWidth() &&
5393 "Unequal constant and undef arrays");
5394 SmallVector<SDValue, 32> Ops;
5395 bool Split = false;
5396
5397 MVT ConstVecVT = VT;
5398 unsigned NumElts = VT.getVectorNumElements();
5399 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5400 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5401 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5402 Split = true;
5403 }
5404
5405 MVT EltVT = ConstVecVT.getVectorElementType();
5406 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
5407 if (Undefs[i]) {
5408 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
5409 continue;
5410 }
5411 const APInt &V = Bits[i];
5412 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
5413 if (Split) {
5414 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
5415 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
5416 } else if (EltVT == MVT::f32) {
5417 APFloat FV(APFloat::IEEEsingle(), V);
5418 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5419 } else if (EltVT == MVT::f64) {
5420 APFloat FV(APFloat::IEEEdouble(), V);
5421 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5422 } else {
5423 Ops.push_back(DAG.getConstant(V, dl, EltVT));
5424 }
5425 }
5426
5427 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5428 return DAG.getBitcast(VT, ConstsNode);
5429}
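// Editorial sketch, not part of X86ISelLowering.cpp: when i64 is not a legal
// type (32-bit mode), the builder above emits each 64-bit element as two
// 32-bit constants, low half first, and bitcasts the result back to VT.
// Standalone model of that split (helper name invented):
#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> splitConst64(uint64_t V) {
  return {static_cast<uint32_t>(V),         // low 32 bits  -> lane 2*i
          static_cast<uint32_t>(V >> 32)};  // high 32 bits -> lane 2*i + 1
}
// e.g. splitConst64(0x1122334455667788ULL) yields {0x55667788, 0x11223344}.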
5430
5431/// Returns a vector of specified type with all zero elements.
5432static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
5433 SelectionDAG &DAG, const SDLoc &dl) {
5434 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
5435 VT.getVectorElementType() == MVT::i1) &&
5436 "Unexpected vector type");
5437
5438 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
5439 // type. This ensures they get CSE'd. But if the integer type is not
5440 // available, use a floating-point +0.0 instead.
5441 SDValue Vec;
5442 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
5443 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
5444 } else if (VT.isFloatingPoint()) {
5445 Vec = DAG.getConstantFP(+0.0, dl, VT);
5446 } else if (VT.getVectorElementType() == MVT::i1) {
5447 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
5448 "Unexpected vector type");
5449 Vec = DAG.getConstant(0, dl, VT);
5450 } else {
5451 unsigned Num32BitElts = VT.getSizeInBits() / 32;
5452 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
5453 }
5454 return DAG.getBitcast(VT, Vec);
5455}
5456
5457static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5458 const SDLoc &dl, unsigned vectorWidth) {
5459 EVT VT = Vec.getValueType();
5460 EVT ElVT = VT.getVectorElementType();
5461 unsigned Factor = VT.getSizeInBits()/vectorWidth;
5462 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5463 VT.getVectorNumElements()/Factor);
5464
5465 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
5466 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5467 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5468
5469 // This is the index of the first element of the vectorWidth-bit chunk
5470 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5471 IdxVal &= ~(ElemsPerChunk - 1);
5472
5473 // If the input is a buildvector just emit a smaller one.
5474 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5475 return DAG.getBuildVector(ResultVT, dl,
5476 Vec->ops().slice(IdxVal, ElemsPerChunk));
5477
5478 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5479 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
5480}
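// Editorial sketch, not part of X86ISelLowering.cpp: the IdxVal &=
// ~(ElemsPerChunk - 1) step above rounds the element index down to the start
// of the 128/256-bit chunk containing it. Plain arithmetic model with a
// worked example (helper name invented):
static constexpr unsigned chunkAlignedIndex(unsigned IdxVal,
                                            unsigned VectorWidthBits,
                                            unsigned EltSizeBits) {
  return IdxVal & ~((VectorWidthBits / EltSizeBits) - 1);
}
// For a v8i32 source and 128-bit chunks there are 4 elements per chunk, so
// element 6 lives in the upper half and rounds down to 4:
static_assert(chunkAlignedIndex(6, 128, 32) == 4, "upper 128-bit half");
static_assert(chunkAlignedIndex(3, 128, 32) == 0, "lower 128-bit half");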
5481
5482/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
5483/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5484/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5485/// instructions or a simple subregister reference. Idx is an index in the
5486/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5487/// lowering EXTRACT_VECTOR_ELT operations easier.
5488static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5489 SelectionDAG &DAG, const SDLoc &dl) {
5490 assert((Vec.getValueType().is256BitVector() ||
5491 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5492 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5493}
5494
5495/// Generate a DAG to grab 256-bits from a 512-bit vector.
5496static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5497 SelectionDAG &DAG, const SDLoc &dl) {
5498 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5499 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5500}
5501
5502static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5503 SelectionDAG &DAG, const SDLoc &dl,
5504 unsigned vectorWidth) {
5505 assert((vectorWidth == 128 || vectorWidth == 256) &&
5506 "Unsupported vector width");
5507 // Inserting UNDEF just returns Result.
5508 if (Vec.isUndef())
5509 return Result;
5510 EVT VT = Vec.getValueType();
5511 EVT ElVT = VT.getVectorElementType();
5512 EVT ResultVT = Result.getValueType();
5513
5514 // Insert the relevant vectorWidth bits.
5515 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5516 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5517
5518 // This is the index of the first element of the vectorWidth-bit chunk
5519 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5520 IdxVal &= ~(ElemsPerChunk - 1);
5521
5522 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5523 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5524}
5525
5526/// Generate a DAG to put 128-bits into a vector > 128 bits. This
5527/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5528/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5529/// simple superregister reference. Idx is an index in the 128 bits
5530/// we want. It need not be aligned to a 128-bit boundary. That makes
5531/// lowering INSERT_VECTOR_ELT operations easier.
5532static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5533 SelectionDAG &DAG, const SDLoc &dl) {
5534 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5535 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5536}
5537
5538/// Widen a vector to a larger size with the same scalar type, with the new
5539/// elements either zero or undef.
5540static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
5541 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5542 const SDLoc &dl) {
5543 assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
5544 Vec.getValueType().getScalarType() == VT.getScalarType() &&
5545 "Unsupported vector widening type");
5546 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
5547 : DAG.getUNDEF(VT);
5548 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
5549 DAG.getIntPtrConstant(0, dl));
5550}
5551
5552/// Widen a vector to a larger size with the same scalar type, with the new
5553/// elements either zero or undef.
5554static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
5555 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5556 const SDLoc &dl, unsigned WideSizeInBits) {
5557 assert(Vec.getValueSizeInBits() < WideSizeInBits &&
5558 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
5559 "Unsupported vector widening type");
5560 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
5561 MVT SVT = Vec.getSimpleValueType().getScalarType();
5562 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
5563 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
5564}
5565
5566 // Helper function to collect subvector ops that are concatenated together,
5567// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
5568// The subvectors in Ops are guaranteed to be the same type.
5569static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
5570 assert(Ops.empty() && "Expected an empty ops vector");
5571
5572 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
5573 Ops.append(N->op_begin(), N->op_end());
5574 return true;
5575 }
5576
5577 if (N->getOpcode() == ISD::INSERT_SUBVECTOR &&
5578 isa<ConstantSDNode>(N->getOperand(2))) {
5579 SDValue Src = N->getOperand(0);
5580 SDValue Sub = N->getOperand(1);
5581 const APInt &Idx = N->getConstantOperandAPInt(2);
5582 EVT VT = Src.getValueType();
5583 EVT SubVT = Sub.getValueType();
5584
5585 // TODO - Handle more general insert_subvector chains.
5586 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
5587 Idx == (VT.getVectorNumElements() / 2) &&
5588 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
5589 Src.getOperand(1).getValueType() == SubVT &&
5590 isNullConstant(Src.getOperand(2))) {
5591 Ops.push_back(Src.getOperand(1));
5592 Ops.push_back(Sub);
5593 return true;
5594 }
5595 }
5596
5597 return false;
5598}
5599
5600// Helper for splitting operands of an operation to legal target size and
5601 // applying a function on each part.
5602// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
5603// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
5604// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
5605// The argument Builder is a function that will be applied on each split part:
5606// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
5607template <typename F>
5608SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
5609 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
5610 F Builder, bool CheckBWI = true) {
5611 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
5612 unsigned NumSubs = 1;
5613 if ((CheckBWI && Subtarget.useBWIRegs()) ||
5614 (!CheckBWI && Subtarget.useAVX512Regs())) {
5615 if (VT.getSizeInBits() > 512) {
5616 NumSubs = VT.getSizeInBits() / 512;
5617 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
5618 }
5619 } else if (Subtarget.hasAVX2()) {
5620 if (VT.getSizeInBits() > 256) {
5621 NumSubs = VT.getSizeInBits() / 256;
5622 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
5623 }
5624 } else {
5625 if (VT.getSizeInBits() > 128) {
5626 NumSubs = VT.getSizeInBits() / 128;
5627 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
5628 }
5629 }
5630
5631 if (NumSubs == 1)
5632 return Builder(DAG, DL, Ops);
5633
5634 SmallVector<SDValue, 4> Subs;
5635 for (unsigned i = 0; i != NumSubs; ++i) {
5636 SmallVector<SDValue, 2> SubOps;
5637 for (SDValue Op : Ops) {
5638 EVT OpVT = Op.getValueType();
5639 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
5640 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
5641 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
5642 }
5643 Subs.push_back(Builder(DAG, DL, SubOps));
5644 }
5645 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
5646}
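// Editorial sketch, not part of X86ISelLowering.cpp: a model of how the
// splitter above picks the number of pieces. The widest vector width the
// subtarget can use (512/256/128 bits) divides the requested width; the
// boolean parameters stand in for the Subtarget feature queries.
static unsigned numSplitParts(unsigned VTSizeInBits, bool Use512BitRegs,
                              bool HasAVX2) {
  unsigned LegalWidth = Use512BitRegs ? 512 : (HasAVX2 ? 256 : 128);
  if (VTSizeInBits <= LegalWidth)
    return 1;                        // no splitting, Builder runs once
  return VTSizeInBits / LegalWidth;  // assumed to divide evenly (asserted above)
}
// e.g. a 512-bit op on an AVX2-only target is built as 2 x 256-bit pieces and
// re-concatenated; with usable 512-bit registers it is built in one piece.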
5647
5648/// Insert i1-subvector to i1-vector.
5649static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5650 const X86Subtarget &Subtarget) {
5651
5652 SDLoc dl(Op);
5653 SDValue Vec = Op.getOperand(0);
5654 SDValue SubVec = Op.getOperand(1);
5655 SDValue Idx = Op.getOperand(2);
5656
5657 if (!isa<ConstantSDNode>(Idx))
5658 return SDValue();
5659
5660 // Inserting undef is a nop. We can just return the original vector.
5661 if (SubVec.isUndef())
5662 return Vec;
5663
5664 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5665 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5666 return Op;
5667
5668 MVT OpVT = Op.getSimpleValueType();
5669 unsigned NumElems = OpVT.getVectorNumElements();
5670
5671 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5672
5673 // Extend to natively supported kshift.
5674 MVT WideOpVT = OpVT;
5675 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
5676 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5677
5678 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
5679 // if necessary.
5680 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
5681 // May need to promote to a legal type.
5682 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5683 DAG.getConstant(0, dl, WideOpVT),
5684 SubVec, Idx);
5685 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5686 }
5687
5688 MVT SubVecVT = SubVec.getSimpleValueType();
5689 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5690
5691 assert(IdxVal + SubVecNumElems <= NumElems &&
5692 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5693 "Unexpected index value in INSERT_SUBVECTOR");
5694
5695 SDValue Undef = DAG.getUNDEF(WideOpVT);
5696
5697 if (IdxVal == 0) {
5698 // Zero lower bits of the Vec
5699 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
5700 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
5701 ZeroIdx);
5702 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5703 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5704 // Merge them together, SubVec should be zero extended.
5705 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5706 DAG.getConstant(0, dl, WideOpVT),
5707 SubVec, ZeroIdx);
5708 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5709 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5710 }
5711
5712 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5713 Undef, SubVec, ZeroIdx);
5714
5715 if (Vec.isUndef()) {
5716 assert(IdxVal != 0 && "Unexpected index");
5717 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5718 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
5719 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5720 }
5721
5722 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5723 assert(IdxVal != 0 && "Unexpected index");
5724 NumElems = WideOpVT.getVectorNumElements();
5725 unsigned ShiftLeft = NumElems - SubVecNumElems;
5726 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5727 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5728 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
5729 if (ShiftRight != 0)
5730 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
5731 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
5732 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5733 }
5734
5735 // Simple case when we put subvector in the upper part
5736 if (IdxVal + SubVecNumElems == NumElems) {
5737 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5738 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
5739 if (SubVecNumElems * 2 == NumElems) {
5740 // Special case, use legal zero extending insert_subvector. This allows
5741 // isel to optimize when bits are known zero.
5742 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
5743 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5744 DAG.getConstant(0, dl, WideOpVT),
5745 Vec, ZeroIdx);
5746 } else {
5747 // Otherwise use explicit shifts to zero the bits.
5748 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5749 Undef, Vec, ZeroIdx);
5750 NumElems = WideOpVT.getVectorNumElements();
5751 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
5752 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5753 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5754 }
5755 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5756 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5757 }
5758
5759 // Inserting into the middle is more complicated.
5760
5761 NumElems = WideOpVT.getVectorNumElements();
5762
5763 // Widen the vector if needed.
5764 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5765
5766 // Clear the upper bits of the subvector and move it to its insert position.
5767 unsigned ShiftLeft = NumElems - SubVecNumElems;
5768 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5769 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
5770 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5771 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
5772 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
5773
5774 // Isolate the bits below the insertion point.
5775 unsigned LowShift = NumElems - IdxVal;
5776 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
5777 DAG.getTargetConstant(LowShift, dl, MVT::i8));
5778 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
5779 DAG.getTargetConstant(LowShift, dl, MVT::i8));
5780
5781 // Isolate the bits after the last inserted bit.
5782 unsigned HighShift = IdxVal + SubVecNumElems;
5783 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5784 DAG.getTargetConstant(HighShift, dl, MVT::i8));
5785 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
5786 DAG.getTargetConstant(HighShift, dl, MVT::i8));
5787
5788 // Now OR all 3 pieces together.
5789 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
5790 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
5791
5792 // Reduce to original width if needed.
5793 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5794}
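// Editorial sketch, not part of X86ISelLowering.cpp: the middle-insertion path
// above is plain bit surgery on the k-register. Place the subvector at IdxVal,
// keep the bits of Vec below and above the inserted range, and OR the three
// pieces. Standalone model for masks of up to 16 lanes held in a 32-bit
// integer, so no shift ever reaches the type width (all names invented):
#include <cstdint>

static uint32_t kshiftl(uint32_t K, unsigned Amt, unsigned NumElems) {
  return (K << Amt) & ((1u << NumElems) - 1); // lanes shifted out are dropped
}
static uint32_t kshiftr(uint32_t K, unsigned Amt) { return K >> Amt; }

static uint32_t insertMaskBits(uint32_t Vec, uint32_t SubVec, unsigned NumElems,
                               unsigned SubNumElems, unsigned IdxVal) {
  // Clear SubVec's upper lanes, then move it to the insert position.
  uint32_t Sub = kshiftr(kshiftl(SubVec, NumElems - SubNumElems, NumElems),
                         NumElems - SubNumElems - IdxVal);
  // Lanes of Vec strictly below the insertion point...
  uint32_t Low = kshiftr(kshiftl(Vec, NumElems - IdxVal, NumElems),
                         NumElems - IdxVal);
  // ...and lanes at or above the end of the inserted range.
  uint32_t High = kshiftl(kshiftr(Vec, IdxVal + SubNumElems),
                          IdxVal + SubNumElems, NumElems);
  return Sub | Low | High;
}
// e.g. inserting the 4-lane mask 0b1010 at lane 4 of the 16-lane mask 0xFFFF:
// insertMaskBits(0xFFFF, 0xA, 16, 4, 4) == 0xFFAF (lanes 4..7 replaced).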
5795
5796static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
5797 const SDLoc &dl) {
5798 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
5799 EVT SubVT = V1.getValueType();
5800 EVT SubSVT = SubVT.getScalarType();
5801 unsigned SubNumElts = SubVT.getVectorNumElements();
5802 unsigned SubVectorWidth = SubVT.getSizeInBits();
5803 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
5804 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
5805 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
5806}
5807
5808/// Returns a vector of specified type with all bits set.
5809/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5810/// Then bitcast to their original type, ensuring they get CSE'd.
5811static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5812 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5813 "Expected a 128/256/512-bit vector type");
5814
5815 APInt Ones = APInt::getAllOnesValue(32);
5816 unsigned NumElts = VT.getSizeInBits() / 32;
5817 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5818 return DAG.getBitcast(VT, Vec);
5819}
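The helper above deliberately materializes every all-ones constant with 32-bit elements so that equivalent constants CSE regardless of the requested type; only the element count changes with the vector width. A minimal standalone sketch of that width computation, with a hypothetical helper and plain integers in place of SelectionDAG types:

#include <cstdint>
#include <cstdio>
#include <vector>

// Mirrors "NumElts = VT.getSizeInBits() / 32" above: the all-ones payload is
// always expressed as 32-bit words, whatever type it is later bitcast to.
static std::vector<uint32_t> onesPayload(unsigned SizeInBits) {
  return std::vector<uint32_t>(SizeInBits / 32, 0xFFFFFFFFu);
}

int main() {
  std::printf("256-bit ones vector: %zu x 0xFFFFFFFF\n",
              onesPayload(256).size()); // 8 words
}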
5820
5821// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
5822static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
5823 switch (Opcode) {
5824 case ISD::ANY_EXTEND:
5825 case ISD::ANY_EXTEND_VECTOR_INREG:
5826 return ISD::ANY_EXTEND_VECTOR_INREG;
5827 case ISD::ZERO_EXTEND:
5828 case ISD::ZERO_EXTEND_VECTOR_INREG:
5829 return ISD::ZERO_EXTEND_VECTOR_INREG;
5830 case ISD::SIGN_EXTEND:
5831 case ISD::SIGN_EXTEND_VECTOR_INREG:
5832 return ISD::SIGN_EXTEND_VECTOR_INREG;
5833 }
5834  llvm_unreachable("Unknown opcode");
5835}
5836
5837static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
5838 SDValue In, SelectionDAG &DAG) {
5839 EVT InVT = In.getValueType();
5840  assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
5841  assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
5842          ISD::ZERO_EXTEND == Opcode) &&
5843         "Unknown extension opcode");
5844
5845 // For 256-bit vectors, we only need the lower (128-bit) input half.
5846 // For 512-bit vectors, we only need the lower input half or quarter.
5847 if (InVT.getSizeInBits() > 128) {
5848    assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
5849           "Expected VTs to be the same size!");
5850 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5851 In = extractSubVector(In, 0, DAG, DL,
5852 std::max(128U, VT.getSizeInBits() / Scale));
5853 InVT = In.getValueType();
5854 }
5855
5856 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
5857 Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
5858
5859 return DAG.getNode(Opcode, DL, VT, In);
5860}
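Because each extended element consumes only a fraction of the input vector, the code above first narrows the input to its low half or quarter. A standalone sketch of that width computation (hypothetical helper, plain integers):

#include <algorithm>
#include <cstdio>

// Mirrors "std::max(128U, VT.getSizeInBits() / Scale)" above: how many low
// input bits an extend to DstBits with the given element widths really needs.
static unsigned neededInputBits(unsigned DstBits, unsigned DstEltBits,
                                unsigned SrcEltBits) {
  unsigned Scale = DstEltBits / SrcEltBits;
  return std::max(128u, DstBits / Scale);
}

int main() {
  std::printf("%u\n", neededInputBits(512, 64, 32)); // sext to v8i64: 256 bits
  std::printf("%u\n", neededInputBits(256, 64, 32)); // sext to v4i64: 128 bits
}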
5861
5862// Match (xor X, -1) -> X.
5863// Match extract_subvector(xor X, -1) -> extract_subvector(X).
5864// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
5865static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5866 V = peekThroughBitcasts(V);
5867 if (V.getOpcode() == ISD::XOR &&
5868 ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
5869 return V.getOperand(0);
5870 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5871 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5872 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5873 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5874 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
5875 Not, V.getOperand(1));
5876 }
5877 }
5878 SmallVector<SDValue, 2> CatOps;
5879 if (collectConcatOps(V.getNode(), CatOps)) {
5880 for (SDValue &CatOp : CatOps) {
5881 SDValue NotCat = IsNOT(CatOp, DAG);
5882 if (!NotCat) return SDValue();
5883 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5884 }
5885 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
5886 }
5887 return SDValue();
5888}
5889
5890/// Returns a vector_shuffle node for an unpackl operation.
5891static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5892 SDValue V1, SDValue V2) {
5893 SmallVector<int, 8> Mask;
5894 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5895 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5896}
5897
5898/// Returns a vector_shuffle node for an unpackh operation.
5899static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5900 SDValue V1, SDValue V2) {
5901 SmallVector<int, 8> Mask;
5902 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5903 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5904}
5905
5906 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
5907/// This produces a shuffle where the low element of V2 is swizzled into the
5908/// zero/undef vector, landing at element Idx.
5909/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5910static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5911 bool IsZero,
5912 const X86Subtarget &Subtarget,
5913 SelectionDAG &DAG) {
5914 MVT VT = V2.getSimpleValueType();
5915 SDValue V1 = IsZero
5916 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5917 int NumElems = VT.getVectorNumElements();
5918 SmallVector<int, 16> MaskVec(NumElems);
5919 for (int i = 0; i != NumElems; ++i)
5920 // If this is the insertion idx, put the low elt of V2 here.
5921 MaskVec[i] = (i == Idx) ? NumElems : i;
5922 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5923}
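A standalone sketch of the mask construction above (hypothetical helper): index NumElems selects the low element of V2, and every other slot keeps the corresponding element of the zero/undef vector.

#include <cstdio>
#include <vector>

static std::vector<int> zeroOrUndefMask(int NumElems, int Idx) {
  std::vector<int> Mask(NumElems);
  for (int i = 0; i != NumElems; ++i)
    Mask[i] = (i == Idx) ? NumElems : i; // NumElems == low element of V2
  return Mask;
}

int main() {
  for (int M : zeroOrUndefMask(4, 0))
    std::printf("%d ", M); // 4 1 2 3
  std::printf("\n");
}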
5924
5925static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
5926 if (!Load || !ISD::isNormalLoad(Load))
5927 return nullptr;
5928
5929 SDValue Ptr = Load->getBasePtr();
5930 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5931 Ptr->getOpcode() == X86ISD::WrapperRIP)
5932 Ptr = Ptr->getOperand(0);
5933
5934 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5935 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
5936 return nullptr;
5937
5938 return CNode->getConstVal();
5939}
5940
5941static const Constant *getTargetConstantFromNode(SDValue Op) {
5942 Op = peekThroughBitcasts(Op);
5943 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
5944}
5945
5946const Constant *
5947X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
5948  assert(LD && "Unexpected null LoadSDNode");
5949 return getTargetConstantFromNode(LD);
5950}
5951
5952// Extract raw constant bits from constant pools.
5953static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5954 APInt &UndefElts,
5955 SmallVectorImpl<APInt> &EltBits,
5956 bool AllowWholeUndefs = true,
5957 bool AllowPartialUndefs = true) {
5958  assert(EltBits.empty() && "Expected an empty EltBits vector");
5959
5960 Op = peekThroughBitcasts(Op);
5961
5962 EVT VT = Op.getValueType();
5963 unsigned SizeInBits = VT.getSizeInBits();
5964  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5965 unsigned NumElts = SizeInBits / EltSizeInBits;
5966
5967 // Bitcast a source array of element bits to the target size.
5968 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5969 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5970 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5971    assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5972           "Constant bit sizes don't match");
5973
5974 // Don't split if we don't allow undef bits.
5975 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5976 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5977 return false;
5978
5979 // If we're already the right size, don't bother bitcasting.
5980 if (NumSrcElts == NumElts) {
5981 UndefElts = UndefSrcElts;
5982 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5983 return true;
5984 }
5985
5986 // Extract all the undef/constant element data and pack into single bitsets.
5987 APInt UndefBits(SizeInBits, 0);
5988 APInt MaskBits(SizeInBits, 0);
5989
5990 for (unsigned i = 0; i != NumSrcElts; ++i) {
5991 unsigned BitOffset = i * SrcEltSizeInBits;
5992 if (UndefSrcElts[i])
5993 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5994 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5995 }
5996
5997 // Split the undef/constant single bitset data into the target elements.
5998 UndefElts = APInt(NumElts, 0);
5999 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
6000
6001 for (unsigned i = 0; i != NumElts; ++i) {
6002 unsigned BitOffset = i * EltSizeInBits;
6003 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
6004
6005 // Only treat an element as UNDEF if all bits are UNDEF.
6006 if (UndefEltBits.isAllOnesValue()) {
6007 if (!AllowWholeUndefs)
6008 return false;
6009 UndefElts.setBit(i);
6010 continue;
6011 }
6012
6013 // If only some bits are UNDEF then treat them as zero (or bail if not
6014 // supported).
6015 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
6016 return false;
6017
6018 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
6019 }
6020 return true;
6021 };
6022
6023 // Collect constant bits and insert into mask/undef bit masks.
6024 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
6025 unsigned UndefBitIndex) {
6026 if (!Cst)
6027 return false;
6028 if (isa<UndefValue>(Cst)) {
6029 Undefs.setBit(UndefBitIndex);
6030 return true;
6031 }
6032 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
6033 Mask = CInt->getValue();
6034 return true;
6035 }
6036 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
6037 Mask = CFP->getValueAPF().bitcastToAPInt();
6038 return true;
6039 }
6040 return false;
6041 };
6042
6043 // Handle UNDEFs.
6044 if (Op.isUndef()) {
6045 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
6046 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
6047 return CastBitData(UndefSrcElts, SrcEltBits);
6048 }
6049
6050 // Extract scalar constant bits.
6051 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
6052 APInt UndefSrcElts = APInt::getNullValue(1);
6053 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
6054 return CastBitData(UndefSrcElts, SrcEltBits);
6055 }
6056 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
6057 APInt UndefSrcElts = APInt::getNullValue(1);
6058 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6059 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
6060 return CastBitData(UndefSrcElts, SrcEltBits);
6061 }
6062
6063 // Extract constant bits from build vector.
6064 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6065 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6066 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6067
6068 APInt UndefSrcElts(NumSrcElts, 0);
6069 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6070 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6071 const SDValue &Src = Op.getOperand(i);
6072 if (Src.isUndef()) {
6073 UndefSrcElts.setBit(i);
6074 continue;
6075 }
6076 auto *Cst = cast<ConstantSDNode>(Src);
6077 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
6078 }
6079 return CastBitData(UndefSrcElts, SrcEltBits);
6080 }
6081 if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
6082 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6083 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6084
6085 APInt UndefSrcElts(NumSrcElts, 0);
6086 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6087 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6088 const SDValue &Src = Op.getOperand(i);
6089 if (Src.isUndef()) {
6090 UndefSrcElts.setBit(i);
6091 continue;
6092 }
6093 auto *Cst = cast<ConstantFPSDNode>(Src);
6094 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6095 SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
6096 }
6097 return CastBitData(UndefSrcElts, SrcEltBits);
6098 }
6099
6100 // Extract constant bits from constant pool vector.
6101 if (auto *Cst = getTargetConstantFromNode(Op)) {
6102 Type *CstTy = Cst->getType();
6103 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6104 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
6105 return false;
6106
6107 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
6108 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6109
6110 APInt UndefSrcElts(NumSrcElts, 0);
6111 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6112 for (unsigned i = 0; i != NumSrcElts; ++i)
6113 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
6114 UndefSrcElts, i))
6115 return false;
6116
6117 return CastBitData(UndefSrcElts, SrcEltBits);
6118 }
6119
6120 // Extract constant bits from a broadcasted constant pool scalar.
6121 if (Op.getOpcode() == X86ISD::VBROADCAST &&
6122 EltSizeInBits <= VT.getScalarSizeInBits()) {
6123 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
6124 unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
6125 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6126
6127 APInt UndefSrcElts(NumSrcElts, 0);
6128 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
6129 if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
6130 if (UndefSrcElts[0])
6131 UndefSrcElts.setBits(0, NumSrcElts);
6132 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
6133 return CastBitData(UndefSrcElts, SrcEltBits);
6134 }
6135 }
6136 }
6137
6138 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
6139 EltSizeInBits <= VT.getScalarSizeInBits()) {
6140 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6141 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
6142 return false;
6143
6144 SDValue Ptr = MemIntr->getBasePtr();
6145 if (Ptr->getOpcode() == X86ISD::Wrapper ||
6146 Ptr->getOpcode() == X86ISD::WrapperRIP)
6147 Ptr = Ptr->getOperand(0);
6148
6149 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
6150 if (!CNode || CNode->isMachineConstantPoolEntry() ||
6151 CNode->getOffset() != 0)
6152 return false;
6153
6154 if (const Constant *C = CNode->getConstVal()) {
6155 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
6156 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6157
6158 APInt UndefSrcElts(NumSrcElts, 0);
6159 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
6160 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
6161 if (UndefSrcElts[0])
6162 UndefSrcElts.setBits(0, NumSrcElts);
6163 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
6164 return CastBitData(UndefSrcElts, SrcEltBits);
6165 }
6166 }
6167 }
6168
6169 // Extract constant bits from a subvector broadcast.
6170 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
6171 SmallVector<APInt, 16> SubEltBits;
6172 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6173 UndefElts, SubEltBits, AllowWholeUndefs,
6174 AllowPartialUndefs)) {
6175 UndefElts = APInt::getSplat(NumElts, UndefElts);
6176 while (EltBits.size() < NumElts)
6177 EltBits.append(SubEltBits.begin(), SubEltBits.end());
6178 return true;
6179 }
6180 }
6181
6182 // Extract a rematerialized scalar constant insertion.
6183 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
6184 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
6185 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
6186 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6187 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6188
6189 APInt UndefSrcElts(NumSrcElts, 0);
6190 SmallVector<APInt, 64> SrcEltBits;
6191 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
6192 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
6193 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
6194 return CastBitData(UndefSrcElts, SrcEltBits);
6195 }
6196
6197 // Insert constant bits from a base and sub vector sources.
6198 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
6199 isa<ConstantSDNode>(Op.getOperand(2))) {
6200 // TODO - support insert_subvector through bitcasts.
6201 if (EltSizeInBits != VT.getScalarSizeInBits())
6202 return false;
6203
6204 APInt UndefSubElts;
6205 SmallVector<APInt, 32> EltSubBits;
6206 if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
6207 UndefSubElts, EltSubBits,
6208 AllowWholeUndefs, AllowPartialUndefs) &&
6209 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6210 UndefElts, EltBits, AllowWholeUndefs,
6211 AllowPartialUndefs)) {
6212 unsigned BaseIdx = Op.getConstantOperandVal(2);
6213 UndefElts.insertBits(UndefSubElts, BaseIdx);
6214 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
6215 EltBits[BaseIdx + i] = EltSubBits[i];
6216 return true;
6217 }
6218 }
6219
6220 // Extract constant bits from a subvector's source.
6221 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6222 isa<ConstantSDNode>(Op.getOperand(1))) {
6223 // TODO - support extract_subvector through bitcasts.
6224 if (EltSizeInBits != VT.getScalarSizeInBits())
6225 return false;
6226
6227 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6228 UndefElts, EltBits, AllowWholeUndefs,
6229 AllowPartialUndefs)) {
6230 EVT SrcVT = Op.getOperand(0).getValueType();
6231 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6232 unsigned NumSubElts = VT.getVectorNumElements();
6233 unsigned BaseIdx = Op.getConstantOperandVal(1);
6234 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
6235 if ((BaseIdx + NumSubElts) != NumSrcElts)
6236 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
6237 if (BaseIdx != 0)
6238 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
6239 return true;
6240 }
6241 }
6242
6243 // Extract constant bits from shuffle node sources.
6244 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
6245 // TODO - support shuffle through bitcasts.
6246 if (EltSizeInBits != VT.getScalarSizeInBits())
6247 return false;
6248
6249 ArrayRef<int> Mask = SVN->getMask();
6250 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
6251 llvm::any_of(Mask, [](int M) { return M < 0; }))
6252 return false;
6253
6254 APInt UndefElts0, UndefElts1;
6255 SmallVector<APInt, 32> EltBits0, EltBits1;
6256 if (isAnyInRange(Mask, 0, NumElts) &&
6257 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6258 UndefElts0, EltBits0, AllowWholeUndefs,
6259 AllowPartialUndefs))
6260 return false;
6261 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
6262 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
6263 UndefElts1, EltBits1, AllowWholeUndefs,
6264 AllowPartialUndefs))
6265 return false;
6266
6267 UndefElts = APInt::getNullValue(NumElts);
6268 for (int i = 0; i != (int)NumElts; ++i) {
6269 int M = Mask[i];
6270 if (M < 0) {
6271 UndefElts.setBit(i);
6272 EltBits.push_back(APInt::getNullValue(EltSizeInBits));
6273 } else if (M < (int)NumElts) {
6274 if (UndefElts0[M])
6275 UndefElts.setBit(i);
6276 EltBits.push_back(EltBits0[M]);
6277 } else {
6278 if (UndefElts1[M - NumElts])
6279 UndefElts.setBit(i);
6280 EltBits.push_back(EltBits1[M - NumElts]);
6281 }
6282 }
6283 return true;
6284 }
6285
6286 return false;
6287}
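Most of the work above is done by the CastBitData lambda: the source elements are concatenated into one wide bit string, which is then re-sliced at the requested element width while undef coverage is tracked per target element. A minimal standalone sketch of that repacking, assuming the whole constant fits in 64 bits and ignoring undef tracking (hypothetical helper, plain integers instead of APInt):

#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<uint64_t> recastBits(const std::vector<uint64_t> &SrcElts,
                                        unsigned SrcEltBits,
                                        unsigned DstEltBits) {
  // Pack the source elements, lowest element in the lowest bits.
  uint64_t All = 0;
  for (size_t i = 0; i != SrcElts.size(); ++i)
    All |= SrcElts[i] << (i * SrcEltBits);
  // Re-slice at the destination width (total bits assumed to fit in 64 here,
  // so the shift amounts stay in range).
  unsigned TotalBits = SrcEltBits * SrcElts.size();
  uint64_t DstMask = (1ull << DstEltBits) - 1;
  std::vector<uint64_t> DstElts(TotalBits / DstEltBits);
  for (size_t i = 0; i != DstElts.size(); ++i)
    DstElts[i] = (All >> (i * DstEltBits)) & DstMask;
  return DstElts;
}

int main() {
  // Two 16-bit constants viewed as four 8-bit constants.
  for (uint64_t E : recastBits({0x1234, 0xABCD}, 16, 8))
    std::printf("0x%02llx ", (unsigned long long)E); // 0x34 0x12 0xcd 0xab
  std::printf("\n");
}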
6288
6289namespace llvm {
6290namespace X86 {
6291bool isConstantSplat(SDValue Op, APInt &SplatVal) {
6292 APInt UndefElts;
6293 SmallVector<APInt, 16> EltBits;
6294 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
6295 UndefElts, EltBits, true, false)) {
6296 int SplatIndex = -1;
6297 for (int i = 0, e = EltBits.size(); i != e; ++i) {
6298 if (UndefElts[i])
6299 continue;
6300 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
6301 SplatIndex = -1;
6302 break;
6303 }
6304 SplatIndex = i;
6305 }
6306 if (0 <= SplatIndex) {
6307 SplatVal = EltBits[SplatIndex];
6308 return true;
6309 }
6310 }
6311
6312 return false;
6313}
6314} // namespace X86
6315} // namespace llvm
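X86::isConstantSplat accepts a splat as long as every defined element carries the same bits; whole-undef elements are skipped, and partial undefs are already rejected by passing AllowPartialUndefs=false above. A standalone sketch of that scan over plain arrays (hypothetical helper):

#include <cstdint>
#include <cstdio>
#include <vector>

static bool isSplat(const std::vector<uint64_t> &Elts,
                    const std::vector<bool> &Undef, uint64_t &SplatVal) {
  int SplatIndex = -1;
  for (int i = 0, e = (int)Elts.size(); i != e; ++i) {
    if (Undef[i])
      continue; // undef elements match anything
    if (SplatIndex >= 0 && Elts[i] != Elts[SplatIndex])
      return false;
    SplatIndex = i;
  }
  if (SplatIndex < 0)
    return false; // every element was undef
  SplatVal = Elts[SplatIndex];
  return true;
}

int main() {
  uint64_t S = 0;
  bool Ok = isSplat({7, 7, 7, 7}, {false, true, false, false}, S);
  std::printf("%d %llu\n", Ok, (unsigned long long)S); // 1 7
}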
6316
6317static bool getTargetShuffleMaskIndices(SDValue MaskNode,
6318 unsigned MaskEltSizeInBits,
6319 SmallVectorImpl<uint64_t> &RawMask,
6320 APInt &UndefElts) {
6321 // Extract the raw target constant bits.
6322 SmallVector<APInt, 64> EltBits;
6323 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
6324 EltBits, /* AllowWholeUndefs */ true,
6325 /* AllowPartialUndefs */ false))
6326 return false;
6327
6328 // Insert the extracted elements into the mask.
6329 for (APInt Elt : EltBits)
6330 RawMask.push_back(Elt.getZExtValue());
6331
6332 return true;
6333}
6334
6335/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
6336/// Note: This ignores saturation, so inputs must be checked first.
6337static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6338 bool Unary) {
6339  assert(Mask.empty() && "Expected an empty shuffle mask vector");
6340 unsigned NumElts = VT.getVectorNumElements();
6341 unsigned NumLanes = VT.getSizeInBits() / 128;
6342 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
6343 unsigned Offset = Unary ? 0 : NumElts;
6344
6345 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
6346 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
6347 Mask.push_back(Elt + (Lane * NumEltsPerLane));
6348 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
6349 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
6350 }
6351}
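A standalone sketch of the mask layout produced above (hypothetical helper): within each 128-bit lane the even elements of the first operand are taken, followed by the even elements of the second operand (or of the same operand again when Unary is set).

#include <cstdio>
#include <vector>

// NumElts/EltBits describe the *packed* result type, matching the VT passed
// to createPackShuffleMask above.
static std::vector<int> packMask(unsigned NumElts, unsigned EltBits,
                                 bool Unary) {
  unsigned NumLanes = (NumElts * EltBits) / 128;
  unsigned NumEltsPerLane = 128 / EltBits;
  unsigned Offset = Unary ? 0 : NumElts;
  std::vector<int> Mask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
      Mask.push_back(Elt + Lane * NumEltsPerLane);
    for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
      Mask.push_back(Elt + Lane * NumEltsPerLane + Offset);
  }
  return Mask;
}

int main() {
  // Binary pack of two v8i16 into v16i8: 0 2 4 ... 14 16 18 ... 30.
  for (int M : packMask(16, 8, /*Unary=*/false))
    std::printf("%d ", M);
  std::printf("\n");
}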
6352
6353// Split the demanded elts of a PACKSS/PACKUS node between its operands.
6354static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
6355 APInt &DemandedLHS, APInt &DemandedRHS) {
6356 int NumLanes = VT.getSizeInBits() / 128;
6357 int NumElts = DemandedElts.getBitWidth();
6358 int NumInnerElts = NumElts / 2;
6359 int NumEltsPerLane = NumElts / NumLanes;
6360 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
6361
6362 DemandedLHS = APInt::getNullValue(NumInnerElts);
6363 DemandedRHS = APInt::getNullValue(NumInnerElts);
6364
6365 // Map DemandedElts to the packed operands.
6366 for (int Lane = 0; Lane != NumLanes; ++Lane) {
6367 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
6368 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
6369 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
6370 if (DemandedElts[OuterIdx])
6371 DemandedLHS.setBit(InnerIdx);
6372 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
6373 DemandedRHS.setBit(InnerIdx);
6374 }
6375 }
6376}
6377
6378// Split the demanded elts of a HADD/HSUB node between its operands.
6379static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
6380 APInt &DemandedLHS, APInt &DemandedRHS) {
6381 int NumLanes = VT.getSizeInBits() / 128;
6382 int NumElts = DemandedElts.getBitWidth();
6383 int NumEltsPerLane = NumElts / NumLanes;
6384 int HalfEltsPerLane = NumEltsPerLane / 2;
6385
6386 DemandedLHS = APInt::getNullValue(NumElts);
6387 DemandedRHS = APInt::getNullValue(NumElts);
6388
6389 // Map DemandedElts to the horizontal operands.
6390 for (int Idx = 0; Idx != NumElts; ++Idx) {
6391 if (!DemandedElts[Idx])
6392 continue;
6393 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
6394 int LocalIdx = Idx % NumEltsPerLane;
6395 if (LocalIdx < HalfEltsPerLane) {
6396 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6397 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6398 } else {
6399 LocalIdx -= HalfEltsPerLane;
6400 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6401 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6402 }
6403 }
6404}
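Each result element of a horizontal add/sub consumes a pair of adjacent input elements; the first half of every 128-bit lane reads from the LHS operand and the second half from the RHS. A standalone sketch of that index mapping (hypothetical helper):

#include <cstdio>

// Returns the first of the two adjacent input elements feeding result element
// Idx, and reports whether the pair comes from the RHS operand.
static int horizSource(int Idx, int NumEltsPerLane, bool &FromRHS) {
  int LaneBase = (Idx / NumEltsPerLane) * NumEltsPerLane;
  int LocalIdx = Idx % NumEltsPerLane;
  FromRHS = LocalIdx >= NumEltsPerLane / 2;
  if (FromRHS)
    LocalIdx -= NumEltsPerLane / 2;
  return LaneBase + 2 * LocalIdx; // the pair is {result, result + 1}
}

int main() {
  // HADD on v8i32 (4 elements per 128-bit lane): element 6 reads RHS[4..5].
  bool FromRHS = false;
  int First = horizSource(6, 4, FromRHS);
  std::printf("%s[%d..%d]\n", FromRHS ? "RHS" : "LHS", First, First + 1);
}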
6405
6406/// Calculates the shuffle mask corresponding to the target-specific opcode.
6407/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
6408/// operands in \p Ops, and returns true.
6409/// Sets \p IsUnary to true if only one source is used. Note that this will set
6410/// IsUnary for shuffles which use a single input multiple times, and in those
6411/// cases it will adjust the mask to only have indices within that single input.
6412/// It is an error to call this with non-empty Mask/Ops vectors.
6413static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
6414 SmallVectorImpl<SDValue> &Ops,
6415 SmallVectorImpl<int> &Mask, bool &IsUnary) {
6416 unsigned NumElems = VT.getVectorNumElements();
6417 unsigned MaskEltSize = VT.getScalarSizeInBits();
6418 SmallVector<uint64_t, 32> RawMask;
6419 APInt RawUndefs;
6420 SDValue ImmN;
6421
6422  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
6423  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
6424
6425 IsUnary = false;
6426 bool IsFakeUnary = false;
6427 switch (N->getOpcode()) {
6428 case X86ISD::BLENDI:
6429    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6430    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6431 ImmN = N->getOperand(N->getNumOperands() - 1);
6432 DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
6433 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6434 break;
6435 case X86ISD::SHUFP:
6436    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6437    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6438 ImmN = N->getOperand(N->getNumOperands() - 1);
6439 DecodeSHUFPMask(NumElems, MaskEltSize,
6440 cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
6441 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6442 break;
6443 case X86ISD::INSERTPS:
6444    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6445    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6446 ImmN = N->getOperand(N->getNumOperands() - 1);
6447 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
6448 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6449 break;
6450 case X86ISD::EXTRQI:
6451    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6452 if (isa<ConstantSDNode>(N->getOperand(1)) &&
6453 isa<ConstantSDNode>(N->getOperand(2))) {
6454 int BitLen = N->getConstantOperandVal(1);
6455 int BitIdx = N->getConstantOperandVal(2);
6456 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
6457 IsUnary = true;
6458 }
6459 break;
6460 case X86ISD::INSERTQI:
6461    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6462    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6463 if (isa<ConstantSDNode>(N->getOperand(2)) &&
6464 isa<ConstantSDNode>(N->getOperand(3))) {
6465 int BitLen = N->getConstantOperandVal(2);
6466 int BitIdx = N->getConstantOperandVal(3);
6467 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
6468 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6469 }
6470 break;
6471 case X86ISD::UNPCKH:
6472    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6473    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6474 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
6475 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6476 break;
6477 case X86ISD::UNPCKL:
6478    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6479    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6480 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
6481 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6482 break;
6483 case X86ISD::MOVHLPS:
6484    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6485    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6486 DecodeMOVHLPSMask(NumElems, Mask);
6487 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6488 break;
6489 case X86ISD::MOVLHPS:
6490    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6491    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6492 DecodeMOVLHPSMask(NumElems, Mask);
6493 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6494 break;
6495 case X86ISD::PALIGNR:
6496    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6497    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6498    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6499 ImmN = N->getOperand(N->getNumOperands() - 1);
6500 DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
6501 Mask);
6502 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6503 Ops.push_back(N->getOperand(1));
6504 Ops.push_back(N->getOperand(0));
6505 break;
6506 case X86ISD::VSHLDQ:
6507    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6508    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6509 ImmN = N->getOperand(N->getNumOperands() - 1);
6510 DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
6511 Mask);
6512 IsUnary = true;
6513 break;
6514 case X86ISD::VSRLDQ:
6515    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6516    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6517 ImmN = N->getOperand(N->getNumOperands() - 1);
6518 DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
6519 Mask);
6520 IsUnary = true;
6521 break;
6522 case X86ISD::PSHUFD:
6523 case X86ISD::VPERMILPI:
6524    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6525 ImmN = N->getOperand(N->getNumOperands() - 1);
6526 DecodePSHUFMask(NumElems, MaskEltSize,
6527 cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
6528 IsUnary = true;
6529 break;
6530 case X86ISD::PSHUFHW:
6531    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6532 ImmN = N->getOperand(N->getNumOperands() - 1);
6533 DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
6534 Mask);
6535 IsUnary = true;
6536 break;
6537 case X86ISD::PSHUFLW:
6538    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6539 ImmN = N->getOperand(N->getNumOperands() - 1);
6540 DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
6541 Mask);
6542 IsUnary = true;
6543 break;
6544 case X86ISD::VZEXT_MOVL:
6545    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6546 DecodeZeroMoveLowMask(NumElems, Mask);
6547 IsUnary = true;
6548 break;
6549 case X86ISD::VBROADCAST: {
6550 SDValue N0 = N->getOperand(0);
6551 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
6552 // add the pre-extracted value to the Ops vector.
6553 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6554 N0.getOperand(0).getValueType() == VT &&
6555 N0.getConstantOperandVal(1) == 0)
6556 Ops.push_back(N0.getOperand(0));
6557
6558 // We only decode broadcasts of same-sized vectors, unless the broadcast
6559 // came from an extract from the original width. If we found one, we
6560 // pushed it to the Ops vector above.
6561 if (N0.getValueType() == VT || !Ops.empty()) {
6562 DecodeVectorBroadcast(NumElems, Mask);
6563 IsUnary = true;
6564 break;
6565 }
6566 return false;
6567 }
6568 case X86ISD::VPERMILPV: {
6569    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6570 IsUnary = true;
6571 SDValue MaskNode = N->getOperand(1);
6572 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6573 RawUndefs)) {
6574 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
6575 break;
6576 }
6577 return false;
6578 }
6579 case X86ISD::PSHUFB: {
6580    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6581    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6582    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6583 IsUnary = true;
6584 SDValue MaskNode = N->getOperand(1);
6585 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
6586 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
6587 break;
6588 }
6589 return false;
6590 }
6591 case X86ISD::VPERMI:
6592    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6593 ImmN = N->getOperand(N->getNumOperands() - 1);
6594 DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
6595 IsUnary = true;
6596 break;
6597 case X86ISD::MOVSS:
6598 case X86ISD::MOVSD:
6599    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6600    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6601 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
6602 break;
6603 case X86ISD::VPERM2X128:
6604    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6605    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6606 ImmN = N->getOperand(N->getNumOperands() - 1);
6607 DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
6608 Mask);
6609 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6610 break;
6611 case X86ISD::SHUF128:
6612    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6613    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6614 ImmN = N->getOperand(N->getNumOperands() - 1);
6615 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize,
6616 cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
6617 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6618 break;
6619 case X86ISD::MOVSLDUP:
6620    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6621 DecodeMOVSLDUPMask(NumElems, Mask);
6622 IsUnary = true;
6623 break;
6624 case X86ISD::MOVSHDUP:
6625    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6626 DecodeMOVSHDUPMask(NumElems, Mask);
6627 IsUnary = true;
6628 break;
6629 case X86ISD::MOVDDUP:
6630    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6631 DecodeMOVDDUPMask(NumElems, Mask);
6632 IsUnary = true;
6633 break;
6634 case X86ISD::VPERMIL2: {
6635    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6636    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6637 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6638 SDValue MaskNode = N->getOperand(2);
6639 SDValue CtrlNode = N->getOperand(3);
6640 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
6641 unsigned CtrlImm = CtrlOp->getZExtValue();
6642 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6643 RawUndefs)) {
6644 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
6645 Mask);
6646 break;
6647 }
6648 }
6649 return false;
6650 }
6651 case X86ISD::VPPERM: {
6652    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6653    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6654 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6655 SDValue MaskNode = N->getOperand(2);
6656 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
6657 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
6658 break;
6659 }
6660 return false;
6661 }
6662 case X86ISD::VPERMV: {
6663    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6664 IsUnary = true;
6665 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
6666 Ops.push_back(N->getOperand(1));
6667 SDValue MaskNode = N->getOperand(0);
6668 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6669 RawUndefs)) {
6670 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
6671 break;
6672 }
6673 return false;
6674 }
6675 case X86ISD::VPERMV3: {
6676    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6677    assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
6678 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
6679 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
6680 Ops.push_back(N->getOperand(0));
6681 Ops.push_back(N->getOperand(2));
6682 SDValue MaskNode = N->getOperand(1);
6683 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6684 RawUndefs)) {
6685 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
6686 break;
6687 }
6688 return false;
6689 }
6690  default: llvm_unreachable("unknown target shuffle node");
6691 }
6692
6693 // Empty mask indicates the decode failed.
6694 if (Mask.empty())
6695 return false;
6696
6697 // Check if we're getting a shuffle mask with zero'd elements.
6698 if (!AllowSentinelZero)
6699 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
6700 return false;
6701
6702 // If we have a fake unary shuffle, the shuffle mask is spread across two
6703 // inputs that are actually the same node. Re-map the mask to always point
6704 // into the first input.
6705 if (IsFakeUnary)
6706 for (int &M : Mask)
6707 if (M >= (int)Mask.size())
6708 M -= Mask.size();
6709
6710 // If we didn't already add operands in the opcode-specific code, default to
6711 // adding 1 or 2 operands starting at 0.
6712 if (Ops.empty()) {
6713 Ops.push_back(N->getOperand(0));
6714 if (!IsUnary || IsFakeUnary)
6715 Ops.push_back(N->getOperand(1));
6716 }
6717
6718 return true;
6719}
6720
6721/// Compute whether each element of a shuffle is zeroable.
6722///
6723/// A "zeroable" vector shuffle element is one which can be lowered to zero.
6724/// Either it is an undef element in the shuffle mask, the element of the input
6725/// referenced is undef, or the element of the input referenced is known to be
6726/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
6727/// as many lanes with this technique as possible to simplify the remaining
6728/// shuffle.
6729static void computeZeroableShuffleElements(ArrayRef<int> Mask,
6730 SDValue V1, SDValue V2,
6731 APInt &KnownUndef, APInt &KnownZero) {
6732 int Size = Mask.size();
6733 KnownUndef = KnownZero = APInt::getNullValue(Size);
6734
6735 V1 = peekThroughBitcasts(V1);
6736 V2 = peekThroughBitcasts(V2);
6737
6738 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
6739 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
6740
6741 int VectorSizeInBits = V1.getValueSizeInBits();
6742 int ScalarSizeInBits = VectorSizeInBits / Size;
6743 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
6744
6745 for (int i = 0; i < Size; ++i) {
6746 int M = Mask[i];
6747 // Handle the easy cases.
6748 if (M < 0) {
6749 KnownUndef.setBit(i);
6750 continue;
6751 }
6752 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
6753 KnownZero.setBit(i);
6754 continue;
6755 }
6756
6757 // Determine shuffle input and normalize the mask.
6758 SDValue V = M < Size ? V1 : V2;
6759 M %= Size;
6760
6761 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
6762 if (V.getOpcode() != ISD::BUILD_VECTOR)
6763 continue;
6764
6765 // If the BUILD_VECTOR has fewer elements than the mask, then the bitcasted
6766 // portion of the (larger) source element must be UNDEF/ZERO.
6767 if ((Size % V.getNumOperands()) == 0) {
6768 int Scale = Size / V->getNumOperands();
6769 SDValue Op = V.getOperand(M / Scale);
6770 if (Op.isUndef())
6771 KnownUndef.setBit(i);
6772 if (X86::isZeroNode(Op))
6773 KnownZero.setBit(i);
6774 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
6775 APInt Val = Cst->getAPIntValue();
6776 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
6777 if (Val == 0)
6778 KnownZero.setBit(i);
6779 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
6780 APInt Val = Cst->getValueAPF().bitcastToAPInt();
6781 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
6782 if (Val == 0)
6783 KnownZero.setBit(i);
6784 }
6785 continue;
6786 }
6787
6788 // If the BUILD_VECTOR has more elements than the mask, then all the (smaller)
6789 // source elements must be UNDEF or ZERO.
6790 if ((V.getNumOperands() % Size) == 0) {
6791 int Scale = V->getNumOperands() / Size;
6792 bool AllUndef = true;
6793 bool AllZero = true;
6794 for (int j = 0; j < Scale; ++j) {
6795 SDValue Op = V.getOperand((M * Scale) + j);
6796 AllUndef &= Op.isUndef();
6797 AllZero &= X86::isZeroNode(Op);
6798 }
6799 if (AllUndef)
6800 KnownUndef.setBit(i);
6801 if (AllZero)
6802 KnownZero.setBit(i);
6803 continue;
6804 }
6805 }
6806}
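Editorial sketch (not part of X86ISelLowering.cpp): the core of the zeroable
classification above, restated as a small self-contained C++ helper. Mask
entries below Size read the first input, entries at or above Size read the
second, and negative entries are undef; the V1IsZero/V2IsZero flags and the
name classifyZeroable are assumptions standing in for
ISD::isBuildVectorAllZeros and the APInt bookkeeping.

#include <cstdint>
#include <utility>
#include <vector>

// Returns {KnownUndef, KnownZero} as bitmasks over the shuffle mask elements.
// Assumes Mask.size() <= 64 so a uint64_t bitmask is wide enough.
static std::pair<uint64_t, uint64_t>
classifyZeroable(const std::vector<int> &Mask, bool V1IsZero, bool V2IsZero) {
  uint64_t KnownUndef = 0, KnownZero = 0;
  int Size = static_cast<int>(Mask.size());
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      KnownUndef |= 1ULL << i;                        // undef mask entry
    else if ((M < Size && V1IsZero) || (M >= Size && V2IsZero))
      KnownZero |= 1ULL << i;                         // reads an all-zero input
  }
  return {KnownUndef, KnownZero};
}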
6807
6808/// Decode a target shuffle mask and inputs and see if any values are
6809/// known to be undef or zero from their inputs.
6810/// Returns true if the target shuffle mask was decoded.
6811/// FIXME: Merge this with computeZeroableShuffleElements?
6812static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
6813 SmallVectorImpl<SDValue> &Ops,
6814 APInt &KnownUndef, APInt &KnownZero) {
6815 bool IsUnary;
6816 if (!isTargetShuffle(N.getOpcode()))
6817 return false;
6818
6819 MVT VT = N.getSimpleValueType();
6820 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
6821 return false;
6822
6823 int Size = Mask.size();
6824 SDValue V1 = Ops[0];
6825 SDValue V2 = IsUnary ? V1 : Ops[1];
6826 KnownUndef = KnownZero = APInt::getNullValue(Size);
6827
6828 V1 = peekThroughBitcasts(V1);
6829 V2 = peekThroughBitcasts(V2);
6830
6831 assert((VT.getSizeInBits() % Size) == 0 &&
6832 "Illegal split of shuffle value type");
6833 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
6834
6835 // Extract known constant input data.
6836 APInt UndefSrcElts[2];
6837 SmallVector<APInt, 32> SrcEltBits[2];
6838 bool IsSrcConstant[2] = {
6839 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
6840 SrcEltBits[0], true, false),
6841 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
6842 SrcEltBits[1], true, false)};
6843
6844 for (int i = 0; i < Size; ++i) {
6845 int M = Mask[i];
6846
6847 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
6848 if (M < 0) {
6849 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
6850 if (SM_SentinelUndef == M)
6851 KnownUndef.setBit(i);
6852 if (SM_SentinelZero == M)
6853 KnownZero.setBit(i);
6854 continue;
6855 }
6856
6857 // Determine shuffle input and normalize the mask.
6858 unsigned SrcIdx = M / Size;
6859 SDValue V = M < Size ? V1 : V2;
6860 M %= Size;
6861
6862 // We are referencing an UNDEF input.
6863 if (V.isUndef()) {
6864 KnownUndef.setBit(i);
6865 continue;
6866 }
6867
6868 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6869 // TODO: We currently only set UNDEF for integer types - floats use the same
6870 // registers as vectors and many of the scalar folded loads rely on the
6871 // SCALAR_TO_VECTOR pattern.
6872 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6873 (Size % V.getValueType().getVectorNumElements()) == 0) {
6874 int Scale = Size / V.getValueType().getVectorNumElements();
6875 int Idx = M / Scale;
6876 if (Idx != 0 && !VT.isFloatingPoint())
6877 KnownUndef.setBit(i);
6878 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6879 KnownZero.setBit(i);
6880 continue;
6881 }
6882
6883 // Attempt to extract from the source's constant bits.
6884 if (IsSrcConstant[SrcIdx]) {
6885 if (UndefSrcElts[SrcIdx][M])
6886 KnownUndef.setBit(i);
6887 else if (SrcEltBits[SrcIdx][M] == 0)
6888 KnownZero.setBit(i);
6889 }
6890 }
6891
6892 assert(VT.getVectorNumElements() == (unsigned)Size &&
6893 "Different mask size from vector size!");
6894 return true;
6895}
6896
6897// Replace target shuffle mask elements with known undef/zero sentinels.
6898static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
6899 const APInt &KnownUndef,
6900 const APInt &KnownZero,
6901 bool ResolveKnownZeros = true) {
6902 unsigned NumElts = Mask.size();
6903 assert(KnownUndef.getBitWidth() == NumElts &&
6904 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
6905
6906 for (unsigned i = 0; i != NumElts; ++i) {
6907 if (KnownUndef[i])
6908 Mask[i] = SM_SentinelUndef;
6909 else if (ResolveKnownZeros && KnownZero[i])
6910 Mask[i] = SM_SentinelZero;
6911 }
6912}
6913
6914// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
6915static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
6916 APInt &KnownUndef,
6917 APInt &KnownZero) {
6918 unsigned NumElts = Mask.size();
6919 KnownUndef = KnownZero = APInt::getNullValue(NumElts);
6920
6921 for (unsigned i = 0; i != NumElts; ++i) {
6922 int M = Mask[i];
6923 if (SM_SentinelUndef == M)
6924 KnownUndef.setBit(i);
6925 if (SM_SentinelZero == M)
6926 KnownZero.setBit(i);
6927 }
6928}
6929
6930// Forward declaration (for getFauxShuffleMask recursive check).
6931// TODO: Use DemandedElts variant.
6932static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6933 SmallVectorImpl<int> &Mask,
6934 SelectionDAG &DAG, unsigned Depth,
6935 bool ResolveKnownElts);
6936
6937// Attempt to decode ops that could be represented as a shuffle mask.
6938// The decoded shuffle mask may contain a different number of elements to the
6939// destination value type.
6940static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6941 SmallVectorImpl<int> &Mask,
6942 SmallVectorImpl<SDValue> &Ops,
6943 SelectionDAG &DAG, unsigned Depth,
6944 bool ResolveKnownElts) {
6945 Mask.clear();
6946 Ops.clear();
6947
6948 MVT VT = N.getSimpleValueType();
6949 unsigned NumElts = VT.getVectorNumElements();
6950 unsigned NumSizeInBits = VT.getSizeInBits();
6951 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6952 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6953 return false;
6954 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6955
6956 unsigned Opcode = N.getOpcode();
6957 switch (Opcode) {
6958 case ISD::VECTOR_SHUFFLE: {
6959 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6960 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6961 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6962 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6963 Ops.push_back(N.getOperand(0));
6964 Ops.push_back(N.getOperand(1));
6965 return true;
6966 }
6967 return false;
6968 }
6969 case ISD::AND:
6970 case X86ISD::ANDNP: {
6971 // Attempt to decode as a per-byte mask.
6972 APInt UndefElts;
6973 SmallVector<APInt, 32> EltBits;
6974 SDValue N0 = N.getOperand(0);
6975 SDValue N1 = N.getOperand(1);
6976 bool IsAndN = (X86ISD::ANDNP == Opcode);
6977 uint64_t ZeroMask = IsAndN ? 255 : 0;
6978 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
6979 return false;
6980 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6981 if (UndefElts[i]) {
6982 Mask.push_back(SM_SentinelUndef);
6983 continue;
6984 }
6985 const APInt &ByteBits = EltBits[i];
6986 if (ByteBits != 0 && ByteBits != 255)
6987 return false;
6988 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6989 }
6990 Ops.push_back(IsAndN ? N1 : N0);
6991 return true;
6992 }
6993 case ISD::OR: {
6994 // Inspect each operand at the byte level. We can merge these into a
6995 // blend shuffle mask if for each byte at least one is masked out (zero).
6996 KnownBits Known0 =
6997 DAG.computeKnownBits(N.getOperand(0), DemandedElts, Depth + 1);
6998 KnownBits Known1 =
6999 DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1);
7000 if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
7001 bool IsByteMask = true;
7002 unsigned NumSizeInBytes = NumSizeInBits / 8;
7003 unsigned NumBytesPerElt = NumBitsPerElt / 8;
7004 APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
7005 APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
7006 for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
7007 unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue();
7008 unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue();
7009 if (LHS == 255 && RHS == 0)
7010 SelectMask.setBit(i);
7011 else if (LHS == 255 && RHS == 255)
7012 ZeroMask.setBit(i);
7013 else if (!(LHS == 0 && RHS == 255))
7014 IsByteMask = false;
7015 }
7016 if (IsByteMask) {
7017 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) {
7018 for (unsigned j = 0; j != NumBytesPerElt; ++j) {
7019 unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0);
7020 int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs));
7021 Mask.push_back(Idx);
7022 }
7023 }
7024 Ops.push_back(N.getOperand(0));
7025 Ops.push_back(N.getOperand(1));
7026 return true;
7027 }
7028 }
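// Illustrative note (not in the original source): for a v8i16 OR where each
// element's low byte is known zero in operand 1 and its high byte is known
// zero in operand 0, SelectMask becomes 0b10 per element and the decoded
// v16i8 blend mask is <0,17, 2,19, 4,21, 6,23, 8,25, 10,27, 12,29, 14,31>.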
7029
7030 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
7031 // is a valid shuffle index.
7032 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
7033 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
7034 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
7035 return false;
7036 SmallVector<int, 64> SrcMask0, SrcMask1;
7037 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
7038 if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
7039 true) ||
7040 !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
7041 true))
7042 return false;
7043 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
7044 SmallVector<int, 64> Mask0, Mask1;
7045 scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
7046 scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
7047 for (size_t i = 0; i != MaskSize; ++i) {
7048 if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
7049 Mask.push_back(SM_SentinelUndef);
7050 else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
7051 Mask.push_back(SM_SentinelZero);
7052 else if (Mask1[i] == SM_SentinelZero)
7053 Mask.push_back(Mask0[i]);
7054 else if (Mask0[i] == SM_SentinelZero)
7055 Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size()));
7056 else
7057 return false;
7058 }
7059 Ops.append(SrcInputs0.begin(), SrcInputs0.end());
7060 Ops.append(SrcInputs1.begin(), SrcInputs1.end());
7061 return true;
7062 }
7063 case ISD::INSERT_SUBVECTOR: {
7064 SDValue Src = N.getOperand(0);
7065 SDValue Sub = N.getOperand(1);
7066 EVT SubVT = Sub.getValueType();
7067 unsigned NumSubElts = SubVT.getVectorNumElements();
7068 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
7069 !N->isOnlyUserOf(Sub.getNode()))
7070 return false;
7071 uint64_t InsertIdx = N.getConstantOperandVal(2);
7072 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
7073 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7074 Sub.getOperand(0).getValueType() == VT &&
7075 isa<ConstantSDNode>(Sub.getOperand(1))) {
7076 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
7077 for (int i = 0; i != (int)NumElts; ++i)
7078 Mask.push_back(i);
7079 for (int i = 0; i != (int)NumSubElts; ++i)
7080 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
7081 Ops.push_back(Src);
7082 Ops.push_back(Sub.getOperand(0));
7083 return true;
7084 }
7085 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
7086 SmallVector<int, 64> SubMask;
7087 SmallVector<SDValue, 2> SubInputs;
7088 if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
7089 SubMask, DAG, Depth + 1, ResolveKnownElts))
7090 return false;
7091 if (SubMask.size() != NumSubElts) {
7092 assert(((SubMask.size() % NumSubElts) == 0 ||
7093 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
7094 if ((NumSubElts % SubMask.size()) == 0) {
7095 int Scale = NumSubElts / SubMask.size();
7096 SmallVector<int,64> ScaledSubMask;
7097 scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask);
7098 SubMask = ScaledSubMask;
7099 } else {
7100 int Scale = SubMask.size() / NumSubElts;
7101 NumSubElts = SubMask.size();
7102 NumElts *= Scale;
7103 InsertIdx *= Scale;
7104 }
7105 }
7106 Ops.push_back(Src);
7107 for (SDValue &SubInput : SubInputs) {
7108 EVT SubSVT = SubInput.getValueType().getScalarType();
7109 EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT,
7110 NumSizeInBits / SubSVT.getSizeInBits());
7111 Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT,
7112 DAG.getUNDEF(AltVT), SubInput,
7113 DAG.getIntPtrConstant(0, SDLoc(N))));
7114 }
7115 for (int i = 0; i != (int)NumElts; ++i)
7116 Mask.push_back(i);
7117 for (int i = 0; i != (int)NumSubElts; ++i) {
7118 int M = SubMask[i];
7119 if (0 <= M) {
7120 int InputIdx = M / NumSubElts;
7121 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
7122 }
7123 Mask[i + InsertIdx] = M;
7124 }
7125 return true;
7126 }
7127 case ISD::SCALAR_TO_VECTOR: {
7128 // Match against a scalar_to_vector of an extract from a vector,
7129 // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
7130 SDValue N0 = N.getOperand(0);
7131 SDValue SrcExtract;
7132
7133 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7134 N0.getOperand(0).getValueType() == VT) ||
7135 (N0.getOpcode() == X86ISD::PEXTRW &&
7136 N0.getOperand(0).getValueType() == MVT::v8i16) ||
7137 (N0.getOpcode() == X86ISD::PEXTRB &&
7138 N0.getOperand(0).getValueType() == MVT::v16i8)) {
7139 SrcExtract = N0;
7140 }
7141
7142 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
7143 return false;
7144
7145 SDValue SrcVec = SrcExtract.getOperand(0);
7146 EVT SrcVT = SrcVec.getValueType();
7147 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7148 unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
7149
7150 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
7151 if (NumSrcElts <= SrcIdx)
7152 return false;
7153
7154 Ops.push_back(SrcVec);
7155 Mask.push_back(SrcIdx);
7156 Mask.append(NumZeros, SM_SentinelZero);
7157 Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
7158 return true;
7159 }
7160 case X86ISD::PINSRB:
7161 case X86ISD::PINSRW: {
7162 SDValue InVec = N.getOperand(0);
7163 SDValue InScl = N.getOperand(1);
7164 SDValue InIndex = N.getOperand(2);
7165 if (!isa<ConstantSDNode>(InIndex) ||
7166 cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
7167 return false;
7168 uint64_t InIdx = N.getConstantOperandVal(2);
7169
7170 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
7171 if (X86::isZeroNode(InScl)) {
7172 Ops.push_back(InVec);
7173 for (unsigned i = 0; i != NumElts; ++i)
7174 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
7175 return true;
7176 }
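// Illustrative note (not in the original source): PINSRW(V, 0, 3) on v8i16
// decodes here to the single-input mask <0, 1, 2, Z, 4, 5, 6, 7>, where Z is
// SM_SentinelZero.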
7177
7178 // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
7179 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
7180 unsigned ExOp =
7181 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
7182 if (InScl.getOpcode() != ExOp)
7183 return false;
7184
7185 SDValue ExVec = InScl.getOperand(0);
7186 SDValue ExIndex = InScl.getOperand(1);
7187 if (!isa<ConstantSDNode>(ExIndex) ||
7188 cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
7189 return false;
7190 uint64_t ExIdx = InScl.getConstantOperandVal(1);
7191
7192 Ops.push_back(InVec);
7193 Ops.push_back(ExVec);
7194 for (unsigned i = 0; i != NumElts; ++i)
7195 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
7196 return true;
7197 }
7198 case X86ISD::PACKSS:
7199 case X86ISD::PACKUS: {
7200 SDValue N0 = N.getOperand(0);
7201 SDValue N1 = N.getOperand(1);
7202 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
7203 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
7204 "Unexpected input value type");
7205
7206 APInt EltsLHS, EltsRHS;
7207 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
7208
7209 // If we know input saturation won't happen we can treat this
7210 // as a truncation shuffle.
7211 if (Opcode == X86ISD::PACKSS) {
7212 if ((!N0.isUndef() &&
7213 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
7214 (!N1.isUndef() &&
7215 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
7216 return false;
7217 } else {
7218 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
7219 if ((!N0.isUndef() &&
7220 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
7221 (!N1.isUndef() &&
7222 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
7223 return false;
7224 }
7225
7226 bool IsUnary = (N0 == N1);
7227
7228 Ops.push_back(N0);
7229 if (!IsUnary)
7230 Ops.push_back(N1);
7231
7232 createPackShuffleMask(VT, Mask, IsUnary);
7233 return true;
7234 }
7235 case X86ISD::VSHLI:
7236 case X86ISD::VSRLI: {
7237 uint64_t ShiftVal = N.getConstantOperandVal(1);
7238 // Out of range bit shifts are guaranteed to be zero.
7239 if (NumBitsPerElt <= ShiftVal) {
7240 Mask.append(NumElts, SM_SentinelZero);
7241 return true;
7242 }
7243
7244 // We can only decode 'whole byte' bit shifts as shuffles.
7245 if ((ShiftVal % 8) != 0)
7246 break;
7247
7248 uint64_t ByteShift = ShiftVal / 8;
7249 unsigned NumBytes = NumSizeInBits / 8;
7250 unsigned NumBytesPerElt = NumBitsPerElt / 8;
7251 Ops.push_back(N.getOperand(0));
7252
7253 // Clear mask to all zeros and insert the shifted byte indices.
7254 Mask.append(NumBytes, SM_SentinelZero);
7255
7256 if (X86ISD::VSHLI == Opcode) {
7257 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
7258 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7259 Mask[i + j] = i + j - ByteShift;
7260 } else {
7261 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
7262 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7263 Mask[i + j - ByteShift] = i + j;
7264 }
7265 return true;
7266 }
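// Illustrative note (not in the original source): a v2i64 VSRLI by 16 bits has
// ByteShift = 2 and NumBytesPerElt = 8, giving the byte shuffle mask
// <2,3,4,5,6,7,Z,Z, 10,11,12,13,14,15,Z,Z> with Z = SM_SentinelZero.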
7267 case X86ISD::VBROADCAST: {
7268 SDValue Src = N.getOperand(0);
7269 MVT SrcVT = Src.getSimpleValueType();
7270 if (!SrcVT.isVector())
7271 return false;
7272
7273 if (NumSizeInBits != SrcVT.getSizeInBits()) {
7274 assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
7275 "Illegal broadcast type");
7276 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
7277 NumSizeInBits / SrcVT.getScalarSizeInBits());
7278 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
7279 DAG.getUNDEF(SrcVT), Src,
7280 DAG.getIntPtrConstant(0, SDLoc(N)));
7281 }
7282
7283 Ops.push_back(Src);
7284 Mask.append(NumElts, 0);
7285 return true;
7286 }
7287 case ISD::ZERO_EXTEND:
7288 case ISD::ANY_EXTEND:
7289 case ISD::ZERO_EXTEND_VECTOR_INREG:
7290 case ISD::ANY_EXTEND_VECTOR_INREG: {
7291 SDValue Src = N.getOperand(0);
7292 EVT SrcVT = Src.getValueType();
7293
7294 // Extended source must be a simple vector.
7295 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7296 (SrcVT.getScalarSizeInBits() % 8) != 0)
7297 return false;
7298
7299 unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();
7300 bool IsAnyExtend =
7301 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
7302 DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend,
7303 Mask);
7304
7305 if (NumSizeInBits != SrcVT.getSizeInBits()) {
7306 assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
7307 "Illegal zero-extension type");
7308 SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(),
7309 NumSizeInBits / NumSrcBitsPerElt);
7310 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
7311 DAG.getUNDEF(SrcVT), Src,
7312 DAG.getIntPtrConstant(0, SDLoc(N)));
7313 }
7314
7315 Ops.push_back(Src);
7316 return true;
7317 }
7318 }
7319
7320 return false;
7321}
7322
7323/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
7324static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
7325 SmallVectorImpl<int> &Mask) {
7326 int MaskWidth = Mask.size();
7327 SmallVector<SDValue, 16> UsedInputs;
7328 for (int i = 0, e = Inputs.size(); i < e; ++i) {
7329 int lo = UsedInputs.size() * MaskWidth;
7330 int hi = lo + MaskWidth;
7331
7332 // Strip UNDEF input usage.
7333 if (Inputs[i].isUndef())
7334 for (int &M : Mask)
7335 if ((lo <= M) && (M < hi))
7336 M = SM_SentinelUndef;
7337
7338 // Check for unused inputs.
7339 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
7340 for (int &M : Mask)
7341 if (lo <= M)
7342 M -= MaskWidth;
7343 continue;
7344 }
7345
7346 // Check for repeated inputs.
7347 bool IsRepeat = false;
7348 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
7349 if (UsedInputs[j] != Inputs[i])
7350 continue;
7351 for (int &M : Mask)
7352 if (lo <= M)
7353 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
7354 IsRepeat = true;
7355 break;
7356 }
7357 if (IsRepeat)
7358 continue;
7359
7360 UsedInputs.push_back(Inputs[i]);
7361 }
7362 Inputs = UsedInputs;
7363}
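// Illustrative note (not in the original source): with Inputs = {A, A} and a
// width-4 Mask = <0, 5, 2, 7>, the repeated input is dropped and the mask is
// remapped to <0, 1, 2, 3> over the single remaining input A.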
7364
7365/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
7366/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
7367/// Returns true if the target shuffle mask was decoded.
7368static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
7369 SmallVectorImpl<SDValue> &Inputs,
7370 SmallVectorImpl<int> &Mask,
7371 APInt &KnownUndef, APInt &KnownZero,
7372 SelectionDAG &DAG, unsigned Depth,
7373 bool ResolveKnownElts) {
7374 EVT VT = Op.getValueType();
7375 if (!VT.isSimple() || !VT.isVector())
18: Calling 'EVT::isSimple'
20: Returning from 'EVT::isSimple'
21: Calling 'EVT::isVector'
27: Returning from 'EVT::isVector'
28: Taking false branch
7376 return false;
7377
7378 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
29: Assuming the condition is true
30: Taking true branch
7379 if (ResolveKnownElts)
30.1: 'ResolveKnownElts' is false
31: Taking false branch
7380 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
7381 return true;
32: Returning the value 1, which participates in a condition later
7382 }
7383 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
7384 ResolveKnownElts)) {
7385 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
7386 return true;
7387 }
7388 return false;
7389}
7390
7391static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7392 SmallVectorImpl<int> &Mask,
7393 SelectionDAG &DAG, unsigned Depth = 0,
7394 bool ResolveKnownElts = true) {
7395 EVT VT = Op.getValueType();
7396 if (!VT.isSimple() || !VT.isVector())
7397 return false;
7398
7399 APInt KnownUndef, KnownZero;
7400 unsigned NumElts = Op.getValueType().getVectorNumElements();
7401 APInt DemandedElts = APInt::getAllOnesValue(NumElts);
7402 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
7403 KnownZero, DAG, Depth, ResolveKnownElts);
7404}
7405
7406/// Returns the scalar element that will make up the ith
7407/// element of the result of the vector shuffle.
7408static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
7409 unsigned Depth) {
7410 if (Depth == 6)
7411 return SDValue(); // Limit search depth.
7412
7413 SDValue V = SDValue(N, 0);
7414 EVT VT = V.getValueType();
7415 unsigned Opcode = V.getOpcode();
7416
7417 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
7418 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
7419 int Elt = SV->getMaskElt(Index);
7420
7421 if (Elt < 0)
7422 return DAG.getUNDEF(VT.getVectorElementType());
7423
7424 unsigned NumElems = VT.getVectorNumElements();
7425 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
7426 : SV->getOperand(1);
7427 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
7428 }
7429
7430 // Recurse into target specific vector shuffles to find scalars.
7431 if (isTargetShuffle(Opcode)) {
7432 MVT ShufVT = V.getSimpleValueType();
7433 MVT ShufSVT = ShufVT.getVectorElementType();
7434 int NumElems = (int)ShufVT.getVectorNumElements();
7435 SmallVector<int, 16> ShuffleMask;
7436 SmallVector<SDValue, 16> ShuffleOps;
7437 bool IsUnary;
7438
7439 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
7440 return SDValue();
7441
7442 int Elt = ShuffleMask[Index];
7443 if (Elt == SM_SentinelZero)
7444 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
7445 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
7446 if (Elt == SM_SentinelUndef)
7447 return DAG.getUNDEF(ShufSVT);
7448
7449 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
7450 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
7451 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
7452 Depth+1);
7453 }
7454
7455 // Recurse into insert_subvector base/sub vector to find scalars.
7456 if (Opcode == ISD::INSERT_SUBVECTOR &&
7457 isa<ConstantSDNode>(N->getOperand(2))) {
7458 SDValue Vec = N->getOperand(0);
7459 SDValue Sub = N->getOperand(1);
7460 EVT SubVT = Sub.getValueType();
7461 unsigned NumSubElts = SubVT.getVectorNumElements();
7462 uint64_t SubIdx = N->getConstantOperandVal(2);
7463
7464 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
7465 return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1);
7466 return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1);
7467 }
7468
7469 // Recurse into extract_subvector src vector to find scalars.
7470 if (Opcode == ISD::EXTRACT_SUBVECTOR &&
7471 isa<ConstantSDNode>(N->getOperand(1))) {
7472 SDValue Src = N->getOperand(0);
7473 uint64_t SrcIdx = N->getConstantOperandVal(1);
7474 return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1);
7475 }
7476
7477 // Actual nodes that may contain scalar elements
7478 if (Opcode == ISD::BITCAST) {
7479 V = V.getOperand(0);
7480 EVT SrcVT = V.getValueType();
7481 unsigned NumElems = VT.getVectorNumElements();
7482
7483 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
7484 return SDValue();
7485 }
7486
7487 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
7488 return (Index == 0) ? V.getOperand(0)
7489 : DAG.getUNDEF(VT.getVectorElementType());
7490
7491 if (V.getOpcode() == ISD::BUILD_VECTOR)
7492 return V.getOperand(Index);
7493
7494 return SDValue();
7495}
7496
7497// Use PINSRB/PINSRW/PINSRD to create a build vector.
7498static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
7499 unsigned NumNonZero, unsigned NumZero,
7500 SelectionDAG &DAG,
7501 const X86Subtarget &Subtarget) {
7502 MVT VT = Op.getSimpleValueType();
7503 unsigned NumElts = VT.getVectorNumElements();
7504 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
7505 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
7506 "Illegal vector insertion");
7507
7508 SDLoc dl(Op);
7509 SDValue V;
7510 bool First = true;
7511
7512 for (unsigned i = 0; i < NumElts; ++i) {
7513 bool IsNonZero = (NonZeros & (1 << i)) != 0;
7514 if (!IsNonZero)
7515 continue;
7516
7517 // If the build vector contains zeros or our first insertion is not the
7518 // first index then insert into zero vector to break any register
7519 // dependency else use SCALAR_TO_VECTOR.
7520 if (First) {
7521 First = false;
7522 if (NumZero || 0 != i)
7523 V = getZeroVector(VT, Subtarget, DAG, dl);
7524 else {
7525 assert(0 == i && "Expected insertion into zero-index");
7526 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
7527 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
7528 V = DAG.getBitcast(VT, V);
7529 continue;
7530 }
7531 }
7532 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
7533 DAG.getIntPtrConstant(i, dl));
7534 }
7535
7536 return V;
7537}
7538
7539/// Custom lower build_vector of v16i8.
7540static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
7541 unsigned NumNonZero, unsigned NumZero,
7542 SelectionDAG &DAG,
7543 const X86Subtarget &Subtarget) {
7544 if (NumNonZero > 8 && !Subtarget.hasSSE41())
7545 return SDValue();
7546
7547 // SSE4.1 - use PINSRB to insert each byte directly.
7548 if (Subtarget.hasSSE41())
7549 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
7550 Subtarget);
7551
7552 SDLoc dl(Op);
7553 SDValue V;
7554
7555 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
7556 for (unsigned i = 0; i < 16; i += 2) {
7557 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
7558 bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0;
7559 if (!ThisIsNonZero && !NextIsNonZero)
7560 continue;
7561
7562 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
7563 SDValue Elt;
7564 if (ThisIsNonZero) {
7565 if (NumZero || NextIsNonZero)
7566 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
7567 else
7568 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
7569 }
7570
7571 if (NextIsNonZero) {
7572 SDValue NextElt = Op.getOperand(i + 1);
7573 if (i == 0 && NumZero)
7574 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
7575 else
7576 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
7577 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
7578 DAG.getConstant(8, dl, MVT::i8));
7579 if (ThisIsNonZero)
7580 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
7581 else
7582 Elt = NextElt;
7583 }
7584
7585 // If our first insertion is not the first index then insert into zero
7586 // vector to break any register dependency else use SCALAR_TO_VECTOR.
7587 if (!V) {
7588 if (i != 0)
7589 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
7590 else {
7591 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
7592 V = DAG.getBitcast(MVT::v8i16, V);
7593 continue;
7594 }
7595 }
7596 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
7597 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
7598 DAG.getIntPtrConstant(i / 2, dl));
7599 }
7600
7601 return DAG.getBitcast(MVT::v16i8, V);
7602}
7603
7604/// Custom lower build_vector of v8i16.
7605static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
7606 unsigned NumNonZero, unsigned NumZero,
7607 SelectionDAG &DAG,
7608 const X86Subtarget &Subtarget) {
7609 if (NumNonZero > 4 && !Subtarget.hasSSE41())
7610 return SDValue();
7611
7612 // Use PINSRW to insert each byte directly.
7613 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
7614 Subtarget);
7615}
7616
7617/// Custom lower build_vector of v4i32 or v4f32.
7618static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
7619 const X86Subtarget &Subtarget) {
7620 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
7621 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
7622 // Because we're creating a less complicated build vector here, we may enable
7623 // further folding of the MOVDDUP via shuffle transforms.
7624 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
7625 Op.getOperand(0) == Op.getOperand(2) &&
7626 Op.getOperand(1) == Op.getOperand(3) &&
7627 Op.getOperand(0) != Op.getOperand(1)) {
7628 SDLoc DL(Op);
7629 MVT VT = Op.getSimpleValueType();
7630 MVT EltVT = VT.getVectorElementType();
7631 // Create a new build vector with the first 2 elements followed by undef
7632 // padding, bitcast to v2f64, duplicate, and bitcast back.
7633 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7634 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7635 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7636 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7637 return DAG.getBitcast(VT, Dup);
7638 }
7639
7640 // Find all zeroable elements.
7641 std::bitset<4> Zeroable, Undefs;
7642 for (int i = 0; i < 4; ++i) {
7643 SDValue Elt = Op.getOperand(i);
7644 Undefs[i] = Elt.isUndef();
7645 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7646 }
7647 assert(Zeroable.size() - Zeroable.count() > 1 &&
7648 "We expect at least two non-zero elements!");
7649
7650 // We only know how to deal with build_vector nodes where elements are either
7651 // zeroable or extract_vector_elt with constant index.
7652 SDValue FirstNonZero;
7653 unsigned FirstNonZeroIdx;
7654 for (unsigned i = 0; i < 4; ++i) {
7655 if (Zeroable[i])
7656 continue;
7657 SDValue Elt = Op.getOperand(i);
7658 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7659 !isa<ConstantSDNode>(Elt.getOperand(1)))
7660 return SDValue();
7661 // Make sure that this node is extracting from a 128-bit vector.
7662 MVT VT = Elt.getOperand(0).getSimpleValueType();
7663 if (!VT.is128BitVector())
7664 return SDValue();
7665 if (!FirstNonZero.getNode()) {
7666 FirstNonZero = Elt;
7667 FirstNonZeroIdx = i;
7668 }
7669 }
7670
7671 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7672 SDValue V1 = FirstNonZero.getOperand(0);
7673 MVT VT = V1.getSimpleValueType();
7674
7675 // See if this build_vector can be lowered as a blend with zero.
7676 SDValue Elt;
7677 unsigned EltMaskIdx, EltIdx;
7678 int Mask[4];
7679 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7680 if (Zeroable[EltIdx]) {
7681 // The zero vector will be on the right hand side.
7682 Mask[EltIdx] = EltIdx+4;
7683 continue;
7684 }
7685
7686 Elt = Op->getOperand(EltIdx);
7687 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
7688 EltMaskIdx = Elt.getConstantOperandVal(1);
7689 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7690 break;
7691 Mask[EltIdx] = EltIdx;
7692 }
7693
7694 if (EltIdx == 4) {
7695 // Let the shuffle legalizer deal with blend operations.
7696 SDValue VZeroOrUndef = (Zeroable == Undefs)
7697 ? DAG.getUNDEF(VT)
7698 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
7699 if (V1.getSimpleValueType() != VT)
7700 V1 = DAG.getBitcast(VT, V1);
7701 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7702 }
7703
7704 // See if we can lower this build_vector to a INSERTPS.
7705 if (!Subtarget.hasSSE41())
7706 return SDValue();
7707
7708 SDValue V2 = Elt.getOperand(0);
7709 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7710 V1 = SDValue();
7711
7712 bool CanFold = true;
7713 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7714 if (Zeroable[i])
7715 continue;
7716
7717 SDValue Current = Op->getOperand(i);
7718 SDValue SrcVector = Current->getOperand(0);
7719 if (!V1.getNode())
7720 V1 = SrcVector;
7721 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7722 }
7723
7724 if (!CanFold)
7725 return SDValue();
7726
7727 assert(V1.getNode() && "Expected at least two non-zero elements!");
7728 if (V1.getSimpleValueType() != MVT::v4f32)
7729 V1 = DAG.getBitcast(MVT::v4f32, V1);
7730 if (V2.getSimpleValueType() != MVT::v4f32)
7731 V2 = DAG.getBitcast(MVT::v4f32, V2);
7732
7733 // Ok, we can emit an INSERTPS instruction.
7734 unsigned ZMask = Zeroable.to_ulong();
7735
7736 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
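// Editorial comment (not in the original source): the INSERTPS immediate packs
// the source element index into bits [7:6], the destination element index into
// bits [5:4], and the zero mask into bits [3:0], matching the shifts above.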
7737 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7738 SDLoc DL(Op);
7739 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7740 DAG.getIntPtrConstant(InsertPSMask, DL, true));
7741 return DAG.getBitcast(VT, Result);
7742}
7743
7744/// Return a vector logical shift node.
7745static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7746 SelectionDAG &DAG, const TargetLowering &TLI,
7747 const SDLoc &dl) {
7748 assert(VT.is128BitVector() && "Unknown type for VShift");
7749 MVT ShVT = MVT::v16i8;
7750 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7751 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7752 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7753 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7754 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7755}
7756
7757static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
7758 SelectionDAG &DAG) {
7759
7760 // Check if the scalar load can be widened into a vector load. And if
7761 // the address is "base + cst" see if the cst can be "absorbed" into
7762 // the shuffle mask.
7763 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7764 SDValue Ptr = LD->getBasePtr();
7765 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7766 return SDValue();
7767 EVT PVT = LD->getValueType(0);
7768 if (PVT != MVT::i32 && PVT != MVT::f32)
7769 return SDValue();
7770
7771 int FI = -1;
7772 int64_t Offset = 0;
7773 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7774 FI = FINode->getIndex();
7775 Offset = 0;
7776 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7777 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7778 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7779 Offset = Ptr.getConstantOperandVal(1);
7780 Ptr = Ptr.getOperand(0);
7781 } else {
7782 return SDValue();
7783 }
7784
7785 // FIXME: 256-bit vector instructions don't require a strict alignment,
7786 // improve this code to support it better.
7787 unsigned RequiredAlign = VT.getSizeInBits()/8;
7788 SDValue Chain = LD->getChain();
7789 // Make sure the stack object alignment is at least 16 or 32.
7790 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7791 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
7792 if (MFI.isFixedObjectIndex(FI)) {
7793 // Can't change the alignment. FIXME: It's possible to compute
7794 // the exact stack offset and reference FI + adjust offset instead,
7795 // if someone *really* cares about this; that's the way to implement it.
7796 return SDValue();
7797 } else {
7798 MFI.setObjectAlignment(FI, RequiredAlign);
7799 }
7800 }
7801
7802 // (Offset % 16 or 32) must be a multiple of 4. The address is then
7803 // Ptr + (Offset & ~15).
7804 if (Offset < 0)
7805 return SDValue();
7806 if ((Offset % RequiredAlign) & 3)
7807 return SDValue();
7808 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
7809 if (StartOffset) {
7810 SDLoc DL(Ptr);
7811 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7812 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7813 }
7814
7815 int EltNo = (Offset - StartOffset) >> 2;
7816 unsigned NumElems = VT.getVectorNumElements();
7817
7818 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7819 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7820 LD->getPointerInfo().getWithOffset(StartOffset));
7821
7822 SmallVector<int, 8> Mask(NumElems, EltNo);
7823
7824 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7825 }
7826
7827 return SDValue();
7828}
7829
7830 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
7831static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7832 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7833 auto *BaseLd = cast<LoadSDNode>(Elt);
7834 if (!BaseLd->isSimple())
7835 return false;
7836 Ld = BaseLd;
7837 ByteOffset = 0;
7838 return true;
7839 }
7840
7841 switch (Elt.getOpcode()) {
7842 case ISD::BITCAST:
7843 case ISD::TRUNCATE:
7844 case ISD::SCALAR_TO_VECTOR:
7845 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7846 case ISD::SRL:
7847 if (isa<ConstantSDNode>(Elt.getOperand(1))) {
7848 uint64_t Idx = Elt.getConstantOperandVal(1);
7849 if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7850 ByteOffset += Idx / 8;
7851 return true;
7852 }
7853 }
7854 break;
7855 case ISD::EXTRACT_VECTOR_ELT:
7856 if (isa<ConstantSDNode>(Elt.getOperand(1))) {
7857 SDValue Src = Elt.getOperand(0);
7858 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7859 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7860 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7861 findEltLoadSrc(Src, Ld, ByteOffset)) {
7862 uint64_t Idx = Elt.getConstantOperandVal(1);
7863 ByteOffset += Idx * (SrcSizeInBits / 8);
7864 return true;
7865 }
7866 }
7867 break;
7868 }
7869
7870 return false;
7871}
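// Illustrative note (not in the original source): for Elt = (srl (load i64 *p), 32),
// a byte-aligned shift, the recursion above returns the i64 load with an
// accumulated ByteOffset of 4.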
7872
7873/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7874/// elements can be replaced by a single large load which has the same value as
7875/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7876///
7877/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7878static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7879 const SDLoc &DL, SelectionDAG &DAG,
7880 const X86Subtarget &Subtarget,
7881 bool isAfterLegalize) {
7882 if ((VT.getScalarSizeInBits() % 8) != 0)
7883 return SDValue();
7884
7885 unsigned NumElems = Elts.size();
7886
7887 int LastLoadedElt = -1;
7888 APInt LoadMask = APInt::getNullValue(NumElems);
7889 APInt ZeroMask = APInt::getNullValue(NumElems);
7890 APInt UndefMask = APInt::getNullValue(NumElems);
7891
7892 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7893 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7894
7895 // For each element in the initializer, see if we've found a load, zero or an
7896 // undef.
7897 for (unsigned i = 0; i < NumElems; ++i) {
7898 SDValue Elt = peekThroughBitcasts(Elts[i]);
7899 if (!Elt.getNode())
7900 return SDValue();
7901 if (Elt.isUndef()) {
7902 UndefMask.setBit(i);
7903 continue;
7904 }
7905 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
7906 ZeroMask.setBit(i);
7907 continue;
7908 }
7909
7910 // Each loaded element must be the correct fractional portion of the
7911 // requested vector load.
7912 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7913 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7914 return SDValue();
7915
7916 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7917 return SDValue();
7918 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7919 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7920 return SDValue();
7921
7922 LoadMask.setBit(i);
7923 LastLoadedElt = i;
7924 }
7925 assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
7926 LoadMask.countPopulation()) == NumElems &&
7927 "Incomplete element masks");
7928
7929 // Handle Special Cases - all undef or undef/zero.
7930 if (UndefMask.countPopulation() == NumElems)
7931 return DAG.getUNDEF(VT);
7932
7933 // FIXME: Should we return this as a BUILD_VECTOR instead?
7934 if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
7935 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7936 : DAG.getConstantFP(0.0, DL, VT);
7937
7938 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7939 int FirstLoadedElt = LoadMask.countTrailingZeros();
7940 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7941 EVT EltBaseVT = EltBase.getValueType();
7942 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7943 "Register/Memory size mismatch");
7944 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7945 assert(LDBase && "Did not find base load for merging consecutive loads");
7946 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7947 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7948 int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
7949 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7950
7951 // TODO: Support offsetting the base load.
7952 if (ByteOffsets[FirstLoadedElt] != 0)
7953 return SDValue();
7954
7955 // Check to see if the element's load is consecutive to the base load
7956 // or offset from a previous (already checked) load.
7957 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7958 LoadSDNode *Ld = Loads[EltIdx];
7959 int64_t ByteOffset = ByteOffsets[EltIdx];
7960 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7961 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7962 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7963 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7964 }
7965 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7966 EltIdx - FirstLoadedElt);
7967 };
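// A minimal standalone sketch (plain integers, assumed example values) of the
// offset-to-base-index mapping that the CheckConsecutiveLoad lambda above
// relies on: an element reading ByteOffset bytes into some load must map back
// to an earlier element that holds the same load at offset zero.
#include <cassert>
#include <cstdint>

namespace consecutive_load_sketch {

int64_t baseIndexForOffset(int64_t EltIdx, int64_t ByteOffset,
                           int64_t BaseSizeInBytes) {
  // Only element-aligned offsets can be redirected to an earlier base element.
  assert(ByteOffset % BaseSizeInBytes == 0);
  return EltIdx - (ByteOffset / BaseSizeInBytes);
}

void demo() {
  // With 32-bit elements (BaseSizeInBytes = 4), element 3 reading 8 bytes into
  // a load is satisfied if element 1 carries that same load at offset 0.
  assert(baseIndexForOffset(3, 8, 4) == 1);
}

} // namespace consecutive_load_sketch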
7968
7969 // Consecutive loads can contain UNDEFs but not ZERO elements.
7970 // Consecutive loads with UNDEF and ZERO elements require an
7971 // additional shuffle stage to clear the ZERO elements.
7972 bool IsConsecutiveLoad = true;
7973 bool IsConsecutiveLoadWithZeros = true;
7974 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7975 if (LoadMask[i]) {
7976 if (!CheckConsecutiveLoad(LDBase, i)) {
7977 IsConsecutiveLoad = false;
7978 IsConsecutiveLoadWithZeros = false;
7979 break;
7980 }
7981 } else if (ZeroMask[i]) {
7982 IsConsecutiveLoad = false;
7983 }
7984 }
7985
7986 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7987 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7988 assert(LDBase->isSimple() &&
7989 "Cannot merge volatile or atomic loads.");
7990 SDValue NewLd =
7991 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7992 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
7993 for (auto *LD : Loads)
7994 if (LD)
7995 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7996 return NewLd;
7997 };
7998
7999 // Check if the base load is entirely dereferenceable.
8000 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
8001 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
8002
8003 // LOAD - all consecutive load/undefs (must start/end with a load or be
8004 // entirely dereferenceable). If we have found an entire vector of loads and
8005 // undefs, then return a large load of the entire vector width starting at the
8006 // base pointer. If the vector contains zeros, then attempt to shuffle those
8007 // elements.
8008 if (FirstLoadedElt == 0 &&
8009 (LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) &&
8010 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
8011 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
8012 return SDValue();
8013
8014 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
8015 // will lower to regular temporal loads and use the cache.
8016 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
8017 VT.is256BitVector() && !Subtarget.hasInt256())
8018 return SDValue();
8019
8020 if (NumElems == 1)
8021 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
8022
8023 if (!ZeroMask)
8024 return CreateLoad(VT, LDBase);
8025
8026 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
8027 // vector and a zero vector to clear out the zero elements.
8028 if (!isAfterLegalize && VT.isVector()) {
8029 unsigned NumMaskElts = VT.getVectorNumElements();
8030 if ((NumMaskElts % NumElems) == 0) {
8031 unsigned Scale = NumMaskElts / NumElems;
8032 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
8033 for (unsigned i = 0; i < NumElems; ++i) {
8034 if (UndefMask[i])
8035 continue;
8036 int Offset = ZeroMask[i] ? NumMaskElts : 0;
8037 for (unsigned j = 0; j != Scale; ++j)
8038 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
8039 }
8040 SDValue V = CreateLoad(VT, LDBase);
8041 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
8042 : DAG.getConstantFP(0.0, DL, VT);
8043 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
8044 }
8045 }
8046 }
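// A minimal standalone sketch of the ClearMask construction above, using
// std::vector and assumed example masks (NumElems = 4, Scale = 2, element 2
// known zero, element 3 undef). Shuffle-mask entries >= NumMaskElts select
// lanes from the zero vector; -1 marks an undef lane.
#include <cassert>
#include <vector>

namespace clear_mask_sketch {

std::vector<int> buildClearMask(const std::vector<bool> &ZeroMask,
                                const std::vector<bool> &UndefMask,
                                unsigned Scale) {
  unsigned NumElems = static_cast<unsigned>(ZeroMask.size());
  unsigned NumMaskElts = NumElems * Scale;
  std::vector<int> ClearMask(NumMaskElts, -1);
  for (unsigned i = 0; i < NumElems; ++i) {
    if (UndefMask[i])
      continue;                                  // leave undef lanes as -1
    int Offset = ZeroMask[i] ? NumMaskElts : 0;  // second operand == zeros
    for (unsigned j = 0; j != Scale; ++j)
      ClearMask[i * Scale + j] = i * Scale + j + Offset;
  }
  return ClearMask;
}

void demo() {
  std::vector<bool> Zero = {false, false, true, false};
  std::vector<bool> Undef = {false, false, false, true};
  std::vector<int> M = buildClearMask(Zero, Undef, /*Scale=*/2);
  // Lanes 4-5 come from the zero vector (12, 13); lanes 6-7 stay undef.
  assert((M == std::vector<int>{0, 1, 2, 3, 12, 13, -1, -1}));
}

} // namespace clear_mask_sketch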
8047
8048 // If the upper half of a ymm/zmm load is undef then just load the lower half.
8049 if (VT.is256BitVector() || VT.is512BitVector()) {
8050 unsigned HalfNumElems = NumElems / 2;
8051 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
8052 EVT HalfVT =
8053 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
8054 SDValue HalfLD =
8055 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
8056 DAG, Subtarget, isAfterLegalize);
8057 if (HalfLD)
8058 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
8059 HalfLD, DAG.getIntPtrConstant(0, DL));
8060 }
8061 }
8062
8063 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
8064 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
8065 (LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
8066 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
8067 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
8068 : MVT::getIntegerVT(LoadSizeInBits);
8069 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
8070 if (TLI.isTypeLegal(VecVT)) {
8071 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
8072 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
8073 SDValue ResNode =
8074 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
8075 LDBase->getPointerInfo(),
8076 LDBase->getAlignment(),
8077 MachineMemOperand::MOLoad);
8078 for (auto *LD : Loads)
8079 if (LD)
8080 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
8081 return DAG.getBitcast(VT, ResNode);
8082 }
8083 }
8084
8085 // BROADCAST - match the smallest possible repetition pattern, load that
8086 // scalar/subvector element and then broadcast to the entire vector.
8087 if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
8088 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
8089 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
8090 unsigned RepeatSize = SubElems * BaseSizeInBits;
8091 unsigned ScalarSize = std::min(RepeatSize, 64u);
8092 if (!Subtarget.hasAVX2() && ScalarSize < 32)
8093 continue;
8094
8095 bool Match = true;
8096 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
8097 for (unsigned i = 0; i != NumElems && Match; ++i) {
8098 if (!LoadMask[i])
8099 continue;
8100 SDValue Elt = peekThroughBitcasts(Elts[i]);
8101 if (RepeatedLoads[i % SubElems].isUndef())
8102 RepeatedLoads[i % SubElems] = Elt;
8103 else
8104 Match &= (RepeatedLoads[i % SubElems] == Elt);
8105 }
8106
8107 // We must have loads at both ends of the repetition.
8108 Match &= !RepeatedLoads.front().isUndef();
8109 Match &= !RepeatedLoads.back().isUndef();
8110 if (!Match)
8111 continue;
8112
8113 EVT RepeatVT =
8114 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
8115 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
8116 : EVT::getFloatingPointVT(ScalarSize);
8117 if (RepeatSize > ScalarSize)
8118 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
8119 RepeatSize / ScalarSize);
8120 EVT BroadcastVT =
8121 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
8122 VT.getSizeInBits() / ScalarSize);
8123 if (TLI.isTypeLegal(BroadcastVT)) {
8124 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
8125 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
8126 unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
8127 : X86ISD::VBROADCAST;
8128 SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
8129 return DAG.getBitcast(VT, Broadcast);
8130 }
8131 }
8132 }
8133 }
8134
8135 return SDValue();
8136}
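// A minimal standalone sketch of the repetition matching in the BROADCAST
// block of EltsFromConsecutiveLoads above: every defined element must equal
// the element SubElems positions earlier, so the whole vector is a repeat of
// its first SubElems entries. Plain ints with assumed example values; 0
// stands in for an undef element.
#include <cassert>
#include <vector>

namespace repeat_sketch {

bool matchesRepeat(const std::vector<int> &Elts, unsigned SubElems) {
  std::vector<int> Repeated(SubElems, 0);            // 0 == "not seen yet"
  for (unsigned i = 0; i != Elts.size(); ++i) {
    if (Elts[i] == 0)
      continue;                                      // undef matches anything
    if (Repeated[i % SubElems] == 0)
      Repeated[i % SubElems] = Elts[i];
    else if (Repeated[i % SubElems] != Elts[i])
      return false;
  }
  // Both ends of the repetition must be defined, mirroring the
  // RepeatedLoads.front()/back() checks above.
  return Repeated.front() != 0 && Repeated.back() != 0;
}

void demo() {
  assert(matchesRepeat({7, 9, 7, 9, 7, 9, 7, 9}, 2));  // <a,b> repeated
  assert(matchesRepeat({7, 9, 0, 9, 7, 0, 7, 9}, 2));  // undefs tolerated
  assert(!matchesRepeat({7, 9, 7, 8, 7, 9, 7, 9}, 2)); // mismatch at i == 3
}

} // namespace repeat_sketch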
8137
8138// Combine a vector op (shuffles etc.) that is equivalent to build_vector load1,
8139// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
8140// are consecutive, non-overlapping, and in the right order.
8141static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL,
8142 SelectionDAG &DAG,
8143 const X86Subtarget &Subtarget,
8144 bool isAfterLegalize) {
8145 SmallVector<SDValue, 64> Elts;
8146 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
8147 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
8148 Elts.push_back(Elt);
8149 continue;
8150 }
8151 return SDValue();
8152 }
8153 assert(Elts.size() == VT.getVectorNumElements());
8154 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
8155 isAfterLegalize);
8156}
8157
8158static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
8159 unsigned SplatBitSize, LLVMContext &C) {
8160 unsigned ScalarSize = VT.getScalarSizeInBits();
8161 unsigned NumElm = SplatBitSize / ScalarSize;
8162
8163 SmallVector<Constant *, 32> ConstantVec;
8164 for (unsigned i = 0; i < NumElm; i++) {
8165 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
8166 Constant *Const;
8167 if (VT.isFloatingPoint()) {
8168 if (ScalarSize == 32) {
8169 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
8170 } else {
8171 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
8172 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
8173 }
8174 } else
8175 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
8176 ConstantVec.push_back(Const);
8177 }
8178 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
8179}
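// A minimal standalone sketch of the bit slicing done by getConstantVector
// above: a repeated SplatBitSize-wide pattern is cut into scalar-sized
// chunks, lowest bits first. Plain 64-bit arithmetic with assumed example
// values (ScalarSize = 32) stands in for APInt::extractBits.
#include <cassert>
#include <cstdint>
#include <vector>

namespace splat_slice_sketch {

std::vector<uint32_t> sliceSplat(uint64_t SplatValue, unsigned SplatBitSize,
                                 unsigned ScalarSize) {
  assert(ScalarSize == 32 && SplatBitSize <= 64 &&
         SplatBitSize % ScalarSize == 0);
  std::vector<uint32_t> Chunks;
  for (unsigned i = 0; i < SplatBitSize / ScalarSize; ++i)
    Chunks.push_back(static_cast<uint32_t>(SplatValue >> (ScalarSize * i)));
  return Chunks;
}

void demo() {
  // The 64-bit splat pattern 0x0000000200000001 becomes the two 32-bit
  // elements <1, 2>; element 0 holds the low bits.
  std::vector<uint32_t> C = sliceSplat(0x0000000200000001ULL, 64, 32);
  assert((C == std::vector<uint32_t>{1, 2}));
}

} // namespace splat_slice_sketch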
8180
8181static bool isFoldableUseOfShuffle(SDNode *N) {
8182 for (auto *U : N->uses()) {
8183 unsigned Opc = U->getOpcode();
8184 // VPERMV/VPERMV3 shuffles can never fold their index operands.
8185 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
8186 return false;
8187 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
8188 return false;
8189 if (isTargetShuffle(Opc))
8190 return true;
8191 if (Opc == ISD::BITCAST) // Ignore bitcasts
8192 return isFoldableUseOfShuffle(U);
8193 if (N->hasOneUse())
8194 return true;
8195 }
8196 return false;
8197}
8198
8199// Check if the current build_vector node is a zero-extended splat vector.
8200// If so, return the value being extended.
8201// For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
8202// NumElt - returns the number of zero-extended identical values.
8203// EltType - returns the type of the value including the zero extension.
8204static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
8205 unsigned &NumElt, MVT &EltType) {
8206 SDValue ExtValue = Op->getOperand(0);
8207 unsigned NumElts = Op->getNumOperands();
8208 unsigned Delta = NumElts;
8209
8210 for (unsigned i = 1; i < NumElts; i++) {
8211 if (Op->getOperand(i) == ExtValue) {
8212 Delta = i;
8213 break;
8214 }
8215 if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
8216 return SDValue();
8217 }
8218 if (!isPowerOf2_32(Delta) || Delta == 1)
8219 return SDValue();
8220
8221 for (unsigned i = Delta; i < NumElts; i++) {
8222 if (i % Delta == 0) {
8223 if (Op->getOperand(i) != ExtValue)
8224 return SDValue();
8225 } else if (!(isNullConstant(Op->getOperand(i)) ||
8226 Op->getOperand(i).isUndef()))
8227 return SDValue();
8228 }
8229 unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
8230 unsigned ExtVTSize = EltSize * Delta;
8231 EltType = MVT::getIntegerVT(ExtVTSize);
8232 NumElt = NumElts / Delta;
8233 return ExtValue;
8234}
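// A minimal standalone sketch of the zero-extended-splat detection in
// isSplatZeroExtended above, on plain integers where 0 models a zero or undef
// operand and any other value models the repeated element. Delta, the
// distance between repetitions, must be a power of two greater than one.
#include <cassert>
#include <vector>

namespace zext_splat_sketch {

bool isZeroExtendedSplat(const std::vector<int> &Ops, unsigned &Delta) {
  int ExtValue = Ops[0];
  unsigned NumElts = static_cast<unsigned>(Ops.size());
  Delta = NumElts;
  for (unsigned i = 1; i < NumElts; ++i) {
    if (Ops[i] == ExtValue) { Delta = i; break; }
    if (Ops[i] != 0)
      return false;               // neither zero/undef nor the repeated value
  }
  if (Delta == 1 || (Delta & (Delta - 1)) != 0)
    return false;                 // Delta must be a power of two > 1
  for (unsigned i = Delta; i < NumElts; ++i) {
    if (i % Delta == 0) {
      if (Ops[i] != ExtValue)
        return false;             // the value must repeat every Delta slots
    } else if (Ops[i] != 0) {
      return false;               // everything in between must be zero/undef
    }
  }
  return true;
}

void demo() {
  unsigned Delta = 0;
  // (a,0,0,0,a,0,0,0): 'a' repeats every 4 operands, zero-extended in between.
  assert(isZeroExtendedSplat({5, 0, 0, 0, 5, 0, 0, 0}, Delta) && Delta == 4);
  assert(!isZeroExtendedSplat({5, 0, 7, 0, 5, 0, 0, 0}, Delta));
}

} // namespace zext_splat_sketch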
8235
8236/// Attempt to use the vbroadcast instruction to generate a splat value
8237/// from a splat BUILD_VECTOR which uses:
8238/// a. A single scalar load, or a constant.
8239/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
8240///
8241/// The VBROADCAST node is returned when a pattern is found,
8242/// or SDValue() otherwise.
8243static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
8244 const X86Subtarget &Subtarget,
8245 SelectionDAG &DAG) {
8246 // VBROADCAST requires AVX.
8247 // TODO: Splats could be generated for non-AVX CPUs using SSE
8248 // instructions, but there's less potential gain for only 128-bit vectors.
8249 if (!Subtarget.hasAVX())
8250 return SDValue();
8251
8252 MVT VT = BVOp->getSimpleValueType(0);
8253 SDLoc dl(BVOp);
8254
8255 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
8256 "Unsupported vector type for broadcast.");
8257
8258 BitVector UndefElements;
8259 SDValue Ld = BVOp->getSplatValue(&UndefElements);
8260
8261 // Attempt to use VBROADCASTM
8262 // From this pattern:
8263 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
8264 // b. t1 = (build_vector t0 t0)
8265 //
8266 // Create (VBROADCASTM v2i1 X)
8267 if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
8268 MVT EltType = VT.getScalarType();
8269 unsigned NumElts = VT.getVectorNumElements();
8270 SDValue BOperand;
8271 SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
8272 if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
8273 (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
8274 Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
8275 if (ZeroExtended)
8276 BOperand = ZeroExtended.getOperand(0);
8277 else
8278 BOperand = Ld.getOperand(0).getOperand(0);
8279 MVT MaskVT = BOperand.getSimpleValueType();
8280 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
8281 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
8282 SDValue Brdcst =
8283 DAG.getNode(X86ISD::VBROADCASTM, dl,
8284 MVT::getVectorVT(EltType, NumElts), BOperand);
8285 return DAG.getBitcast(VT, Brdcst);
8286 }
8287 }
8288 }
8289
8290 unsigned NumElts = VT.getVectorNumElements();
8291 unsigned NumUndefElts = UndefElements.count();
8292 if (!Ld || (NumElts - NumUndefElts) <= 1) {
8293 APInt SplatValue, Undef;
8294 unsigned SplatBitSize;
8295 bool HasUndef;
8296 // Check if this is a repeated constant pattern suitable for broadcasting.
8297 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
8298 SplatBitSize > VT.getScalarSizeInBits() &&
8299 SplatBitSize < VT.getSizeInBits()) {
8300 // Avoid replacing with a broadcast when the build_vector is used by a
8301 // shuffle instruction, to preserve the existing custom lowering of shuffles.
8302 if (isFoldableUseOfShuffle(BVOp))
8303 return SDValue();
8304 // Replace the BUILD_VECTOR with a broadcast of the repeated constants.
8305 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8306 LLVMContext *Ctx = DAG.getContext();
8307 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
8308 if (Subtarget.hasAVX()) {
8309 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
8310 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
8311 // Splatted value can fit in one INTEGER constant in constant pool.
8312 // Load the constant and broadcast it.
8313 MVT CVT = MVT::getIntegerVT(SplatBitSize);
8314 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
8315 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
8316 SDValue CP = DAG.getConstantPool(C, PVT);
8317 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
8318
8319 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
8320 Ld = DAG.getLoad(
8321 CVT, dl, DAG.getEntryNode(), CP,
8322 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
8323 Alignment);
8324 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
8325 MVT::getVectorVT(CVT, Repeat), Ld);
8326 return DAG.getBitcast(VT, Brdcst);
8327 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
8328 // Splatted value can fit in one FLOAT constant in constant pool.
8329 // Load the constant and broadcast it.
8330 // AVX has support for 32- and 64-bit broadcasts of floats only.
8331 // There is no 64-bit integer broadcast on a 32-bit subtarget.
8332 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
8333 // Lower the splat via APFloat directly, to avoid any conversion.
8334 Constant *C =
8335 SplatBitSize == 32
8336 ? ConstantFP::get(*Ctx,
8337 APFloat(APFloat::IEEEsingle(), SplatValue))
8338 : ConstantFP::get(*Ctx,
8339 APFloat(APFloat::IEEEdouble(), SplatValue));
8340 SDValue CP = DAG.getConstantPool(C, PVT);
8341 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
8342
8343 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
8344 Ld = DAG.getLoad(
8345 CVT, dl, DAG.getEntryNode(), CP,
8346 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
8347 Alignment);
8348 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
8349 MVT::getVectorVT(CVT, Repeat), Ld);
8350 return DAG.getBitcast(VT, Brdcst);
8351 } else if (SplatBitSize > 64) {
8352 // Load the vector of constants and broadcast it.
8353 MVT CVT = VT.getScalarType();
8354 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
8355 *Ctx);
8356 SDValue VCP = DAG.getConstantPool(VecC, PVT);
8357 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
8358 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
8359 Ld = DAG.getLoad(
8360 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
8361 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
8362 Alignment);
8363 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
8364 return DAG.getBitcast(VT, Brdcst);
8365 }
8366 }
8367 }
8368
8369 // If we are moving a scalar into a vector (Ld must be set and all elements
8370 // but 1 are undef) and that operation is not obviously supported by
8371 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
8372 // That's better than general shuffling and may eliminate a load to GPR and
8373 // move from scalar to vector register.
8374 if (!Ld || NumElts - NumUndefElts != 1)
8375 return SDValue();
8376 unsigned ScalarSize = Ld.getValueSizeInBits();
8377 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
8378 return SDValue();
8379 }
8380
8381 bool ConstSplatVal =
8382 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
8383
8384 // Make sure that all of the users of a non-constant load are from the
8385 // BUILD_VECTOR node.
8386 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
8387 return SDValue();
8388
8389 unsigned ScalarSize = Ld.getValueSizeInBits();
8390 bool IsGE256 = (VT.getSizeInBits() >= 256);
8391
8392 // When optimizing for size, generate up to 5 extra bytes for a broadcast
8393 // instruction to save 8 or more bytes of constant pool data.
8394 // TODO: If multiple splats are generated to load the same constant,
8395 // it may be detrimental to overall size. There needs to be a way to detect
8396 // that condition to know if this is truly a size win.
8397 bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
8398
8399 // Handle broadcasting a single constant scalar from the constant pool
8400 // into a vector.
8401 // On Sandybridge (no AVX2), it is still better to load a constant vector
8402 // from the constant pool and not to broadcast it from a scalar.
8403 // But override that restriction when optimizing for size.
8404 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
8405 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
8406 EVT CVT = Ld.getValueType();
8407 assert(!CVT.isVector() && "Must not broadcast a vector type");
8408
8409 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
8410 // For size optimization, also splat v2f64 and v2i64, and for size opt
8411 // with AVX2, also splat i8 and i16.
8412 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
8413 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
8414 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
8415 const Constant *C = nullptr;
8416 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
8417 C = CI->getConstantIntValue();
8418 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
8419 C = CF->getConstantFPValue();
8420
8421 assert(C && "Invalid constant type");
8422
8423 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8424 SDValue CP =
8425 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
8426 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
8427 Ld = DAG.getLoad(
8428 CVT, dl, DAG.getEntryNode(), CP,
8429 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
8430 Alignment);
8431
8432 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8433 }
8434 }
8435
8436 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
8437
8438 // Handle AVX2 in-register broadcasts.
8439 if (!IsLoad && Subtarget.hasInt256() &&
8440 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
8441 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8442
8443 // The scalar source must be a normal load.
8444 if (!IsLoad)
8445 return SDValue();
8446
8447 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
8448 (Subtarget.hasVLX() && ScalarSize == 64))
8449 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8450
8451 // The integer check is needed for the 64-bit into 128-bit case, so that it
8452 // doesn't match f64, since there is no vbroadcastsd xmm instruction.
8453 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
8454 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
8455 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8456 }
8457
8458 // Unsupported broadcast.
8459 return SDValue();
8460}
8461
8462/// For an EXTRACT_VECTOR_ELT with a constant index return the real
8463/// underlying vector and index.
8464///
8465/// Modifies \p ExtractedFromVec to the real vector and returns the real
8466/// index.
8467static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
8468 SDValue ExtIdx) {
8469 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
8470 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
8471 return Idx;
8472
8473 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
8474 // lowered this:
8475 // (extract_vector_elt (v8f32 %1), Constant<6>)
8476 // to:
8477 // (extract_vector_elt (vector_shuffle<2,u,u,u>
8478 // (extract_subvector (v8f32 %0), Constant<4>),
8479 // undef)
8480 // Constant<0>)
8481 // In this case the vector is the extract_subvector expression and the index
8482 // is 2, as specified by the shuffle.
8483 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
8484 SDValue ShuffleVec = SVOp->getOperand(0);
8485 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
8486 assert(ShuffleVecVT.getVectorElementType() ==
8487 ExtractedFromVec.getSimpleValueType().getVectorElementType());
8488
8489 int ShuffleIdx = SVOp->getMaskElt(Idx);
8490 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
8491 ExtractedFromVec = ShuffleVec;
8492 return ShuffleIdx;
8493 }
8494 return Idx;
8495}
8496
8497static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
8498 MVT VT = Op.getSimpleValueType();
8499
8500 // Skip if insert_vec_elt is not supported.
8501 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8502 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
8503 return SDValue();
8504
8505 SDLoc DL(Op);
8506 unsigned NumElems = Op.getNumOperands();
8507
8508 SDValue VecIn1;
8509 SDValue VecIn2;
8510 SmallVector<unsigned, 4> InsertIndices;
8511 SmallVector<int, 8> Mask(NumElems, -1);
8512
8513 for (unsigned i = 0; i != NumElems; ++i) {
8514 unsigned Opc = Op.getOperand(i).getOpcode();
8515
8516 if (Opc == ISD::UNDEF)
8517 continue;
8518
8519 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
8520 // Quit if more than 1 element needs inserting.
8521 if (InsertIndices.size() > 1)
8522 return SDValue();
8523
8524 InsertIndices.push_back(i);
8525 continue;
8526 }
8527
8528 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
8529 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
8530
8531 // Quit if non-constant index.
8532 if (!isa<ConstantSDNode>(ExtIdx))
8533 return SDValue();
8534 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
8535
8536 // Quit if extracted from vector of different type.
8537 if (ExtractedFromVec.getValueType() != VT)
8538 return SDValue();
8539
8540 if (!VecIn1.getNode())
8541 VecIn1 = ExtractedFromVec;
8542 else if (VecIn1 != ExtractedFromVec) {
8543 if (!VecIn2.getNode())
8544 VecIn2 = ExtractedFromVec;
8545 else if (VecIn2 != ExtractedFromVec)
8546 // Quit if there are more than 2 vectors to shuffle.
8547 return SDValue();
8548 }
8549
8550 if (ExtractedFromVec == VecIn1)
8551 Mask[i] = Idx;
8552 else if (ExtractedFromVec == VecIn2)
8553 Mask[i] = Idx + NumElems;
8554 }
8555
8556 if (!VecIn1.getNode())
8557 return SDValue();
8558
8559 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
8560 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
8561
8562 for (unsigned Idx : InsertIndices)
8563 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
8564 DAG.getIntPtrConstant(Idx, DL));
8565
8566 return NV;
8567}
8568
8569static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
8570 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
8571 Op.getScalarValueSizeInBits() == 1 &&
8572 "Can not convert non-constant vector");
8573 uint64_t Immediate = 0;
8574 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8575 SDValue In = Op.getOperand(idx);
8576 if (!In.isUndef())
8577 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
8578 }
8579 SDLoc dl(Op);
8580 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
8581 return DAG.getConstant(Immediate, dl, VT);
8582}
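// A minimal standalone sketch of the bit packing done by
// ConvertI1VectorToInteger above: each defined i1 element contributes its low
// bit at position idx of an integer immediate. Here -1 models an undef
// element and any other value is masked to its lowest bit; example values are
// assumed.
#include <cassert>
#include <cstdint>
#include <vector>

namespace i1_pack_sketch {

uint64_t packI1Elements(const std::vector<int> &Elts) {
  uint64_t Immediate = 0;
  for (unsigned idx = 0; idx != Elts.size(); ++idx)
    if (Elts[idx] != -1)                                 // skip undef elements
      Immediate |= (static_cast<uint64_t>(Elts[idx]) & 0x1) << idx;
  return Immediate;
}

void demo() {
  // <1, 0, undef, 1> packs to 0b1001: bits 0 and 3 are set.
  assert(packI1Elements({1, 0, -1, 1}) == 0x9);
}

} // namespace i1_pack_sketch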
8583// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
8584static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
8585 const X86Subtarget &Subtarget) {
8586
8587 MVT VT = Op.getSimpleValueType();
8588 assert((VT.getVectorElementType() == MVT::i1) &&
8589 "Unexpected type in LowerBUILD_VECTORvXi1!");
8590
8591 SDLoc dl(Op);
8592 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8593 ISD::isBuildVectorAllOnes(Op.getNode()))
8594 return Op;
8595
8596 uint64_t Immediate = 0;
8597 SmallVector<unsigned, 16> NonConstIdx;
8598 bool IsSplat = true;
8599 bool HasConstElts = false;
8600 int SplatIdx = -1;
8601 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8602 SDValue In = Op.getOperand(idx);
8603 if (In.isUndef())
8604 continue;
8605 if (!isa<ConstantSDNode>(In))
8606 NonConstIdx.push_back(idx);
8607 else {
8608 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
8609 HasConstElts = true;
8610 }
8611 if (SplatIdx < 0)
8612 SplatIdx = idx;
8613 else if (In != Op.getOperand(SplatIdx))
8614 IsSplat = false;
8615 }
8616
8617 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
8618 if (IsSplat) {
8619 // The build_vector allows the scalar element to be larger than the vector
8620 // element type. We need to mask it to use as a condition unless we know
8621 // the upper bits are zero.
8622 // FIXME: Use computeKnownBits instead of checking specific opcode?
8623 SDValue Cond = Op.getOperand(SplatIdx);
8624 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8625 if (Cond.getOpcode() != ISD::SETCC)
8626 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8627 DAG.getConstant(1, dl, MVT::i8));
8628 return DAG.getSelect(dl, VT, Cond,
8629 DAG.getConstant(1, dl, VT),
8630 DAG.getConstant(0, dl, VT));
8631 }
8632
8633 // Insert the non-constant elements one by one.
8634 SDValue DstVec;
8635 if (HasConstElts) {
8636 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8637 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8638 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8639 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8640 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8641 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8642 } else {
8643 MVT ImmVT = MVT::getIntegerVT(std::max(VT.getSizeInBits(), 8U));
8644 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8645 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8646 DstVec = DAG.getBitcast(VecVT, Imm);
8647 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8648 DAG.getIntPtrConstant(0, dl));
8649 }
8650 } else
8651 DstVec = DAG.getUNDEF(VT);
8652
8653 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
8654 unsigned InsertIdx = NonConstIdx[i];
8655 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8656 Op.getOperand(InsertIdx),
8657 DAG.getIntPtrConstant(InsertIdx, dl));
8658 }
8659 return DstVec;
8660}
8661
8662/// This is a helper function of LowerToHorizontalOp().
8663/// This function checks that the input build_vector \p N implements a
8664/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8665/// may not match the layout of an x86 256-bit horizontal instruction.
8666/// In other words, if this returns true, then some extraction/insertion will
8667/// be required to produce a valid horizontal instruction.
8668///
8669/// Parameter \p Opcode defines the kind of horizontal operation to match.
8670/// For example, if \p Opcode is equal to ISD::ADD, then this function
8671/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8672/// is equal to ISD::SUB, then this function checks if this is a horizontal
8673/// arithmetic sub.
8674///
8675/// This function only analyzes elements of \p N whose indices are
8676/// in range [BaseIdx, LastIdx).
8677///
8678/// TODO: This function was originally used to match both real and fake partial
8679/// horizontal operations, but the index-matching logic is incorrect for that.
8680/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8681/// code because it is only used for partial h-op matching now?
8682static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8683 SelectionDAG &DAG,
8684 unsigned BaseIdx, unsigned LastIdx,
8685 SDValue &V0, SDValue &V1) {
8686 EVT VT = N->getValueType(0);
8687 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8688 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8689 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8690 "Invalid Vector in input!");
8691
8692 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8693 bool CanFold = true;
8694 unsigned ExpectedVExtractIdx = BaseIdx;
8695 unsigned NumElts = LastIdx - BaseIdx;
8696 V0 = DAG.getUNDEF(VT);
8697 V1 = DAG.getUNDEF(VT);
8698
8699 // Check if N implements a horizontal binop.
8700 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8701 SDValue Op = N->getOperand(i + BaseIdx);
8702
8703 // Skip UNDEFs.
8704 if (Op->isUndef()) {
8705 // Update the expected vector extract index.
8706 if (i * 2 == NumElts)
8707 ExpectedVExtractIdx = BaseIdx;
8708 ExpectedVExtractIdx += 2;
8709 continue;
8710 }
8711
8712 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8713
8714 if (!CanFold)
8715 break;
8716
8717 SDValue Op0 = Op.getOperand(0);
8718 SDValue Op1 = Op.getOperand(1);
8719
8720 // Try to match the following pattern:
8721 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8722 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8723 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8724 Op0.getOperand(0) == Op1.getOperand(0) &&
8725 isa<ConstantSDNode>(Op0.getOperand(1)) &&
8726 isa<ConstantSDNode>(Op1.getOperand(1)));
8727 if (!CanFold)
8728 break;
8729
8730 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
8731 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
8732
8733 if (i * 2 < NumElts) {
8734 if (V0.isUndef()) {
8735 V0 = Op0.getOperand(0);
8736 if (V0.getValueType() != VT)
8737 return false;
8738 }
8739 } else {
8740 if (V1.isUndef()) {
8741 V1 = Op0.getOperand(0);
8742 if (V1.getValueType() != VT)
8743 return false;
8744 }
8745 if (i * 2 == NumElts)
8746 ExpectedVExtractIdx = BaseIdx;
8747 }
8748
8749 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8750 if (I0 == ExpectedVExtractIdx)
8751 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8752 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8753 // Try to match the following dag sequence:
8754 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8755 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8756 } else
8757 CanFold = false;
8758
8759 ExpectedVExtractIdx += 2;
8760 }
8761
8762 return CanFold;
8763}
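// A minimal standalone sketch of the extract-index discipline checked by
// isHorizontalBinOpPart above: result element i must be built from source
// elements 2*i and 2*i+1 (relative to BaseIdx), so the expected extract index
// advances by two per result element. The commutable-operand swap and the
// second-half reset handled by the real code are intentionally omitted;
// values are assumed examples.
#include <cassert>
#include <utility>
#include <vector>

namespace hop_part_sketch {

bool matchesHorizontalPairs(
    const std::vector<std::pair<unsigned, unsigned>> &ExtractPairs,
    unsigned BaseIdx) {
  unsigned Expected = BaseIdx;
  for (const auto &P : ExtractPairs) {
    // Each binop must extract (Expected, Expected + 1) from the same source.
    if (P.first != Expected || P.second != Expected + 1)
      return false;
    Expected += 2;
  }
  return true;
}

void demo() {
  // (0,1), (2,3), (4,5), (6,7) is the layout of a 128-bit horizontal add.
  assert(matchesHorizontalPairs({{0, 1}, {2, 3}, {4, 5}, {6, 7}}, 0));
  // Skipping a pair of source elements breaks the pattern.
  assert(!matchesHorizontalPairs({{0, 1}, {4, 5}}, 0));
}

} // namespace hop_part_sketch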
8764
8765/// Emit a sequence of two 128-bit horizontal add/sub followed by
8766/// a concat_vector.
8767///
8768/// This is a helper function of LowerToHorizontalOp().
8769/// This function expects two 256-bit vectors called V0 and V1.
8770/// At first, each vector is split into two separate 128-bit vectors.
8771/// Then, the resulting 128-bit vectors are used to implement two
8772/// horizontal binary operations.
8773///
8774/// The kind of horizontal binary operation is defined by \p X86Opcode.
8775///
8776/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8777/// the two new horizontal binop.
8778/// When Mode is set, the first horizontal binop dag node would take as input
8779/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8780/// horizontal binop dag node would take as input the lower 128-bit of V1
8781/// and the upper 128-bit of V1.
8782/// Example:
8783/// HADD V0_LO, V0_HI
8784/// HADD V1_LO, V1_HI
8785///
8786/// Otherwise, the first horizontal binop dag node takes as input the lower
8787/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8788/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8789/// Example:
8790/// HADD V0_LO, V1_LO
8791/// HADD V0_HI, V1_HI
8792///
8793/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8794/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8795/// the upper 128-bits of the result.
8796static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8797 const SDLoc &DL, SelectionDAG &DAG,
8798 unsigned X86Opcode, bool Mode,
8799 bool isUndefLO, bool isUndefHI) {
8800 MVT VT = V0.getSimpleValueType();
8801 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8802 "Invalid nodes in input!");
8803
8804 unsigned NumElts = VT.getVectorNumElements();
8805 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8806 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8807 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8808 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8809 MVT NewVT = V0_LO.getSimpleValueType();
8810
8811 SDValue LO = DAG.getUNDEF(NewVT);
8812 SDValue HI = DAG.getUNDEF(NewVT);
8813
8814 if (Mode) {
8815 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8816 if (!isUndefLO && !V0->isUndef())
8817 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8818 if (!isUndefHI && !V1->isUndef())
8819 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8820 } else {
8821 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8822 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8823 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8824
8825 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8826 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8827 }
8828
8829 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8830}
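// A minimal standalone sketch of the two pairings ExpandHorizontalBinOp
// supports: with Mode set, the halves of each input are combined with
// themselves (LO = hop(V0_LO, V0_HI), HI = hop(V1_LO, V1_HI)); otherwise like
// halves of the two inputs are combined (LO = hop(V0_LO, V1_LO), HI =
// hop(V0_HI, V1_HI)). Vectors of ints with a pairwise add stand in for the
// 128-bit horizontal ops; all values are assumed examples.
#include <cassert>
#include <vector>

namespace expand_hop_sketch {

using Vec = std::vector<int>;

Vec lowerHalf(const Vec &V) { return Vec(V.begin(), V.begin() + V.size() / 2); }
Vec upperHalf(const Vec &V) { return Vec(V.begin() + V.size() / 2, V.end()); }

// Pairwise horizontal add: the first half of the result reduces A, the second
// half reduces B, two adjacent elements at a time.
Vec hadd(const Vec &A, const Vec &B) {
  Vec R;
  for (unsigned i = 0; i + 1 < A.size(); i += 2) R.push_back(A[i] + A[i + 1]);
  for (unsigned i = 0; i + 1 < B.size(); i += 2) R.push_back(B[i] + B[i + 1]);
  return R;
}

Vec concat(Vec LO, const Vec &HI) {
  LO.insert(LO.end(), HI.begin(), HI.end());
  return LO;
}

Vec expandHorizontalAdd(const Vec &V0, const Vec &V1, bool Mode) {
  Vec LO = Mode ? hadd(lowerHalf(V0), upperHalf(V0))
                : hadd(lowerHalf(V0), lowerHalf(V1));
  Vec HI = Mode ? hadd(lowerHalf(V1), upperHalf(V1))
                : hadd(upperHalf(V0), upperHalf(V1));
  return concat(LO, HI);
}

void demo() {
  Vec V0 = {1, 2, 3, 4, 5, 6, 7, 8};
  Vec V1 = {10, 20, 30, 40, 50, 60, 70, 80};
  // Mode set: each result half reduces one whole input vector.
  assert((expandHorizontalAdd(V0, V1, true) ==
          Vec{3, 7, 11, 15, 30, 70, 110, 150}));
  // Mode clear: the result halves mix like halves of the two inputs.
  assert((expandHorizontalAdd(V0, V1, false) ==
          Vec{3, 7, 30, 70, 11, 15, 110, 150}));
}

} // namespace expand_hop_sketch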
8831
8832/// Returns true iff \p BV builds a vector with the result equivalent to
8833/// the result of ADDSUB/SUBADD operation.
8834/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8835/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8836/// \p Opnd0 and \p Opnd1.
8837static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8838 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8839 SDValue &Opnd0, SDValue &Opnd1,
8840 unsigned &NumExtracts,
8841 bool &IsSubAdd) {
8842
8843 MVT VT = BV->getSimpleValueType(0);
8844 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8845 return false;
8846
8847 unsigned NumElts = VT.getVectorNumElements();
8848 SDValue InVec0 = DAG.getUNDEF(VT);
8849 SDValue InVec1 = DAG.getUNDEF(VT);
8850
8851 NumExtracts = 0;
8852
8853 // Odd-numbered elements in the input build vector are obtained from
8854 // adding/subtracting two integer/float elements.
8855 // Even-numbered elements in the input build vector are obtained from
8856 // subtracting/adding two integer/float elements.
8857 unsigned Opc[2] = {0, 0};
8858 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8859 SDValue Op = BV->getOperand(i);
8860
8861 // Skip 'undef' values.
8862 unsigned Opcode = Op.getOpcode();
8863 if (Opcode == ISD::UNDEF)
8864 continue;
8865
8866 // Early exit if we found an unexpected opcode.
8867 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8868 return false;
8869
8870 SDValue Op0 = Op.getOperand(0);
8871 SDValue Op1 = Op.getOperand(1);
8872
8873 // Try to match the following pattern:
8874 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8875 // Early exit if we cannot match that sequence.
8876 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8877 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8878 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8879 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
8880 Op0.getOperand(1) != Op1.getOperand(1))
8881 return false;
8882
8883 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
8884 if (I0 != i)
8885 return false;
8886
8887 // We found a valid add/sub node; make sure it's the same opcode as previous
8888 // elements of this parity.
8889 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8890 return false;
8891 Opc[i % 2] = Opcode;
8892
8893 // Update InVec0 and InVec1.
8894 if (InVec0.isUndef()) {
8895 InVec0 = Op0.getOperand(0);
8896 if (InVec0.getSimpleValueType() != VT)
8897 return false;
8898 }
8899 if (InVec1.isUndef()) {
8900 InVec1 = Op1.getOperand(0);
8901 if (InVec1.getSimpleValueType() != VT)
8902 return false;
8903 }
8904
8905 // Make sure that the input operands to each add/sub node always
8906 // come from the same pair of vectors.
8907 if (InVec0 != Op0.getOperand(0)) {
8908 if (Opcode == ISD::FSUB)
8909 return false;
8910
8911 // FADD is commutable. Try to commute the operands
8912 // and then test again.
8913 std::swap(Op0, Op1);
8914 if (InVec0 != Op0.getOperand(0))
8915 return false;
8916 }
8917
8918 if (InVec1 != Op1.getOperand(0))
8919 return false;
8920
8921 // Increment the number of extractions done.
8922 ++NumExtracts;
8923 }
8924
8925 // Ensure we have found an opcode for both parities and that they are
8926 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8927 // inputs are undef.
8928 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8929 InVec0.isUndef() || InVec1.isUndef())
8930 return false;
8931
8932 IsSubAdd = Opc[0] == ISD::FADD;
8933
8934 Opnd0 = InVec0;
8935 Opnd1 = InVec1;
8936 return true;
8937}
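// A minimal standalone sketch of the parity rule used by isAddSubOrSubAdd
// above: elements at even and odd result positions must each use a single
// opcode, and the two opcodes must differ; an add at even positions then
// means SUBADD rather than ADDSUB. Opcodes are modelled as chars ('+', '-',
// 'u' for undef) with assumed example inputs.
#include <cassert>
#include <string>

namespace addsub_sketch {

// Returns 1 for ADDSUB (even lanes subtract, odd lanes add), 2 for SUBADD,
// and 0 when the element opcodes do not form either pattern.
int classifyAddSub(const std::string &LaneOps) {
  char Opc[2] = {0, 0};
  for (unsigned i = 0; i != LaneOps.size(); ++i) {
    char Op = LaneOps[i];
    if (Op == 'u')
      continue;                       // undef lanes are skipped
    if (Op != '+' && Op != '-')
      return 0;
    if (Opc[i % 2] != 0 && Opc[i % 2] != Op)
      return 0;                       // mixed opcodes within one parity
    Opc[i % 2] = Op;
  }
  if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1])
    return 0;                         // need both parities, and they must differ
  return Opc[0] == '+' ? 2 /*SUBADD*/ : 1 /*ADDSUB*/;
}

void demo() {
  assert(classifyAddSub("-+-+") == 1); // fsub/fadd alternation -> ADDSUB
  assert(classifyAddSub("+-u-") == 2); // fadd/fsub alternation -> SUBADD
  assert(classifyAddSub("----") == 0); // same opcode everywhere -> no match
}

} // namespace addsub_sketch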
8938
8939/// Returns true if it is possible to fold MUL and an idiom that has already been
8940/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8941/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8942/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8943///
8944/// Prior to calling this function it should be known that there is some
8945/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8946/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8947/// before replacement of such SDNode with ADDSUB operation. Thus the number
8948/// of \p Opnd0 uses is expected to be equal to 2.
8949/// For example, this function may be called for the following IR:
8950/// %AB = fmul fast <2 x double> %A, %B
8951/// %Sub = fsub fast <2 x double> %AB, %C
8952/// %Add = fadd fast <2 x double> %AB, %C
8953/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8954/// <2 x i32> <i32 0, i32 3>
8955/// There is a def for %Addsub here, which potentially can be replaced by
8956/// X86ISD::ADDSUB operation:
8957/// %Addsub = X86ISD::ADDSUB %AB, %C
8958/// and such ADDSUB can further be replaced with FMADDSUB:
8959/// %Addsub = FMADDSUB %A, %B, %C.
8960///
8961/// The main reason why this method is called before the replacement of the
8962/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8963/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8964/// FMADDSUB is.
8965static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8966 SelectionDAG &DAG,
8967 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
8968 unsigned ExpectedUses) {
8969 if (Opnd0.getOpcode() != ISD::FMUL ||
8970 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8971 return false;
8972
8973 // FIXME: These checks must match the similar ones in
8974 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8975 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8976 // or MUL + ADDSUB to FMADDSUB.
8977 const TargetOptions &Options = DAG.getTarget().Options;
8978 bool AllowFusion =
8979 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
8980 if (!AllowFusion)
8981 return false;
8982
8983 Opnd2 = Opnd1;
8984 Opnd1 = Opnd0.getOperand(1);
8985 Opnd0 = Opnd0.getOperand(0);
8986
8987 return true;
8988}
8989
8990/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8991/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
8992/// X86ISD::FMSUBADD node.
8993static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8994 const X86Subtarget &Subtarget,
8995 SelectionDAG &DAG) {
8996 SDValue Opnd0, Opnd1;
8997 unsigned NumExtracts;
8998 bool IsSubAdd;
8999 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
9000 IsSubAdd))
9001 return SDValue();
9002
9003 MVT VT = BV->getSimpleValueType(0);
9004 SDLoc DL(BV);
9005
9006 // Try to generate X86ISD::FMADDSUB node here.
9007 SDValue Opnd2;
9008 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
9009 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
9010 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
9011 }
9012
9013 // We only support ADDSUB.
9014 if (IsSubAdd)
9015 return SDValue();
9016
9017 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
9018 // the ADDSUB idiom has been successfully recognized. There are no known
9019 // X86 targets with 512-bit ADDSUB instructions!
9020 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
9021 // recognition.
9022 if (VT.is512BitVector())
9023 return SDValue();
9024
9025 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
9026}
9027
9028static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
9029 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
9030 // Initialize outputs to known values.
9031 MVT VT = BV->getSimpleValueType(0);
9032 HOpcode = ISD::DELETED_NODE;
9033 V0 = DAG.getUNDEF(VT);
9034 V1 = DAG.getUNDEF(VT);
9035
9036 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
9037 // half of the result is calculated independently from the 128-bit halves of
9038 // the inputs, so that makes the index-checking logic below more complicated.
9039 unsigned NumElts = VT.getVectorNumElements();
9040 unsigned GenericOpcode = ISD::DELETED_NODE;
9041 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
9042 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
9043 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
9044 for (unsigned i = 0; i != Num128BitChunks; ++i) {
9045 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
9046 // Ignore undef elements.
9047 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
9048 if (Op.isUndef())
9049 continue;
9050
9051 // If there's an opcode mismatch, we're done.
9052 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
9053 return false;
9054
9055 // Initialize horizontal opcode.
9056 if (HOpcode == ISD::DELETED_NODE) {
9057 GenericOpcode = Op.getOpcode();
9058 switch (GenericOpcode) {
9059 case ISD::ADD: HOpcode = X86ISD::HADD; break;
9060 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
9061 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
9062 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
9063 default: return false;
9064 }
9065 }
9066
9067 SDValue Op0 = Op.getOperand(0);
9068 SDValue Op1 = Op.getOperand(1);
9069 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9070 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9071 Op0.getOperand(0) != Op1.getOperand(0) ||
9072 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9073 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
9074 return false;
9075
9076 // The source vector is chosen based on which 64-bit half of the
9077 // destination vector is being calculated.
9078 if (j < NumEltsIn64Bits) {
9079 if (V0.isUndef())
9080 V0 = Op0.getOperand(0);
9081 } else {
9082 if (V1.isUndef())
9083 V1 = Op0.getOperand(0);
9084 }
9085
9086 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
9087 if (SourceVec != Op0.getOperand(0))
9088 return false;
9089
9090 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
9091 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
9092 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
9093 unsigned ExpectedIndex = i * NumEltsIn128Bits +
9094 (j % NumEltsIn64Bits) * 2;
9095 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
9096 continue;
9097
9098 // If this is not a commutative op, this does not match.
9099 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
9100 return false;
9101
9102 // Addition is commutative, so try swapping the extract indexes.
9103 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
9104 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
9105 continue;
9106
9107 // Extract indexes do not match horizontal requirement.
9108 return false;
9109 }
9110 }
9111 // We matched. Opcode and operands are returned by reference as arguments.
9112 return true;
9113}
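// A minimal standalone sketch of the index formula checked by isHopBuildVector
// above: within 128-bit chunk i, result element j must come from source
// elements ExpectedIndex and ExpectedIndex + 1, where the source is chosen by
// which 64-bit half of the chunk is being built. Assumed example geometry:
// v8i32 (one 256-bit vector, two 128-bit chunks, four elements per chunk).
#include <cassert>

namespace hop_index_sketch {

unsigned expectedExtractIndex(unsigned Chunk, unsigned EltInChunk,
                              unsigned NumEltsIn128Bits) {
  unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
  return Chunk * NumEltsIn128Bits + (EltInChunk % NumEltsIn64Bits) * 2;
}

void demo() {
  // v8i32 HADD: chunk 0 builds <a0+a1, a2+a3, b0+b1, b2+b3>, so elements 0 and
  // 1 of the chunk expect extract indices 0 and 2 (from A), and elements 2 and
  // 3 expect 0 and 2 again (from B).
  assert(expectedExtractIndex(0, 0, 4) == 0);
  assert(expectedExtractIndex(0, 1, 4) == 2);
  assert(expectedExtractIndex(0, 2, 4) == 0);
  assert(expectedExtractIndex(0, 3, 4) == 2);
  // Chunk 1 repeats the same pattern offset by 4.
  assert(expectedExtractIndex(1, 0, 4) == 4);
  assert(expectedExtractIndex(1, 3, 4) == 6);
}

} // namespace hop_index_sketch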
9114
9115static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
9116 SelectionDAG &DAG, unsigned HOpcode,
9117 SDValue V0, SDValue V1) {
9118 // If either input vector is not the same size as the build vector,
9119 // extract/insert the low bits to the correct size.
9120 // This is free (examples: zmm --> xmm, xmm --> ymm).
9121 MVT VT = BV->getSimpleValueType(0);
9122 unsigned Width = VT.getSizeInBits();
9123 if (V0.getValueSizeInBits() > Width)
9124 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
9125 else if (V0.getValueSizeInBits() < Width)
9126 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
9127
9128 if (V1.getValueSizeInBits() > Width)
9129 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
9130 else if (V1.getValueSizeInBits() < Width)
9131 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
9132
9133 unsigned NumElts = VT.getVectorNumElements();
9134 APInt DemandedElts = APInt::getAllOnesValue(NumElts);
9135 for (unsigned i = 0; i != NumElts; ++i)
9136 if (BV->getOperand(i).isUndef())
9137 DemandedElts.clearBit(i);
9138
9139 // If we don't need the upper xmm, then perform this as an xmm hop.
9140 unsigned HalfNumElts = NumElts / 2;
9141 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
9142 MVT HalfVT = VT.getHalfNumVectorElementsVT();
9143 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
9144 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
9145 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
9146 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
9147 }
9148
9149 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
9150}
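// --- Illustrative sketch (added; not part of the original source). It
// restates the "upper xmm not needed" test above with a plain 64-bit bitmask:
// one demanded bit per non-undef build_vector element, and nothing above the
// low half may be set. The real code uses APInt::lshr, which is well defined
// for any width; the 64-bit mask here is only a simplification of the sketch.
#include <cassert>
#include <cstdint>

static bool onlyLowHalfDemanded(uint64_t DemandedElts, unsigned NumElts) {
  unsigned HalfNumElts = NumElts / 2;
  return (DemandedElts >> HalfNumElts) == 0;
}

static void halfHopSketch() {
  // v8f32 build_vector whose elements 4..7 are undef: a 128-bit hop on the
  // low halves suffices and the result is reinserted into an undef YMM.
  assert(onlyLowHalfDemanded(/*DemandedElts=*/0b00001111, /*NumElts=*/8));
  // Any demanded element in the high half forces the full 256-bit hop.
  assert(!onlyLowHalfDemanded(/*DemandedElts=*/0b00010001, /*NumElts=*/8));
}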
9151
9152/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
9153static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
9154 const X86Subtarget &Subtarget,
9155 SelectionDAG &DAG) {
9156 // We need at least 2 non-undef elements to make this worthwhile by default.
9157 unsigned NumNonUndefs =
9158 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
9159 if (NumNonUndefs < 2)
9160 return SDValue();
9161
9162 // There are 4 sets of horizontal math operations distinguished by type:
9163 // int/FP at 128-bit/256-bit. Each type was introduced with a different
9164 // subtarget feature. Try to match those "native" patterns first.
9165 MVT VT = BV->getSimpleValueType(0);
9166 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
9167 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
9168 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
9169 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
9170 unsigned HOpcode;
9171 SDValue V0, V1;
9172 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
9173 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
9174 }
9175
9176 // Try harder to match 256-bit ops by using extract/concat.
9177 if (!Subtarget.hasAVX() || !VT.is256BitVector())
9178 return SDValue();
9179
9180  // Count the number of UNDEF operands in the input build_vector.
9181 unsigned NumElts = VT.getVectorNumElements();
9182 unsigned Half = NumElts / 2;
9183 unsigned NumUndefsLO = 0;
9184 unsigned NumUndefsHI = 0;
9185 for (unsigned i = 0, e = Half; i != e; ++i)
9186 if (BV->getOperand(i)->isUndef())
9187 NumUndefsLO++;
9188
9189 for (unsigned i = Half, e = NumElts; i != e; ++i)
9190 if (BV->getOperand(i)->isUndef())
9191 NumUndefsHI++;
9192
9193 SDLoc DL(BV);
9194 SDValue InVec0, InVec1;
9195 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
9196 SDValue InVec2, InVec3;
9197 unsigned X86Opcode;
9198 bool CanFold = true;
9199
9200 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
9201 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
9202 InVec3) &&
9203 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9204 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9205 X86Opcode = X86ISD::HADD;
9206 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
9207 InVec1) &&
9208 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
9209 InVec3) &&
9210 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9211 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9212 X86Opcode = X86ISD::HSUB;
9213 else
9214 CanFold = false;
9215
9216 if (CanFold) {
9217 // Do not try to expand this build_vector into a pair of horizontal
9218 // add/sub if we can emit a pair of scalar add/sub.
9219 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9220 return SDValue();
9221
9222 // Convert this build_vector into a pair of horizontal binops followed by
9223 // a concat vector. We must adjust the outputs from the partial horizontal
9224 // matching calls above to account for undefined vector halves.
9225 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
9226 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
9227      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
9228 bool isUndefLO = NumUndefsLO == Half;
9229 bool isUndefHI = NumUndefsHI == Half;
9230 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
9231 isUndefHI);
9232 }
9233 }
9234
9235 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
9236 VT == MVT::v16i16) {
9237 unsigned X86Opcode;
9238 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
9239 X86Opcode = X86ISD::HADD;
9240 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
9241 InVec1))
9242 X86Opcode = X86ISD::HSUB;
9243 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
9244 InVec1))
9245 X86Opcode = X86ISD::FHADD;
9246 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
9247 InVec1))
9248 X86Opcode = X86ISD::FHSUB;
9249 else
9250 return SDValue();
9251
9252 // Don't try to expand this build_vector into a pair of horizontal add/sub
9253 // if we can simply emit a pair of scalar add/sub.
9254 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9255 return SDValue();
9256
9257 // Convert this build_vector into two horizontal add/sub followed by
9258 // a concat vector.
9259 bool isUndefLO = NumUndefsLO == Half;
9260 bool isUndefHI = NumUndefsHI == Half;
9261 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
9262 isUndefLO, isUndefHI);
9263 }
9264
9265 return SDValue();
9266}
9267
9268/// If a BUILD_VECTOR's source elements all apply the same bit operation and
9269/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
9270/// just apply the bit to the vectors.
9271/// NOTE: It's not in our interest to start building a general purpose vectorizer
9272/// from this, but enough scalar bit operations are created from the later
9273/// legalization + scalarization stages to need basic support.
9274static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
9275 SelectionDAG &DAG) {
9276 SDLoc DL(Op);
9277 MVT VT = Op->getSimpleValueType(0);
9278 unsigned NumElems = VT.getVectorNumElements();
9279 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9280
9281 // Check that all elements have the same opcode.
9282 // TODO: Should we allow UNDEFS and if so how many?
9283 unsigned Opcode = Op->getOperand(0).getOpcode();
9284 for (unsigned i = 1; i < NumElems; ++i)
9285 if (Opcode != Op->getOperand(i).getOpcode())
9286 return SDValue();
9287
9288 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
9289 bool IsShift = false;
9290 switch (Opcode) {
9291 default:
9292 return SDValue();
9293 case ISD::SHL:
9294 case ISD::SRL:
9295 case ISD::SRA:
9296 IsShift = true;
9297 break;
9298 case ISD::AND:
9299 case ISD::XOR:
9300 case ISD::OR:
9301 // Don't do this if the buildvector is a splat - we'd replace one
9302 // constant with an entire vector.
9303 if (Op->getSplatValue())
9304 return SDValue();
9305 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
9306 return SDValue();
9307 break;
9308 }
9309
9310 SmallVector<SDValue, 4> LHSElts, RHSElts;
9311 for (SDValue Elt : Op->ops()) {
9312 SDValue LHS = Elt.getOperand(0);
9313 SDValue RHS = Elt.getOperand(1);
9314
9315 // We expect the canonicalized RHS operand to be the constant.
9316 if (!isa<ConstantSDNode>(RHS))
9317 return SDValue();
9318
9319 // Extend shift amounts.
9320 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
9321 if (!IsShift)
9322 return SDValue();
9323 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
9324 }
9325
9326 LHSElts.push_back(LHS);
9327 RHSElts.push_back(RHS);
9328 }
9329
9330 // Limit to shifts by uniform immediates.
9331 // TODO: Only accept vXi8/vXi64 special cases?
9332 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
9333 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
9334 return SDValue();
9335
9336 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
9337 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
9338 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
9339}
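// --- Illustrative sketch (added; not part of the original source). It mirrors
// the uniform-shift restriction above on a plain container: every collected
// RHS shift amount must equal the first one, i.e. the inverse of the
// any_of(...) rejection test. The helper name is an assumption of the sketch.
#include <algorithm>
#include <cstdint>
#include <vector>

static bool shiftAmountsAreUniform(const std::vector<uint64_t> &RHSElts) {
  return std::all_of(RHSElts.begin(), RHSElts.end(),
                     [&](uint64_t V) { return V == RHSElts.front(); });
}

// e.g. build_vector (shl a, 3), (shl b, 3), (shl c, 3), (shl d, 3) collects
// RHSElts = {3, 3, 3, 3} and is accepted, while mixed amounts are rejected.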
9340
9341/// Create a vector constant without a load. SSE/AVX provide the bare minimum
9342/// functionality to do this, so it's all zeros, all ones, or some derivation
9343/// that is cheap to calculate.
9344static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
9345 const X86Subtarget &Subtarget) {
9346 SDLoc DL(Op);
9347 MVT VT = Op.getSimpleValueType();
9348
9349 // Vectors containing all zeros can be matched by pxor and xorps.
9350 if (ISD::isBuildVectorAllZeros(Op.getNode()))
9351 return Op;
9352
9353 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
9354 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
9355 // vpcmpeqd on 256-bit vectors.
9356 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
9357 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
9358 return Op;
9359
9360 return getOnesVector(VT, DAG, DL);
9361 }
9362
9363 return SDValue();
9364}
9365
9366/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
9367/// from a vector of source values and a vector of extraction indices.
9368/// The vectors might be manipulated to match the type of the permute op.
9369static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
9370 SDLoc &DL, SelectionDAG &DAG,
9371 const X86Subtarget &Subtarget) {
9372 MVT ShuffleVT = VT;
9373 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9374 unsigned NumElts = VT.getVectorNumElements();
9375 unsigned SizeInBits = VT.getSizeInBits();
9376
9377 // Adjust IndicesVec to match VT size.
9378 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
9379        "Illegal variable permute mask size");
9380 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
9381 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
9382 NumElts * VT.getScalarSizeInBits());
9383 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
9384
9385  // Handle a SrcVec whose size doesn't match VT.
9386 if (SrcVec.getValueSizeInBits() != SizeInBits) {
9387 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
9388 // Handle larger SrcVec by treating it as a larger permute.
9389 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
9390 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
9391 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9392 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
9393 Subtarget, DAG, SDLoc(IndicesVec));
9394 return extractSubVector(
9395 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
9396 DAG, DL, SizeInBits);
9397 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
9398 // Widen smaller SrcVec to match VT.
9399 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
9400 } else
9401 return SDValue();
9402 }
9403
9404 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
9405    assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
9406 EVT SrcVT = Idx.getValueType();
9407 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
9408 uint64_t IndexScale = 0;
9409 uint64_t IndexOffset = 0;
9410
9411 // If we're scaling a smaller permute op, then we need to repeat the
9412 // indices, scaling and offsetting them as well.
9413 // e.g. v4i32 -> v16i8 (Scale = 4)
9414 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
9415 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
9416 for (uint64_t i = 0; i != Scale; ++i) {
9417 IndexScale |= Scale << (i * NumDstBits);
9418 IndexOffset |= i << (i * NumDstBits);
9419 }
9420
9421 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
9422 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
9423 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
9424 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
9425 return Idx;
9426 };
9427
9428 unsigned Opcode = 0;
9429 switch (VT.SimpleTy) {
9430 default:
9431 break;
9432 case MVT::v16i8:
9433 if (Subtarget.hasSSSE3())
9434 Opcode = X86ISD::PSHUFB;
9435 break;
9436 case MVT::v8i16:
9437 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9438 Opcode = X86ISD::VPERMV;
9439 else if (Subtarget.hasSSSE3()) {
9440 Opcode = X86ISD::PSHUFB;
9441 ShuffleVT = MVT::v16i8;
9442 }
9443 break;
9444 case MVT::v4f32:
9445 case MVT::v4i32:
9446 if (Subtarget.hasAVX()) {
9447 Opcode = X86ISD::VPERMILPV;
9448 ShuffleVT = MVT::v4f32;
9449 } else if (Subtarget.hasSSSE3()) {
9450 Opcode = X86ISD::PSHUFB;
9451 ShuffleVT = MVT::v16i8;
9452 }
9453 break;
9454 case MVT::v2f64:
9455 case MVT::v2i64:
9456 if (Subtarget.hasAVX()) {
9457 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
9458 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9459 Opcode = X86ISD::VPERMILPV;
9460 ShuffleVT = MVT::v2f64;
9461 } else if (Subtarget.hasSSE41()) {
9462 // SSE41 can compare v2i64 - select between indices 0 and 1.
9463 return DAG.getSelectCC(
9464 DL, IndicesVec,
9465 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
9466 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
9467 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
9468 ISD::CondCode::SETEQ);
9469 }
9470 break;
9471 case MVT::v32i8:
9472 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
9473 Opcode = X86ISD::VPERMV;
9474 else if (Subtarget.hasXOP()) {
9475 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
9476 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
9477 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
9478 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
9479 return DAG.getNode(
9480 ISD::CONCAT_VECTORS, DL, VT,
9481 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9482 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9483 } else if (Subtarget.hasAVX()) {
9484 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
9485 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
9486 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9487 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9488 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9489 ArrayRef<SDValue> Ops) {
9490 // Permute Lo and Hi and then select based on index range.
9491      // This works as PSHUFB uses bits[3:0] to permute elements and we don't
9492      // care about bit[7] as it's just an index vector.
9493 SDValue Idx = Ops[2];
9494 EVT VT = Idx.getValueType();
9495 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9496 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9497 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9498 ISD::CondCode::SETGT);
9499 };
9500 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9501 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9502 PSHUFBBuilder);
9503 }
9504 break;
9505 case MVT::v16i16:
9506 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9507 Opcode = X86ISD::VPERMV;
9508 else if (Subtarget.hasAVX()) {
9509 // Scale to v32i8 and perform as v32i8.
9510 IndicesVec = ScaleIndices(IndicesVec, 2);
9511 return DAG.getBitcast(
9512 VT, createVariablePermute(
9513 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9514 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9515 }
9516 break;
9517 case MVT::v8f32:
9518 case MVT::v8i32:
9519 if (Subtarget.hasAVX2())
9520 Opcode = X86ISD::VPERMV;
9521 else if (Subtarget.hasAVX()) {
9522 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9523 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9524 {0, 1, 2, 3, 0, 1, 2, 3});
9525 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9526 {4, 5, 6, 7, 4, 5, 6, 7});
9527 if (Subtarget.hasXOP())
9528 return DAG.getBitcast(
9529 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9530 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9531 // Permute Lo and Hi and then select based on index range.
9532 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9533 SDValue Res = DAG.getSelectCC(
9534 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9535 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9536 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9537 ISD::CondCode::SETGT);
9538 return DAG.getBitcast(VT, Res);
9539 }
9540 break;
9541 case MVT::v4i64:
9542 case MVT::v4f64:
9543 if (Subtarget.hasAVX512()) {
9544 if (!Subtarget.hasVLX()) {
9545 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9546 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9547 SDLoc(SrcVec));
9548 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9549 DAG, SDLoc(IndicesVec));
9550 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9551 DAG, Subtarget);
9552 return extract256BitVector(Res, 0, DAG, DL);
9553 }
9554 Opcode = X86ISD::VPERMV;
9555 } else if (Subtarget.hasAVX()) {
9556 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9557 SDValue LoLo =
9558 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9559 SDValue HiHi =
9560 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9561 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9562 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9563 if (Subtarget.hasXOP())
9564 return DAG.getBitcast(
9565 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9566 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9567 // Permute Lo and Hi and then select based on index range.
9568 // This works as VPERMILPD only uses index bit[1] to permute elements.
9569 SDValue Res = DAG.getSelectCC(
9570 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9571 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9572 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9573 ISD::CondCode::SETGT);
9574 return DAG.getBitcast(VT, Res);
9575 }
9576 break;
9577 case MVT::v64i8:
9578 if (Subtarget.hasVBMI())
9579 Opcode = X86ISD::VPERMV;
9580 break;
9581 case MVT::v32i16:
9582 if (Subtarget.hasBWI())
9583 Opcode = X86ISD::VPERMV;
9584 break;
9585 case MVT::v16f32:
9586 case MVT::v16i32:
9587 case MVT::v8f64:
9588 case MVT::v8i64:
9589 if (Subtarget.hasAVX512())
9590 Opcode = X86ISD::VPERMV;
9591 break;
9592 }
9593 if (!Opcode)
9594 return SDValue();
9595
9596 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9597        (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9598        "Illegal variable permute shuffle type");
9599
9600 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9601 if (Scale > 1)
9602 IndicesVec = ScaleIndices(IndicesVec, Scale);
9603
9604 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9605 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9606
9607 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9608 SDValue Res = Opcode == X86ISD::VPERMV
9609 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9610 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9611 return DAG.getBitcast(VT, Res);
9612}
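// --- Illustrative sketch (added; not part of the original source). It redoes
// the IndexScale/IndexOffset computation from the ScaleIndices lambda above
// with plain integers, confirming the v4i32 -> v16i8 example given in its
// comment (Scale = 4, eight destination bits per index).
#include <cassert>
#include <cstdint>

static void scaleIndicesSketch() {
  const uint64_t Scale = 4;               // four v16i8 lanes per v4i32 lane
  const unsigned NumDstBits = 32 / Scale; // 8 bits per scaled index
  uint64_t IndexScale = 0, IndexOffset = 0;
  for (uint64_t i = 0; i != Scale; ++i) {
    IndexScale |= Scale << (i * NumDstBits);
    IndexOffset |= i << (i * NumDstBits);
  }
  // Each 32-bit index lane is multiplied by 4 and then offset by 0..3 so the
  // four bytes of the selected 32-bit element are addressed in order.
  assert(IndexScale == 0x04040404u);
  assert(IndexOffset == 0x03020100u);
}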
9613
9614// Tries to lower a BUILD_VECTOR composed of extract-extract chains that
9615// together form a permutation of a vector by indices held in a non-constant vector.
9616// (build_vector (extract_elt V, (extract_elt I, 0)),
9617// (extract_elt V, (extract_elt I, 1)),
9618// ...
9619// ->
9620// (vpermv I, V)
9621//
9622// TODO: Handle undefs
9623// TODO: Utilize pshufb and zero mask blending to support more efficient
9624// construction of vectors with constant-0 elements.
9625static SDValue
9626LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
9627 const X86Subtarget &Subtarget) {
9628 SDValue SrcVec, IndicesVec;
9629 // Check for a match of the permute source vector and permute index elements.
9630 // This is done by checking that the i-th build_vector operand is of the form:
9631 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9632 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9633 SDValue Op = V.getOperand(Idx);
9634 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9635 return SDValue();
9636
9637 // If this is the first extract encountered in V, set the source vector,
9638 // otherwise verify the extract is from the previously defined source
9639 // vector.
9640 if (!SrcVec)
9641 SrcVec = Op.getOperand(0);
9642 else if (SrcVec != Op.getOperand(0))
9643 return SDValue();
9644 SDValue ExtractedIndex = Op->getOperand(1);
9645 // Peek through extends.
9646 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9647 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9648 ExtractedIndex = ExtractedIndex.getOperand(0);
9649 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9650 return SDValue();
9651
9652 // If this is the first extract from the index vector candidate, set the
9653 // indices vector, otherwise verify the extract is from the previously
9654 // defined indices vector.
9655 if (!IndicesVec)
9656 IndicesVec = ExtractedIndex.getOperand(0);
9657 else if (IndicesVec != ExtractedIndex.getOperand(0))
9658 return SDValue();
9659
9660 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9661 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9662 return SDValue();
9663 }
9664
9665 SDLoc DL(V);
9666 MVT VT = V.getSimpleValueType();
9667 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9668}
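// --- Illustrative sketch (added; not part of the original source). A scalar
// reference model of the pattern matched above: the build_vector computes
// Out[i] = SrcVec[IndicesVec[i]] for every lane, which is exactly what a
// single variable permute (VPERMV/VPERMILPV/PSHUFB) produces. Names are local
// to the sketch.
#include <array>
#include <cassert>
#include <cstdint>

static void variablePermuteSketch() {
  std::array<uint32_t, 4> Src = {10, 20, 30, 40};
  std::array<uint32_t, 4> Idx = {3, 3, 0, 1}; // not known at compile time
  std::array<uint32_t, 4> Out{};
  for (unsigned i = 0; i != Out.size(); ++i)
    Out[i] = Src[Idx[i]]; // (extract_elt Src, (extract_elt Idx, i))
  assert((Out == std::array<uint32_t, 4>{40, 40, 10, 20}));
}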
9669
9670SDValue
9671X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9672 SDLoc dl(Op);
9673
9674 MVT VT = Op.getSimpleValueType();
9675 MVT EltVT = VT.getVectorElementType();
9676 unsigned NumElems = Op.getNumOperands();
9677
9678 // Generate vectors for predicate vectors.
9679 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9680 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
9681
9682 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
9683 return VectorConstant;
9684
9685 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9686 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
9687 return AddSub;
9688 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
9689 return HorizontalOp;
9690 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
9691 return Broadcast;
9692 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
9693 return BitOp;
9694
9695 unsigned EVTBits = EltVT.getSizeInBits();
9696
9697 unsigned NumZero = 0;
9698 unsigned NumNonZero = 0;
9699 uint64_t NonZeros = 0;
9700 bool IsAllConstants = true;
9701 SmallSet<SDValue, 8> Values;
9702 unsigned NumConstants = NumElems;
9703 for (unsigned i = 0; i < NumElems; ++i) {
9704 SDValue Elt = Op.getOperand(i);
9705 if (Elt.isUndef())
9706 continue;
9707 Values.insert(Elt);
9708 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
9709 IsAllConstants = false;
9710 NumConstants--;
9711 }
9712 if (X86::isZeroNode(Elt))
9713 NumZero++;
9714 else {
9715       assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
9716 NonZeros |= ((uint64_t)1 << i);
9717 NumNonZero++;
9718 }
9719 }
9720
9721 // All undef vector. Return an UNDEF. All zero vectors were handled above.
9722 if (NumNonZero == 0)
9723 return DAG.getUNDEF(VT);
9724
9725 // If we are inserting one variable into a vector of non-zero constants, try
9726 // to avoid loading each constant element as a scalar. Load the constants as a
9727 // vector and then insert the variable scalar element. If insertion is not
9728 // supported, fall back to a shuffle to get the scalar blended with the
9729 // constants. Insertion into a zero vector is handled as a special-case
9730 // somewhere below here.
9731 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9732 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9733 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9734 // Create an all-constant vector. The variable element in the old
9735 // build vector is replaced by undef in the constant vector. Save the
9736 // variable scalar element and its index for use in the insertelement.
9737 LLVMContext &Context = *DAG.getContext();
9738 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9739 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9740 SDValue VarElt;
9741 SDValue InsIndex;
9742 for (unsigned i = 0; i != NumElems; ++i) {
9743 SDValue Elt = Op.getOperand(i);
9744 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9745 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9746 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9747 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9748 else if (!Elt.isUndef()) {
9749       assert(!VarElt.getNode() && !InsIndex.getNode() &&
9750              "Expected one variable element in this vector");
9751 VarElt = Elt;
9752 InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
9753 }
9754 }
9755 Constant *CV = ConstantVector::get(ConstVecOps);
9756 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9757
9758 // The constants we just created may not be legal (eg, floating point). We
9759    // must lower the vector right here because we cannot guarantee that we'll
9760 // legalize it before loading it. This is also why we could not just create
9761 // a new build vector here. If the build vector contains illegal constants,
9762 // it could get split back up into a series of insert elements.
9763 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9764 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9765 MachineFunction &MF = DAG.getMachineFunction();
9766 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9767 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9768 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
9769 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9770 if (InsertC < NumEltsInLow128Bits)
9771 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9772
9773 // There's no good way to insert into the high elements of a >128-bit
9774 // vector, so use shuffles to avoid an extract/insert sequence.
9775    assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9776    assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9777 SmallVector<int, 8> ShuffleMask;
9778 unsigned NumElts = VT.getVectorNumElements();
9779 for (unsigned i = 0; i != NumElts; ++i)
9780 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9781 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9782 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9783 }
9784
9785 // Special case for single non-zero, non-undef, element.
9786 if (NumNonZero == 1) {
9787 unsigned Idx = countTrailingZeros(NonZeros);
9788 SDValue Item = Op.getOperand(Idx);
9789
9790 // If we have a constant or non-constant insertion into the low element of
9791 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9792 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9793 // depending on what the source datatype is.
9794 if (Idx == 0) {
9795 if (NumZero == 0)
9796 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9797
9798 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
9799 (EltVT == MVT::i64 && Subtarget.is64Bit())) {
9800        assert((VT.is128BitVector() || VT.is256BitVector() ||
9801                VT.is512BitVector()) &&
9802               "Expected an SSE value type!");
9803 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9804 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
9805 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9806 }
9807
9808 // We can't directly insert an i8 or i16 into a vector, so zero extend
9809 // it to i32 first.
9810 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9811 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9812 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
9813 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9814 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9815 return DAG.getBitcast(VT, Item);
9816 }
9817 }
9818
9819 // Is it a vector logical left shift?
9820 if (NumElems == 2 && Idx == 1 &&
9821 X86::isZeroNode(Op.getOperand(0)) &&
9822 !X86::isZeroNode(Op.getOperand(1))) {
9823 unsigned NumBits = VT.getSizeInBits();
9824 return getVShift(true, VT,
9825 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9826 VT, Op.getOperand(1)),
9827 NumBits/2, DAG, *this, dl);
9828 }
9829
9830 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9831 return SDValue();
9832
9833 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9834 // is a non-constant being inserted into an element other than the low one,
9835 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9836 // movd/movss) to move this into the low element, then shuffle it into
9837 // place.
9838 if (EVTBits == 32) {
9839 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9840 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9841 }
9842 }
9843
9844 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9845 if (Values.size() == 1) {
9846 if (EVTBits == 32) {
9847 // Instead of a shuffle like this:
9848 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9849 // Check if it's possible to issue this instead.
9850 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
9851 unsigned Idx = countTrailingZeros(NonZeros);
9852 SDValue Item = Op.getOperand(Idx);
9853 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9854 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9855 }
9856 return SDValue();
9857 }
9858
9859 // A vector full of immediates; various special cases are already
9860 // handled, so this is best done with a single constant-pool load.
9861 if (IsAllConstants)
9862 return SDValue();
9863
9864 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
9865 return V;
9866
9867 // See if we can use a vector load to get all of the elements.
9868 {
9869 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
9870 if (SDValue LD =
9871 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9872 return LD;
9873 }
9874
9875 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9876 // build_vector and broadcast it.
9877 // TODO: We could probably generalize this more.
9878 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9879 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9880 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9881 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9882 // Make sure all the even/odd operands match.
9883 for (unsigned i = 2; i != NumElems; ++i)
9884 if (Ops[i % 2] != Op.getOperand(i))
9885 return false;
9886 return true;
9887 };
9888 if (CanSplat(Op, NumElems, Ops)) {
9889 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9890 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9891 // Create a new build vector and cast to v2i64/v2f64.
9892 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9893 DAG.getBuildVector(NarrowVT, dl, Ops));
9894 // Broadcast from v2i64/v2f64 and cast to final VT.
9895 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
9896 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9897 NewBV));
9898 }
9899 }
9900
9901 // For AVX-length vectors, build the individual 128-bit pieces and use
9902 // shuffles to put them in place.
9903 if (VT.getSizeInBits() > 128) {
9904 MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
9905
9906 // Build both the lower and upper subvector.
9907 SDValue Lower =
9908 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9909 SDValue Upper = DAG.getBuildVector(
9910 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9911
9912 // Recreate the wider vector with the lower and upper part.
9913 return concatSubVectors(Lower, Upper, DAG, dl);
9914 }
9915
9916 // Let legalizer expand 2-wide build_vectors.
9917 if (EVTBits == 64) {
9918 if (NumNonZero == 1) {
9919 // One half is zero or undef.
9920 unsigned Idx = countTrailingZeros(NonZeros);
9921 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9922 Op.getOperand(Idx));
9923 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9924 }
9925 return SDValue();
9926 }
9927
9928 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9929 if (EVTBits == 8 && NumElems == 16)
9930 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
9931 DAG, Subtarget))
9932 return V;
9933
9934 if (EVTBits == 16 && NumElems == 8)
9935 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
9936 DAG, Subtarget))
9937 return V;
9938
9939 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9940 if (EVTBits == 32 && NumElems == 4)
9941 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
9942 return V;
9943
9944 // If element VT is == 32 bits, turn it into a number of shuffles.
9945 if (NumElems == 4 && NumZero > 0) {
9946 SmallVector<SDValue, 8> Ops(NumElems);
9947 for (unsigned i = 0; i < 4; ++i) {
9948 bool isZero = !(NonZeros & (1ULL << i));
9949 if (isZero)
9950 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9951 else
9952 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9953 }
9954
9955 for (unsigned i = 0; i < 2; ++i) {
9956 switch ((NonZeros >> (i*2)) & 0x3) {
9957      default: llvm_unreachable("Unexpected NonZero count");
9958 case 0:
9959 Ops[i] = Ops[i*2]; // Must be a zero vector.
9960 break;
9961 case 1:
9962 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9963 break;
9964 case 2:
9965 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9966 break;
9967 case 3:
9968 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9969 break;
9970 }
9971 }
9972
9973 bool Reverse1 = (NonZeros & 0x3) == 2;
9974 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
9975 int MaskVec[] = {
9976 Reverse1 ? 1 : 0,
9977 Reverse1 ? 0 : 1,
9978 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9979 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9980 };
9981 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9982 }
9983
9984  assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9985
9986 // Check for a build vector from mostly shuffle plus few inserting.
9987 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
9988 return Sh;
9989
9990 // For SSE 4.1, use insertps to put the high elements into the low element.
9991 if (Subtarget.hasSSE41()) {
9992 SDValue Result;
9993 if (!Op.getOperand(0).isUndef())
9994 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9995 else
9996 Result = DAG.getUNDEF(VT);
9997
9998 for (unsigned i = 1; i < NumElems; ++i) {
9999 if (Op.getOperand(i).isUndef()) continue;
10000 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
10001 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
10002 }
10003 return Result;
10004 }
10005
10006  // Otherwise, expand into a number of unpckl*; start by extending each of
10007 // our (non-undef) elements to the full vector width with the element in the
10008 // bottom slot of the vector (which generates no code for SSE).
10009 SmallVector<SDValue, 8> Ops(NumElems);
10010 for (unsigned i = 0; i < NumElems; ++i) {
10011 if (!Op.getOperand(i).isUndef())
10012 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10013 else
10014 Ops[i] = DAG.getUNDEF(VT);
10015 }
10016
10017 // Next, we iteratively mix elements, e.g. for v4f32:
10018 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
10019 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
10020 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
10021 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
10022 // Generate scaled UNPCKL shuffle mask.
10023 SmallVector<int, 16> Mask;
10024 for(unsigned i = 0; i != Scale; ++i)
10025 Mask.push_back(i);
10026 for (unsigned i = 0; i != Scale; ++i)
10027 Mask.push_back(NumElems+i);
10028 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
10029
10030 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
10031 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
10032 }
10033 return Ops[0];
10034}
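// --- Illustrative sketch (added; not part of the original source). It
// rebuilds the scaled UNPCKL masks generated by the final loop above for a
// 4-element vector, with -1 standing in for SM_SentinelUndef. Step 1 uses
// {0, 4, -1, -1} (unpcklps of adjacent pairs) and step 2 uses {0, 1, 4, 5}
// (unpcklpd of the two partial results), matching the v4f32 walkthrough in
// the comment above.
#include <cassert>
#include <vector>

static std::vector<int> scaledUnpcklMask(unsigned Scale, unsigned NumElems) {
  std::vector<int> Mask;
  for (unsigned i = 0; i != Scale; ++i)
    Mask.push_back(static_cast<int>(i));               // low elts of op 0
  for (unsigned i = 0; i != Scale; ++i)
    Mask.push_back(static_cast<int>(NumElems + i));    // low elts of op 1
  Mask.insert(Mask.end(), NumElems - Mask.size(), -1); // undef tail
  return Mask;
}

static void unpcklMaskSketch() {
  assert((scaledUnpcklMask(1, 4) == std::vector<int>{0, 4, -1, -1}));
  assert((scaledUnpcklMask(2, 4) == std::vector<int>{0, 1, 4, 5}));
}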
10035
10036// 256-bit AVX can use the vinsertf128 instruction
10037// to create 256-bit vectors from two other 128-bit ones.
10038// TODO: Detect subvector broadcast here instead of DAG combine?
10039static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
10040 const X86Subtarget &Subtarget) {
10041 SDLoc dl(Op);
10042 MVT ResVT = Op.getSimpleValueType();
10043
10044  assert((ResVT.is256BitVector() ||
10045          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
10046
10047 unsigned NumOperands = Op.getNumOperands();
10048 unsigned NumZero = 0;
10049 unsigned NumNonZero = 0;
10050 unsigned NonZeros = 0;
10051 for (unsigned i = 0; i != NumOperands; ++i) {
10052 SDValue SubVec = Op.getOperand(i);
10053 if (SubVec.isUndef())
10054 continue;
10055 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10056 ++NumZero;
10057 else {
10058      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10059 NonZeros |= 1 << i;
10060 ++NumNonZero;
10061 }
10062 }
10063
10064 // If we have more than 2 non-zeros, build each half separately.
10065 if (NumNonZero > 2) {
10066 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10067 ArrayRef<SDUse> Ops = Op->ops();
10068 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10069 Ops.slice(0, NumOperands/2));
10070 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10071 Ops.slice(NumOperands/2));
10072 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10073 }
10074
10075 // Otherwise, build it up through insert_subvectors.
10076 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
10077 : DAG.getUNDEF(ResVT);
10078
10079 MVT SubVT = Op.getOperand(0).getSimpleValueType();
10080 unsigned NumSubElems = SubVT.getVectorNumElements();
10081 for (unsigned i = 0; i != NumOperands; ++i) {
10082 if ((NonZeros & (1 << i)) == 0)
10083 continue;
10084
10085 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
10086 Op.getOperand(i),
10087 DAG.getIntPtrConstant(i * NumSubElems, dl));
10088 }
10089
10090 return Vec;
10091}
10092
10093// Returns true if the given node is a type promotion (by concatenating i1
10094// zeros) of the result of a node that already zeros all upper bits of
10095// a k-register.
10096// TODO: Merge this with LowerAVXCONCAT_VECTORS?
10097static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
10098 const X86Subtarget &Subtarget,
10099 SelectionDAG & DAG) {
10100 SDLoc dl(Op);
10101 MVT ResVT = Op.getSimpleValueType();
10102 unsigned NumOperands = Op.getNumOperands();
10103
10104  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
10105         "Unexpected number of operands in CONCAT_VECTORS");
10106
10107 uint64_t Zeros = 0;
10108 uint64_t NonZeros = 0;
10109 for (unsigned i = 0; i != NumOperands; ++i) {
10110 SDValue SubVec = Op.getOperand(i);
10111 if (SubVec.isUndef())
10112 continue;
10113    assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10114 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10115 Zeros |= (uint64_t)1 << i;
10116 else
10117 NonZeros |= (uint64_t)1 << i;
10118 }
10119
10120 unsigned NumElems = ResVT.getVectorNumElements();
10121
10122  // If we are inserting a non-zero vector and there are zeros in the LSBs and
10123  // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
10124 // insert_subvector will give us two kshifts.
10125 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
10126 Log2_64(NonZeros) != NumOperands - 1) {
10127 MVT ShiftVT = ResVT;
10128 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
10129 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
10130 unsigned Idx = Log2_64(NonZeros);
10131 SDValue SubVec = Op.getOperand(Idx);
10132 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10133 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
10134 DAG.getUNDEF(ShiftVT), SubVec,
10135 DAG.getIntPtrConstant(0, dl));
10136 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
10137 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
10138 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
10139 DAG.getIntPtrConstant(0, dl));
10140 }
10141
10142 // If there are zero or one non-zeros we can handle this very simply.
10143 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
10144 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
10145 if (!NonZeros)
10146 return Vec;
10147 unsigned Idx = Log2_64(NonZeros);
10148 SDValue SubVec = Op.getOperand(Idx);
10149 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10150 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
10151 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
10152 }
10153
10154 if (NumOperands > 2) {
10155 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10156 ArrayRef<SDUse> Ops = Op->ops();
10157 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10158 Ops.slice(0, NumOperands/2));
10159 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10160 Ops.slice(NumOperands/2));
10161 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10162 }
10163
10164  assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
10165
10166 if (ResVT.getVectorNumElements() >= 16)
10167 return Op; // The operation is legal with KUNPCK
10168
10169 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
10170 DAG.getUNDEF(ResVT), Op.getOperand(0),
10171 DAG.getIntPtrConstant(0, dl));
10172 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
10173 DAG.getIntPtrConstant(NumElems/2, dl));
10174}
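// --- Illustrative sketch (added; not part of the original source). It checks
// the KSHIFTL-eligibility test used above with plain bit math: exactly one
// non-zero subvector (NonZeros is a power of two), at least one known-zero
// subvector with the non-zero one above all of them (NonZeros > Zeros), and
// the non-zero subvector not already in the last operand slot. The helper
// names are assumptions of the sketch.
#include <cassert>
#include <cstdint>

static bool isSingleKShiftCandidate(uint64_t NonZeros, uint64_t Zeros,
                                    unsigned NumOperands) {
  // Exactly one non-zero subvector: NonZeros must be a power of two.
  if (NonZeros == 0 || (NonZeros & (NonZeros - 1)) != 0)
    return false;
  // At least one known-zero subvector, all of them below the non-zero one.
  if (Zeros == 0 || NonZeros <= Zeros)
    return false;
  // The non-zero subvector must not already sit in the last operand slot.
  unsigned Idx = 0;
  while ((NonZeros >> (Idx + 1)) != 0)
    ++Idx;
  return Idx != NumOperands - 1;
}

static void kshiftSketch() {
  // concat (zero, v, undef, undef): shift v's bits up by one subvector width.
  assert(isSingleKShiftCandidate(/*NonZeros=*/0b0010, /*Zeros=*/0b0001, 4));
  // Non-zero subvector already last: the plain insert_subvector path is used.
  assert(!isSingleKShiftCandidate(/*NonZeros=*/0b1000, /*Zeros=*/0b0001, 4));
}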
10175
10176static SDValue LowerCONCAT_VECTORS(SDValue Op,
10177 const X86Subtarget &Subtarget,
10178 SelectionDAG &DAG) {
10179 MVT VT = Op.getSimpleValueType();
10180 if (VT.getVectorElementType() == MVT::i1)
10181 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
10182
10183  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
10184         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
10185                                  Op.getNumOperands() == 4)));
10186
10187 // AVX can use the vinsertf128 instruction to create 256-bit vectors
10188 // from two other 128-bit ones.
10189
10190 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
10191 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
10192}
10193
10194//===----------------------------------------------------------------------===//
10195// Vector shuffle lowering
10196//
10197// This is an experimental code path for lowering vector shuffles on x86. It is
10198// designed to handle arbitrary vector shuffles and blends, gracefully
10199// degrading performance as necessary. It works hard to recognize idiomatic
10200// shuffles and lower them to optimal instruction patterns without leaving
10201// a framework that allows reasonably efficient handling of all vector shuffle
10202// patterns.
10203//===----------------------------------------------------------------------===//
10204
10205/// Tiny helper function to identify a no-op mask.
10206///
10207/// This is a somewhat boring predicate function. It checks whether the mask
10208/// array input, which is assumed to be a single-input shuffle mask of the kind
10209/// used by the X86 shuffle instructions (not a fully general
10210/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
10211/// in-place shuffle are 'no-op's.
10212static bool isNoopShuffleMask(ArrayRef<int> Mask) {
10213 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10214    assert(Mask[i] >= -1 && "Out of bound mask element!");
10215 if (Mask[i] >= 0 && Mask[i] != i)
10216 return false;
10217 }
10218 return true;
10219}
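// --- Illustrative sketch (added; not part of the original source). Concrete
// inputs for the predicate above, run on a plain std::vector: undef (-1)
// entries never force a shuffle, so only a defined entry that actually moves
// an element makes the mask a real shuffle.
#include <cassert>
#include <vector>

static bool isNoopMaskSketch(const std::vector<int> &Mask) {
  for (int i = 0, Size = static_cast<int>(Mask.size()); i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;
  return true;
}

static void noopMaskExamples() {
  assert(isNoopMaskSketch({0, 1, 2, 3}));   // identity
  assert(isNoopMaskSketch({0, -1, 2, -1})); // undefs are ignored
  assert(!isNoopMaskSketch({1, 0, 2, 3}));  // elements 0 and 1 swap
}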
10220
10221/// Test whether there are elements crossing 128-bit lanes in this
10222/// shuffle mask.
10223///
10224/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
10225/// and we routinely test for these.
10226static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
10227 int LaneSize = 128 / VT.getScalarSizeInBits();
10228 int Size = Mask.size();
10229 for (int i = 0; i < Size; ++i)
10230 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10231 return true;
10232 return false;
10233}
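// --- Illustrative sketch (added; not part of the original source). The same
// lane-crossing test on plain integers: for a v8f32 shuffle (32-bit elements,
// LaneSize = 4), an element crosses lanes exactly when its source lane
// (Mask[i] % Size) / LaneSize differs from its destination lane i / LaneSize.
#include <cassert>
#include <vector>

static bool crossesLanes(const std::vector<int> &Mask, int LaneSize) {
  int Size = static_cast<int>(Mask.size());
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}

static void laneCrossExamples() {
  // In-lane v8f32 shuffle: every element stays inside its 128-bit lane.
  assert(!crossesLanes({1, 0, 3, 2, 5, 4, 7, 6}, /*LaneSize=*/4));
  // Element 0 pulls from lane 1 (index 4), so this mask crosses lanes.
  assert(crossesLanes({4, 1, 2, 3, 4, 5, 6, 7}, /*LaneSize=*/4));
}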
10234
10235/// Test whether a shuffle mask is equivalent within each sub-lane.
10236///
10237/// This checks a shuffle mask to see if it is performing the same
10238/// lane-relative shuffle in each sub-lane. This trivially implies
10239/// that it is also not lane-crossing. It may however involve a blend from the
10240/// same lane of a second vector.
10241///
10242/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
10243/// non-trivial to compute in the face of undef lanes. The representation is
10244/// suitable for use with existing 128-bit shuffles as entries from the second
10245/// vector have been remapped to [LaneSize, 2*LaneSize).
10246static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
10247 ArrayRef<int> Mask,
10248 SmallVectorImpl<int> &RepeatedMask) {
10249 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10250 RepeatedMask.assign(LaneSize, -1);
10251 int Size = Mask.size();
10252 for (int i = 0; i < Size; ++i) {
10253    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10254 if (Mask[i] < 0)
10255 continue;
10256 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10257 // This entry crosses lanes, so there is no way to model this shuffle.
10258 return false;
10259
10260 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10261 // Adjust second vector indices to start at LaneSize instead of Size.
10262 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10263 : Mask[i] % LaneSize + LaneSize;
10264 if (RepeatedMask[i % LaneSize] < 0)
10265 // This is the first non-undef entry in this slot of a 128-bit lane.
10266 RepeatedMask[i % LaneSize] = LocalM;
10267 else if (RepeatedMask[i % LaneSize] != LocalM)
10268 // Found a mismatch with the repeated mask.
10269 return false;
10270 }
10271 return true;
10272}
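// --- Illustrative sketch (added; not part of the original source). It runs
// the repeated-mask extraction above on plain vectors for a v8f32 shuffle
// (LaneSize = 4): the two-input mask {0, 8, 1, 9, 4, 12, 5, 13} performs the
// same lane-relative interleave in both 128-bit lanes, and the extracted
// repeated mask is {0, 4, 1, 5}, with second-vector entries remapped into
// [LaneSize, 2*LaneSize).
#include <cassert>
#include <vector>

static bool repeatedMaskSketch(const std::vector<int> &Mask, int LaneSize,
                               std::vector<int> &Repeated) {
  int Size = static_cast<int>(Mask.size());
  Repeated.assign(LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      return false; // crosses lanes
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (Repeated[i % LaneSize] < 0)
      Repeated[i % LaneSize] = LocalM;    // first defined entry in this slot
    else if (Repeated[i % LaneSize] != LocalM)
      return false;                       // the lanes disagree
  }
  return true;
}

static void repeatedMaskExample() {
  std::vector<int> Repeated;
  assert(repeatedMaskSketch({0, 8, 1, 9, 4, 12, 5, 13}, 4, Repeated));
  assert((Repeated == std::vector<int>{0, 4, 1, 5}));
}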
10273
10274/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10275static bool
10276is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10277 SmallVectorImpl<int> &RepeatedMask) {
10278 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10279}
10280
10281static bool
10282is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10283 SmallVector<int, 32> RepeatedMask;
10284 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10285}
10286
10287/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10288static bool
10289is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10290 SmallVectorImpl<int> &RepeatedMask) {
10291 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10292}
10293
10294/// Test whether a target shuffle mask is equivalent within each sub-lane.
10295/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10296static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10297 ArrayRef<int> Mask,
10298 SmallVectorImpl<int> &RepeatedMask) {
10299 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10300 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10301 int Size = Mask.size();
10302 for (int i = 0; i < Size; ++i) {
10303    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10304 if (Mask[i] == SM_SentinelUndef)
10305 continue;
10306 if (Mask[i] == SM_SentinelZero) {
10307 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10308 return false;
10309 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10310 continue;
10311 }
10312 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10313 // This entry crosses lanes, so there is no way to model this shuffle.
10314 return false;
10315
10316 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10317 // Adjust second vector indices to start at LaneSize instead of Size.
10318 int LocalM =
10319 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
10320 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10321 // This is the first non-undef entry in this slot of a 128-bit lane.
10322 RepeatedMask[i % LaneSize] = LocalM;
10323 else if (RepeatedMask[i % LaneSize] != LocalM)
10324 // Found a mismatch with the repeated mask.
10325 return false;
10326 }
10327 return true;
10328}
10329
10330/// Checks whether a shuffle mask is equivalent to an explicit list of
10331/// arguments.
10332///
10333/// This is a fast way to test a shuffle mask against a fixed pattern:
10334///
10335/// if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
10336///
10337/// It returns true if the mask is exactly as wide as the argument list, and
10338/// each element of the mask is either -1 (signifying undef) or the value given
10339/// in the argument.
10340static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
10341 ArrayRef<int> ExpectedMask) {
10342 if (Mask.size() != ExpectedMask.size())
10343 return false;
10344
10345 int Size = Mask.size();
10346
10347 // If the values are build vectors, we can look through them to find
10348 // equivalent inputs that make the shuffles equivalent.
10349 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
10350 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
10351
10352 for (int i = 0; i < Size; ++i) {
10353    assert(Mask[i] >= -1 && "Out of bound mask element!");
10354 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
10355 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
10356 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
10357 if (!MaskBV || !ExpectedBV ||
10358 MaskBV->getOperand(Mask[i] % Size) !=
10359 ExpectedBV->getOperand(ExpectedMask[i] % Size))
10360 return false;
10361 }
10362 }
10363
10364 return true;
10365}
10366
10367/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10368///
10369/// The masks must be exactly the same width.
10370///
10371/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10372/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10373///
10374/// SM_SentinelZero is accepted as a valid negative index but must match in
10375/// both.
10376static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
10377 ArrayRef<int> ExpectedMask,
10378 SDValue V1 = SDValue(),
10379 SDValue V2 = SDValue()) {
10380 int Size = Mask.size();
10381 if (Size != (int)ExpectedMask.size())
10382 return false;
10383  assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
10384         "Illegal target shuffle mask");
10385
10386 // Check for out-of-range target shuffle mask indices.
10387 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10388 return false;
10389
10390 // If the values are build vectors, we can look through them to find
10391 // equivalent inputs that make the shuffles equivalent.
10392 auto *BV1 = dyn_cast_or_null<BuildVectorSDNode>(V1);
10393 auto *BV2 = dyn_cast_or_null<BuildVectorSDNode>(V2);
10394 BV1 = ((BV1 && Size != (int)BV1->getNumOperands()) ? nullptr : BV1);
10395 BV2 = ((BV2 && Size != (int)BV2->getNumOperands()) ? nullptr : BV2);
10396
10397 for (int i = 0; i < Size; ++i) {
10398 if (Mask[i] == SM_SentinelUndef || Mask[i] == ExpectedMask[i])
10399 continue;
10400 if (0 <= Mask[i] && 0 <= ExpectedMask[i]) {
10401 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
10402 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
10403 if (MaskBV && ExpectedBV &&
10404 MaskBV->getOperand(Mask[i] % Size) ==
10405 ExpectedBV->getOperand(ExpectedMask[i] % Size))
10406 continue;
10407 }
10408 // TODO - handle SM_Sentinel equivalences.
10409 return false;
10410 }
10411 return true;
10412}
10413
10414// Attempt to create a shuffle mask from a VSELECT condition mask.
10415static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
10416 SDValue Cond) {
10417 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
10418 return false;
10419
10420 unsigned Size = Cond.getValueType().getVectorNumElements();
10421 Mask.resize(Size, SM_SentinelUndef);
10422
10423 for (int i = 0; i != (int)Size; ++i) {
10424 SDValue CondElt = Cond.getOperand(i);
10425 Mask[i] = i;
10426 // Arbitrarily choose from the 2nd operand if the select condition element
10427 // is undef.
10428 // TODO: Can we do better by matching patterns such as even/odd?
10429 if (CondElt.isUndef() || isNullConstant(CondElt))
10430 Mask[i] += Size;
10431 }
10432
10433 return true;
10434}
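// Illustrative example: a v4i32 condition of <-1, -1, 0, undef> (true, true,
// false, undef) yields the shuffle mask <0, 1, 6, 7>: true positions select
// from the first operand and false/undef positions are remapped to the second.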
10435
10436// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10437// instructions.
10438static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
10439 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10440 return false;
10441
10442 SmallVector<int, 8> Unpcklwd;
10443 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10444 /* Unary = */ false);
10445 SmallVector<int, 8> Unpckhwd;
10446 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10447 /* Unary = */ false);
10448 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
10449 isTargetShuffleEquivalent(Mask, Unpckhwd));
10450 return IsUnpackwdMask;
10451}
10452
10453static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
10454 // Create 128-bit vector type based on mask size.
10455 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10456 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10457
10458 // We can't assume a canonical shuffle mask, so try the commuted version too.
10459 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
10460 ShuffleVectorSDNode::commuteMask(CommutedMask);
10461
10462 // Match any of unary/binary or low/high.
10463 for (unsigned i = 0; i != 4; ++i) {
10464 SmallVector<int, 16> UnpackMask;
10465 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10466 if (isTargetShuffleEquivalent(Mask, UnpackMask) ||
10467 isTargetShuffleEquivalent(CommutedMask, UnpackMask))
10468 return true;
10469 }
10470 return false;
10471}
10472
10473/// Return true if a shuffle mask chooses elements identically in its top and
10474/// bottom halves. For example, any splat mask has the same top and bottom
10475/// halves. If an element is undefined in only one half of the mask, the halves
10476/// are not considered identical.
10477static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
10478  assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10479 unsigned HalfSize = Mask.size() / 2;
10480 for (unsigned i = 0; i != HalfSize; ++i) {
10481 if (Mask[i] != Mask[i + HalfSize])
10482 return false;
10483 }
10484 return true;
10485}
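// Illustrative example: <0, 3, 0, 3> has identical halves, whereas
// <0, 3, 0, -1> does not, because the undef appears in only one half.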
10486
10487/// Get a 4-lane 8-bit shuffle immediate for a mask.
10488///
10489/// This helper function produces an 8-bit shuffle immediate corresponding to
10490/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10491/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10492/// example.
10493///
10494/// NB: We rely heavily on "undef" masks preserving the input lane.
10495static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10496  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10497  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10498  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10499  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10500  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10501
10502 unsigned Imm = 0;
10503 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10504 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10505 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10506 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10507 return Imm;
10508}
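// Illustrative example: the mask <3, 1, 2, 0> encodes as
// 3 | (1 << 2) | (2 << 4) | (0 << 6) = 0x27, and an undef element defaults to
// its own index so the corresponding input lane is preserved.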
10509
10510static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10511 SelectionDAG &DAG) {
10512 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10513}
10514
10515// The shuffle result is of the form:
10516// 0*a[0]0*a[1]...0*a[n] , n >= 0, where the a[] elements appear in ascending order.
10517// Each element of Zeroable corresponds to a particular element of Mask,
10518// as described in the computeZeroableShuffleElements function.
10519//
10520// The function looks for a sub-mask whose nonzero elements are in
10521// increasing order. If such a sub-mask exists, the function returns true.
10522static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10523 ArrayRef<int> Mask, const EVT &VectorType,
10524 bool &IsZeroSideLeft) {
10525 int NextElement = -1;
10526 // Check if the Mask's nonzero elements are in increasing order.
10527 for (int i = 0, e = Mask.size(); i < e; i++) {
10528    // Checks that the mask's zero elements are built from only zeros.
10529    assert(Mask[i] >= -1 && "Out of bound mask element!");
10530 if (Mask[i] < 0)
10531 return false;
10532 if (Zeroable[i])
10533 continue;
10534 // Find the lowest non zero element
10535 if (NextElement < 0) {
10536 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10537 IsZeroSideLeft = NextElement != 0;
10538 }
10539 // Exit if the mask's non zero elements are not in increasing order.
10540 if (NextElement != Mask[i])
10541 return false;
10542 NextElement++;
10543 }
10544 return true;
10545}
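// Illustrative example: for a v4i32 shuffle with Mask = <0, 1, 6, 7> and
// elements 2 and 3 zeroable, the nonzero elements 0, 1 are consecutive starting
// at 0, so this returns true with IsZeroSideLeft == false (zeros on the right).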
10546
10547/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10548static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10549 ArrayRef<int> Mask, SDValue V1,
10550 SDValue V2, const APInt &Zeroable,
10551 const X86Subtarget &Subtarget,
10552 SelectionDAG &DAG) {
10553 int Size = Mask.size();
10554 int LaneSize = 128 / VT.getScalarSizeInBits();
10555 const int NumBytes = VT.getSizeInBits() / 8;
10556 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10557
10558  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10559         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10560         (Subtarget.hasBWI() && VT.is512BitVector()));
10561
10562 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10563 // Sign bit set in i8 mask means zero element.
10564 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10565
10566 SDValue V;
10567 for (int i = 0; i < NumBytes; ++i) {
10568 int M = Mask[i / NumEltBytes];
10569 if (M < 0) {
10570 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10571 continue;
10572 }
10573 if (Zeroable[i / NumEltBytes]) {
10574 PSHUFBMask[i] = ZeroMask;
10575 continue;
10576 }
10577
10578 // We can only use a single input of V1 or V2.
10579 SDValue SrcV = (M >= Size ? V2 : V1);
10580 if (V && V != SrcV)
10581 return SDValue();
10582 V = SrcV;
10583 M %= Size;
10584
10585 // PSHUFB can't cross lanes, ensure this doesn't happen.
10586 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10587 return SDValue();
10588
10589 M = M % LaneSize;
10590 M = M * NumEltBytes + (i % NumEltBytes);
10591 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10592 }
10593  assert(V && "Failed to find a source input");
10594
10595 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10596 return DAG.getBitcast(
10597 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10598 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10599}
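// Illustrative example: for a v8i16 shuffle that reads element 3 of V1 into
// position 2, NumEltBytes is 2, so PSHUFB mask bytes 4 and 5 become 3*2+0 = 6
// and 3*2+1 = 7, while bytes of zeroable elements are set to 0x80.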
10600
10601static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10602 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10603 const SDLoc &dl);
10604
10605// X86 has a dedicated shuffle that can be lowered to VEXPAND.
10606static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
10607 const APInt &Zeroable,
10608 ArrayRef<int> Mask, SDValue &V1,
10609 SDValue &V2, SelectionDAG &DAG,
10610 const X86Subtarget &Subtarget) {
10611 bool IsLeftZeroSide = true;
10612 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10613 IsLeftZeroSide))
10614 return SDValue();
10615 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10616 MVT IntegerType =
10617 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10618 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10619 unsigned NumElts = VT.getVectorNumElements();
10620  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10621         "Unexpected number of vector elements");
10622 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10623 Subtarget, DAG, DL);
10624 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10625 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10626 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10627}
10628
10629static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10630 unsigned &UnpackOpcode, bool IsUnary,
10631 ArrayRef<int> TargetMask,
10632 const SDLoc &DL, SelectionDAG &DAG,
10633 const X86Subtarget &Subtarget) {
10634 int NumElts = VT.getVectorNumElements();
10635
10636 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10637 for (int i = 0; i != NumElts; i += 2) {
10638 int M1 = TargetMask[i + 0];
10639 int M2 = TargetMask[i + 1];
10640 Undef1 &= (SM_SentinelUndef == M1);
10641 Undef2 &= (SM_SentinelUndef == M2);
10642 Zero1 &= isUndefOrZero(M1);
10643 Zero2 &= isUndefOrZero(M2);
10644 }
10645  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10646         "Zeroable shuffle detected");
10647
10648 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10649 SmallVector<int, 64> Unpckl, Unpckh;
10650 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10651 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
10652 UnpackOpcode = X86ISD::UNPCKL;
10653 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10654 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10655 return true;
10656 }
10657
10658 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10659 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
10660 UnpackOpcode = X86ISD::UNPCKH;
10661 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10662 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10663 return true;
10664 }
10665
10666 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
10667 if (IsUnary && (Zero1 || Zero2)) {
10668 // Don't bother if we can blend instead.
10669 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10670 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10671 return false;
10672
10673 bool MatchLo = true, MatchHi = true;
10674 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10675 int M = TargetMask[i];
10676
10677 // Ignore if the input is known to be zero or the index is undef.
10678 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10679 (M == SM_SentinelUndef))
10680 continue;
10681
10682 MatchLo &= (M == Unpckl[i]);
10683 MatchHi &= (M == Unpckh[i]);
10684 }
10685
10686 if (MatchLo || MatchHi) {
10687 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10688 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10689 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10690 return true;
10691 }
10692 }
10693
10694 // If a binary shuffle, commute and try again.
10695 if (!IsUnary) {
10696 ShuffleVectorSDNode::commuteMask(Unpckl);
10697 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
10698 UnpackOpcode = X86ISD::UNPCKL;
10699 std::swap(V1, V2);
10700 return true;
10701 }
10702
10703 ShuffleVectorSDNode::commuteMask(Unpckh);
10704 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
10705 UnpackOpcode = X86ISD::UNPCKH;
10706 std::swap(V1, V2);
10707 return true;
10708 }
10709 }
10710
10711 return false;
10712}
10713
10714// X86 has dedicated unpack instructions that can handle specific blend
10715// operations: UNPCKH and UNPCKL.
10716static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
10717 ArrayRef<int> Mask, SDValue V1, SDValue V2,
10718 SelectionDAG &DAG) {
10719 SmallVector<int, 8> Unpckl;
10720 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10721 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
10722 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10723
10724 SmallVector<int, 8> Unpckh;
10725 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10726 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
10727 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10728
10729 // Commute and try again.
10730 ShuffleVectorSDNode::commuteMask(Unpckl);
10731 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
10732 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10733
10734 ShuffleVectorSDNode::commuteMask(Unpckh);
10735 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
10736 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10737
10738 return SDValue();
10739}
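// Illustrative example: for MVT::v4i32 the binary unpack masks are
// UNPCKL = <0, 4, 1, 5> and UNPCKH = <2, 6, 3, 7>, so a shuffle mask of
// <2, 6, 3, 7> lowers directly to X86ISD::UNPCKH with operands (V1, V2).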
10740
10741static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
10742 int Delta) {
10743 int Size = (int)Mask.size();
10744 int Split = Size / Delta;
10745 int TruncatedVectorStart = SwappedOps ? Size : 0;
10746
10747 // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
10748 if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
10749 return false;
10750
10751 // The rest of the mask should not refer to the truncated vector's elements.
10752 if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
10753 TruncatedVectorStart + Size))
10754 return false;
10755
10756 return true;
10757}
10758
10759// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10760//
10761// An example is the following:
10762//
10763// t0: ch = EntryToken
10764// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10765// t25: v4i32 = truncate t2
10766// t41: v8i16 = bitcast t25
10767// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10768// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10769// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10770// t18: v2i64 = bitcast t51
10771//
10772// Without avx512vl, this is lowered to:
10773//
10774// vpmovqd %zmm0, %ymm0
10775// vpshufb {{.*#+}} xmm0 =
10776// xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
10777//
10778// But when avx512vl is available, one can just use a single vpmovdw
10779// instruction.
10780static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
10781 MVT VT, SDValue V1, SDValue V2,
10782 SelectionDAG &DAG,
10783 const X86Subtarget &Subtarget) {
10784 if (VT != MVT::v16i8 && VT != MVT::v8i16)
10785 return SDValue();
10786
10787 if (Mask.size() != VT.getVectorNumElements())
10788 return SDValue();
10789
10790 bool SwappedOps = false;
10791
10792 if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
10793 if (!ISD::isBuildVectorAllZeros(V1.getNode()))
10794 return SDValue();
10795
10796 std::swap(V1, V2);
10797 SwappedOps = true;
10798 }
10799
10800 // Look for:
10801 //
10802 // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
10803 // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
10804 //
10805 // and similar ones.
10806 if (V1.getOpcode() != ISD::BITCAST)
10807 return SDValue();
10808 if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
10809 return SDValue();
10810
10811 SDValue Src = V1.getOperand(0).getOperand(0);
10812 MVT SrcVT = Src.getSimpleValueType();
10813
10814 // The vptrunc** instructions truncating 128 bit and 256 bit vectors
10815 // are only available with avx512vl.
10816 if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
10817 return SDValue();
10818
10819 // Down Convert Word to Byte is only available with avx512bw. The case with
10820 // 256-bit output doesn't contain a shuffle and is therefore not handled here.
10821 if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
10822 !Subtarget.hasBWI())
10823 return SDValue();
10824
10825 // The first half/quarter of the mask should refer to every second/fourth
10826  // element of the truncated and bitcasted vector.
10827 if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) &&
10828 !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4))
10829 return SDValue();
10830
10831 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
10832}
10833
10834// X86 has dedicated pack instructions that can handle specific truncation
10835// operations: PACKSS and PACKUS.
10836static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
10837 SDValue &V2, unsigned &PackOpcode,
10838 ArrayRef<int> TargetMask,
10839 SelectionDAG &DAG,
10840 const X86Subtarget &Subtarget) {
10841 unsigned NumElts = VT.getVectorNumElements();
10842 unsigned BitSize = VT.getScalarSizeInBits();
10843 MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
10844 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
10845
10846 auto MatchPACK = [&](SDValue N1, SDValue N2) {
10847 SDValue VV1 = DAG.getBitcast(PackVT, N1);
10848 SDValue VV2 = DAG.getBitcast(PackVT, N2);
10849 if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
10850 APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
10851 if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
10852 (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
10853 V1 = VV1;
10854 V2 = VV2;
10855 SrcVT = PackVT;
10856 PackOpcode = X86ISD::PACKUS;
10857 return true;
10858 }
10859 }
10860 if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
10861 (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
10862 V1 = VV1;
10863 V2 = VV2;
10864 SrcVT = PackVT;
10865 PackOpcode = X86ISD::PACKSS;
10866 return true;
10867 }
10868 return false;
10869 };
10870
10871 // Try binary shuffle.
10872 SmallVector<int, 32> BinaryMask;
10873 createPackShuffleMask(VT, BinaryMask, false);
10874 if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2))
10875 if (MatchPACK(V1, V2))
10876 return true;
10877
10878 // Try unary shuffle.
10879 SmallVector<int, 32> UnaryMask;
10880 createPackShuffleMask(VT, UnaryMask, true);
10881 if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1))
10882 if (MatchPACK(V1, V1))
10883 return true;
10884
10885 return false;
10886}
10887
10888static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
10889 SDValue V1, SDValue V2, SelectionDAG &DAG,
10890 const X86Subtarget &Subtarget) {
10891 MVT PackVT;
10892 unsigned PackOpcode;
10893 if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10894 Subtarget))
10895 return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
10896 DAG.getBitcast(PackVT, V2));
10897
10898 return SDValue();
10899}
10900
10901/// Try to emit a bitmask instruction for a shuffle.
10902///
10903/// This handles cases where we can model a blend exactly as a bitmask due to
10904/// one of the inputs being zeroable.
10905static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10906 SDValue V2, ArrayRef<int> Mask,
10907 const APInt &Zeroable,
10908 const X86Subtarget &Subtarget,
10909 SelectionDAG &DAG) {
10910 MVT MaskVT = VT;
10911 MVT EltVT = VT.getVectorElementType();
10912 SDValue Zero, AllOnes;
10913 // Use f64 if i64 isn't legal.
10914 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10915 EltVT = MVT::f64;
10916 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
10917 }
10918
10919 MVT LogicVT = VT;
10920 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
10921 Zero = DAG.getConstantFP(0.0, DL, EltVT);
10922 AllOnes = DAG.getConstantFP(
10923 APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT);
10924 LogicVT =
10925 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
10926 } else {
10927 Zero = DAG.getConstant(0, DL, EltVT);
10928 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10929 }
10930
10931 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
10932 SDValue V;
10933 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10934 if (Zeroable[i])
10935 continue;
10936 if (Mask[i] % Size != i)
10937 return SDValue(); // Not a blend.
10938 if (!V)
10939 V = Mask[i] < Size ? V1 : V2;
10940 else if (V != (Mask[i] < Size ? V1 : V2))
10941 return SDValue(); // Can only let one input through the mask.
10942
10943 VMaskOps[i] = AllOnes;
10944 }
10945 if (!V)
10946 return SDValue(); // No non-zeroable elements!
10947
10948 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
10949 VMask = DAG.getBitcast(LogicVT, VMask);
10950 V = DAG.getBitcast(LogicVT, V);
10951 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
10952 return DAG.getBitcast(VT, And);
10953}
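// Illustrative example: with Mask = <0, 5, 2, 7> and result elements 1 and 3
// zeroable, every non-zeroable element comes from V1 in place, so the shuffle
// reduces to V1 & <-1, 0, -1, 0>.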
10954
10955/// Try to emit a blend instruction for a shuffle using bit math.
10956///
10957/// This is used as a fallback approach when first class blend instructions are
10958/// unavailable. Currently it is only suitable for integer vectors, but could
10959/// be generalized for floating point vectors if desirable.
10960static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
10961 SDValue V2, ArrayRef<int> Mask,
10962 SelectionDAG &DAG) {
10963  assert(VT.isInteger() && "Only supports integer vector types!");
10964 MVT EltVT = VT.getVectorElementType();
10965 SDValue Zero = DAG.getConstant(0, DL, EltVT);
10966 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10967 SmallVector<SDValue, 16> MaskOps;
10968 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10969 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
10970 return SDValue(); // Shuffled input!
10971 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
10972 }
10973
10974 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
10975 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
10976 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
10977 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
10978}
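// Illustrative example: Mask = <0, 5, 2, 7> builds V1Mask = <-1, 0, -1, 0>, and
// the result is (V1 & V1Mask) | (~V1Mask & V2), a bitwise select between the
// two unshuffled inputs.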
10979
10980static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
10981 SDValue PreservedSrc,
10982 const X86Subtarget &Subtarget,
10983 SelectionDAG &DAG);
10984
10985static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
10986 MutableArrayRef<int> Mask,
10987 const APInt &Zeroable, bool &ForceV1Zero,
10988 bool &ForceV2Zero, uint64_t &BlendMask) {
10989 bool V1IsZeroOrUndef =
10990 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
10991 bool V2IsZeroOrUndef =
10992 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
10993
10994 BlendMask = 0;
10995 ForceV1Zero = false, ForceV2Zero = false;
10996  assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
10997
10998 // Attempt to generate the binary blend mask. If an input is zero then
10999 // we can use any lane.
11000 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11001 int M = Mask[i];
11002 if (M == SM_SentinelUndef)
11003 continue;
11004 if (M == i)
11005 continue;
11006 if (M == i + Size) {
11007 BlendMask |= 1ull << i;
11008 continue;
11009 }
11010 if (Zeroable[i]) {
11011 if (V1IsZeroOrUndef) {
11012 ForceV1Zero = true;
11013 Mask[i] = i;
11014 continue;
11015 }
11016 if (V2IsZeroOrUndef) {
11017 ForceV2Zero = true;
11018 BlendMask |= 1ull << i;
11019 Mask[i] = i + Size;
11020 continue;
11021 }
11022 }
11023 return false;
11024 }
11025 return true;
11026}
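// Illustrative example: with Size = 4 and Mask = <0, 5, 2, 7>, positions 1 and
// 3 take the second input, so BlendMask = 0b1010; a zeroable position can
// instead be satisfied by forcing V1 or V2 to zero.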
11027
11028static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
11029 int Scale) {
11030 uint64_t ScaledMask = 0;
11031 for (int i = 0; i != Size; ++i)
11032 if (BlendMask & (1ull << i))
11033 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
11034 return ScaledMask;
11035}
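// Illustrative example: BlendMask = 0b0101 with Size = 4 and Scale = 2 expands
// each selected bit into two consecutive bits, giving 0b00110011.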
11036
11037/// Try to emit a blend instruction for a shuffle.
11038///
11039/// This doesn't do any checks for the availability of instructions for blending
11040/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11041/// be matched in the backend with the type given. What it does check for is
11042/// that the shuffle mask is a blend, or convertible into a blend with zero.
11043static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
11044 SDValue V2, ArrayRef<int> Original,
11045 const APInt &Zeroable,
11046 const X86Subtarget &Subtarget,
11047 SelectionDAG &DAG) {
11048 uint64_t BlendMask = 0;
11049 bool ForceV1Zero = false, ForceV2Zero = false;
11050 SmallVector<int, 64> Mask(Original.begin(), Original.end());
11051 if (!matchVectorShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11052 BlendMask))
11053 return SDValue();
11054
11055 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11056 if (ForceV1Zero)
11057 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11058 if (ForceV2Zero)
11059 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11060
11061 switch (VT.SimpleTy) {
11062 case MVT::v4i64:
11063 case MVT::v8i32:
11064    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11065    LLVM_FALLTHROUGH;
11066 case MVT::v4f64:
11067 case MVT::v8f32:
11068    assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11069    LLVM_FALLTHROUGH;
11070 case MVT::v2f64:
11071 case MVT::v2i64:
11072 case MVT::v4f32:
11073 case MVT::v4i32:
11074 case MVT::v8i16:
11075    assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11076 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11077 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11078 case MVT::v16i16: {
11079    assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11080 SmallVector<int, 8> RepeatedMask;
11081 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11082 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11083      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11084 BlendMask = 0;
11085 for (int i = 0; i < 8; ++i)
11086 if (RepeatedMask[i] >= 8)
11087 BlendMask |= 1ull << i;
11088 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11089 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11090 }
11091 // Use PBLENDW for lower/upper lanes and then blend lanes.
11092 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11093 // merge to VSELECT where useful.
11094 uint64_t LoMask = BlendMask & 0xFF;
11095 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11096 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11097 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11098 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11099 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11100 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11101 return DAG.getVectorShuffle(
11102 MVT::v16i16, DL, Lo, Hi,
11103 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11104 }
11105    LLVM_FALLTHROUGH;
11106 }
11107 case MVT::v32i8:
11108    assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11109    LLVM_FALLTHROUGH;
11110 case MVT::v16i8: {
11111    assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11112
11113 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11114 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11115 Subtarget, DAG))
11116 return Masked;
11117
11118 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11119 MVT IntegerType =
11120 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11121 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11122 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11123 }
11124
11125 // Scale the blend by the number of bytes per element.
11126 int Scale = VT.getScalarSizeInBits() / 8;
11127
11128 // This form of blend is always done on bytes. Compute the byte vector
11129 // type.
11130 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11131
11132 // x86 allows load folding with blendvb from the 2nd source operand. But
11133 // we are still using LLVM select here (see comment below), so that's V1.
11134 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11135 // allow that load-folding possibility.
11136 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11137 ShuffleVectorSDNode::commuteMask(Mask);
11138 std::swap(V1, V2);
11139 }
11140
11141 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11142 // mix of LLVM's code generator and the x86 backend. We tell the code
11143 // generator that boolean values in the elements of an x86 vector register
11144 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11145 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11146 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11147 // of the element (the remaining are ignored) and 0 in that high bit would
11148 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11149 // the LLVM model for boolean values in vector elements gets the relevant
11150 // bit set, it is set backwards and over constrained relative to x86's
11151 // actual model.
11152 SmallVector<SDValue, 32> VSELECTMask;
11153 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11154 for (int j = 0; j < Scale; ++j)
11155 VSELECTMask.push_back(
11156 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
11157 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
11158 MVT::i8));
11159
11160 V1 = DAG.getBitcast(BlendVT, V1);
11161 V2 = DAG.getBitcast(BlendVT, V2);
11162 return DAG.getBitcast(
11163 VT,
11164 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11165 V1, V2));
11166 }
11167 case MVT::v16f32:
11168 case MVT::v8f64:
11169 case MVT::v8i64:
11170 case MVT::v16i32:
11171 case MVT::v32i16:
11172 case MVT::v64i8: {
11173 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11174 bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
11175 if (!OptForSize) {
11176 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11177 Subtarget, DAG))
11178 return Masked;
11179 }
11180
11181 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11182 // masked move.
11183 MVT IntegerType =
11184 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11185 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11186 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11187 }
11188 default:
11189    llvm_unreachable("Not a supported integer vector type!");
11190 }
11191}
11192
11193/// Try to lower as a blend of elements from two inputs followed by
11194/// a single-input permutation.
11195///
11196/// This matches the pattern where we can blend elements from two inputs and
11197/// then reduce the shuffle to a single-input permutation.
11198static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11199 SDValue V1, SDValue V2,
11200 ArrayRef<int> Mask,
11201 SelectionDAG &DAG,
11202 bool ImmBlends = false) {
11203 // We build up the blend mask while checking whether a blend is a viable way
11204 // to reduce the shuffle.
11205 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11206 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11207
11208 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11209 if (Mask[i] < 0)
11210 continue;
11211
11212    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11213
11214 if (BlendMask[Mask[i] % Size] < 0)
11215 BlendMask[Mask[i] % Size] = Mask[i];
11216 else if (BlendMask[Mask[i] % Size] != Mask[i])
11217 return SDValue(); // Can't blend in the needed input!
11218
11219 PermuteMask[i] = Mask[i] % Size;
11220 }
11221
11222 // If only immediate blends, then bail if the blend mask can't be widened to
11223 // i16.
11224 unsigned EltSize = VT.getScalarSizeInBits();
11225 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11226 return SDValue();
11227
11228 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11229 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11230}
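// Illustrative example: a v4i32 mask of <6, 1, 0, 7> first blends into
// <0, 1, 6, 7> (each element kept in its source position) and then applies the
// single-input permute <2, 1, 0, 3>.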
11231
11232/// Try to lower as an unpack of elements from two inputs followed by
11233/// a single-input permutation.
11234///
11235/// This matches the pattern where we can unpack elements from two inputs and
11236/// then reduce the shuffle to a single-input (wider) permutation.
11237static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11238 SDValue V1, SDValue V2,
11239 ArrayRef<int> Mask,
11240 SelectionDAG &DAG) {
11241 int NumElts = Mask.size();
11242 int NumLanes = VT.getSizeInBits() / 128;
11243 int NumLaneElts = NumElts / NumLanes;
11244 int NumHalfLaneElts = NumLaneElts / 2;
11245
11246 bool MatchLo = true, MatchHi = true;
11247 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11248
11249 // Determine UNPCKL/UNPCKH type and operand order.
11250 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11251 for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
11252 int M = Mask[Lane + Elt];
11253 if (M < 0)
11254 continue;
11255
11256 SDValue &Op = Ops[Elt & 1];
11257 if (M < NumElts && (Op.isUndef() || Op == V1))
11258 Op = V1;
11259 else if (NumElts <= M && (Op.isUndef() || Op == V2))
11260 Op = V2;
11261 else
11262 return SDValue();
11263
11264 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11265 MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
11266 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
11267 MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
11268 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
11269 if (!MatchLo && !MatchHi)
11270 return SDValue();
11271 }
11272 }
11273  assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11274
11275 // Now check that each pair of elts come from the same unpack pair
11276 // and set the permute mask based on each pair.
11277 // TODO - Investigate cases where we permute individual elements.
11278 SmallVector<int, 32> PermuteMask(NumElts, -1);
11279 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11280 for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
11281 int M0 = Mask[Lane + Elt + 0];
11282 int M1 = Mask[Lane + Elt + 1];
11283 if (0 <= M0 && 0 <= M1 &&
11284 (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
11285 return SDValue();
11286 if (0 <= M0)
11287 PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
11288 if (0 <= M1)
11289 PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
11290 }
11291 }
11292
11293 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11294 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11295 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11296}
11297
11298/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11299/// permuting the elements of the result in place.
11300static SDValue lowerShuffleAsByteRotateAndPermute(
11301 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11302 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11303 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11304 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11305 (VT.is512BitVector() && !Subtarget.hasBWI()))
11306 return SDValue();
11307
11308 // We don't currently support lane crossing permutes.
11309 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11310 return SDValue();
11311
11312 int Scale = VT.getScalarSizeInBits() / 8;
11313 int NumLanes = VT.getSizeInBits() / 128;
11314 int NumElts = VT.getVectorNumElements();
11315 int NumEltsPerLane = NumElts / NumLanes;
11316
11317 // Determine range of mask elts.
11318 bool Blend1 = true;
11319 bool Blend2 = true;
11320  std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11321  std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11322 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11323 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11324 int M = Mask[Lane + Elt];
11325 if (M < 0)
11326 continue;
11327 if (M < NumElts) {
11328 Blend1 &= (M == (Lane + Elt));
11329        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11330 M = M % NumEltsPerLane;
11331 Range1.first = std::min(Range1.first, M);
11332 Range1.second = std::max(Range1.second, M);
11333 } else {
11334 M -= NumElts;
11335 Blend2 &= (M == (Lane + Elt));
11336        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11337 M = M % NumEltsPerLane;
11338 Range2.first = std::min(Range2.first, M);
11339 Range2.second = std::max(Range2.second, M);
11340 }
11341 }
11342 }
11343
11344 // Bail if we don't need both elements.
11345 // TODO - it might be worth doing this for unary shuffles if the permute
11346 // can be widened.
11347 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11348 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11349 return SDValue();
11350
11351 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11352 return SDValue();
11353
11354 // Rotate the 2 ops so we can access both ranges, then permute the result.
11355 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11356 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11357 SDValue Rotate = DAG.getBitcast(
11358 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11359 DAG.getBitcast(ByteVT, Lo),
11360 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11361 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11362 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11363 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11364 int M = Mask[Lane + Elt];
11365 if (M < 0)
11366 continue;
11367 if (M < NumElts)
11368 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11369 else
11370 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11371 }
11372 }
11373 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11374 };
11375
11376 // Check if the ranges are small enough to rotate from either direction.
11377 if (Range2.second < Range1.first)
11378 return RotateAndPermute(V1, V2, Range1.first, 0);
11379 if (Range1.second < Range2.first)
11380 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11381 return SDValue();
11382}
11383
11384/// Generic routine to decompose a shuffle and blend into independent
11385/// blends and permutes.
11386///
11387/// This matches the extremely common pattern for handling combined
11388/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11389/// operations. It will try to pick the best arrangement of shuffles and
11390/// blends.
11391static SDValue lowerShuffleAsDecomposedShuffleBlend(
11392 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11393 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11394 // Shuffle the input elements into the desired positions in V1 and V2 and
11395 // blend them together.
11396 SmallVector<int, 32> V1Mask(Mask.size(), -1);
11397 SmallVector<int, 32> V2Mask(Mask.size(), -1);
11398 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11399 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11400 if (Mask[i] >= 0 && Mask[i] < Size) {
11401 V1Mask[i] = Mask[i];
11402 BlendMask[i] = i;
11403 } else if (Mask[i] >= Size) {
11404 V2Mask[i] = Mask[i] - Size;
11405 BlendMask[i] = i + Size;
11406 }
11407
11408 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11409 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11410 // the shuffle may be able to fold with a load or other benefit. However, when
11411 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11412 // pre-shuffle first is a better strategy.
11413 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11414 // Only prefer immediate blends to unpack/rotate.
11415 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11416 DAG, true))
11417 return BlendPerm;
11418 if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
11419 DAG))
11420 return UnpackPerm;
11421 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11422 DL, VT, V1, V2, Mask, Subtarget, DAG))
11423 return RotatePerm;
11424 // Unpack/rotate failed - try again with variable blends.
11425 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11426 DAG))
11427 return BlendPerm;
11428 }
11429
11430 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11431 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11432 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11433}
11434
11435/// Try to lower a vector shuffle as a rotation.
11436///
11437/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11438static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) {
11439 int NumElts = Mask.size();
11440
11441 // We need to detect various ways of spelling a rotation:
11442 // [11, 12, 13, 14, 15, 0, 1, 2]
11443 // [-1, 12, 13, 14, -1, -1, 1, -1]
11444 // [-1, -1, -1, -1, -1, -1, 1, 2]
11445 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11446 // [-1, 4, 5, 6, -1, -1, 9, -1]
11447 // [-1, 4, 5, 6, -1, -1, -1, -1]
11448 int Rotation = 0;
11449 SDValue Lo, Hi;
11450 for (int i = 0; i < NumElts; ++i) {
11451 int M = Mask[i];
11452    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11453           "Unexpected mask index.");
11454 if (M < 0)
11455 continue;
11456
11457 // Determine where a rotated vector would have started.
11458 int StartIdx = i - (M % NumElts);
11459 if (StartIdx == 0)
11460 // The identity rotation isn't interesting, stop.
11461 return -1;
11462
11463 // If we found the tail of a vector the rotation must be the missing
11464 // front. If we found the head of a vector, the rotation is how much of
11465 // that head fits into the result.
11466 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11467
11468 if (Rotation == 0)
11469 Rotation = CandidateRotation;
11470 else if (Rotation != CandidateRotation)
11471 // The rotations don't match, so we can't match this mask.
11472 return -1;
11473
11474 // Compute which value this mask is pointing at.
11475 SDValue MaskV = M < NumElts ? V1 : V2;
11476
11477 // Compute which of the two target values this index should be assigned
11478 // to. This reflects whether the high elements are remaining or the low
11479 // elements are remaining.
11480 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11481
11482 // Either set up this value if we've not encountered it before, or check
11483 // that it remains consistent.
11484 if (!TargetV)
11485 TargetV = MaskV;
11486 else if (TargetV != MaskV)
11487 // This may be a rotation, but it pulls from the inputs in some
11488 // unsupported interleaving.
11489 return -1;
11490 }
11491
11492 // Check that we successfully analyzed the mask, and normalize the results.
11493 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11494 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11495 if (!Lo)
11496 Lo = Hi;
11497 else if (!Hi)
11498 Hi = Lo;
11499
11500 V1 = Lo;
11501 V2 = Hi;
11502
11503 return Rotation;
11504}
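
A simplified, self-contained version of the rotation matcher (illustrative only; it drops the SDValue source tracking and keeps just the index arithmetic) behaves like this on the example masks from the comment block:

// Illustrative sketch: detect whether a shuffle mask spells a rotation of
// the concatenation of two NumElts-wide vectors. Returns the rotation
// amount in elements, or -1 if the mask is not a single rotation.
#include <cstdio>
#include <vector>

static int matchRotation(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  int Rotation = 0;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef lanes match anything
    // Where would a rotated vector have started for this element?
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      return -1; // identity rotation, not interesting
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1; // inconsistent rotation amounts
  }
  return Rotation == 0 ? -1 : Rotation;
}

int main() {
  // The v8i16 example from the comment block above: a rotation by 3 elements.
  std::printf("%d\n", matchRotation({11, 12, 13, 14, 15, 0, 1, 2})); // 3
  std::printf("%d\n", matchRotation({-1, 4, 5, 6, -1, -1, 9, -1}));  // 3
  return 0;
}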
11505
11506/// Try to lower a vector shuffle as a byte rotation.
11507///
11508/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11509/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11510/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11511/// try to generically lower a vector shuffle through such a pattern. It
11512/// does not check for the profitability of lowering either as PALIGNR or
11513/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11514/// This matches shuffle vectors that look like:
11515///
11516/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11517///
11518/// Essentially it concatenates V1 and V2, shifts right by some number of
11519/// elements, and takes the low elements as the result. Note that while this is
11520/// specified as a *right shift* because x86 is little-endian, it is a *left
11521/// rotate* of the vector lanes.
11522static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11523 ArrayRef<int> Mask) {
11524 // Don't accept any shuffles with zero elements.
11525 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
11526 return -1;
11527
11528 // PALIGNR works on 128-bit lanes.
11529 SmallVector<int, 16> RepeatedMask;
11530 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11531 return -1;
11532
11533 int Rotation = matchShuffleAsRotate(V1, V2, RepeatedMask);
11534 if (Rotation <= 0)
11535 return -1;
11536
11537 // PALIGNR rotates bytes, so we need to scale the
11538 // rotation based on how many bytes are in the vector lane.
11539 int NumElts = RepeatedMask.size();
11540 int Scale = 16 / NumElts;
11541 return Rotation * Scale;
11542}
11543
11544static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11545 SDValue V2, ArrayRef<int> Mask,
11546 const X86Subtarget &Subtarget,
11547 SelectionDAG &DAG) {
11548 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11549
11550 SDValue Lo = V1, Hi = V2;
11551 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11552 if (ByteRotation <= 0)
11553 return SDValue();
11554
11555 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11556 // PSLLDQ/PSRLDQ.
11557 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11558 Lo = DAG.getBitcast(ByteVT, Lo);
11559 Hi = DAG.getBitcast(ByteVT, Hi);
11560
11561 // SSSE3 targets can use the palignr instruction.
11562 if (Subtarget.hasSSSE3()) {
11563 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11564 "512-bit PALIGNR requires BWI instructions");
11565 return DAG.getBitcast(
11566 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11567 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11568 }
11569
11570 assert(VT.is128BitVector() &&
11571 "Rotate-based lowering only supports 128-bit lowering!");
11572 assert(Mask.size() <= 16 &&
11573 "Can shuffle at most 16 bytes in a 128-bit vector!");
11574 assert(ByteVT == MVT::v16i8 &&
11575 "SSE2 rotate lowering only needed for v16i8!");
11576
11577 // Default SSE2 implementation
11578 int LoByteShift = 16 - ByteRotation;
11579 int HiByteShift = ByteRotation;
11580
11581 SDValue LoShift =
11582 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11583 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11584 SDValue HiShift =
11585 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11586 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11587 return DAG.getBitcast(VT,
11588 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11589}
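
The SSE2 fallback at the end of the function can be modelled on plain byte arrays; the following sketch (not LLVM code, helper names invented) mirrors the VSHLDQ/VSRLDQ/OR sequence:

// Illustrative sketch: emulate a byte rotation of the 32-byte concatenation
// down to its low 16 bytes using two whole-vector byte shifts and an OR,
// the same shape as the SSE2 fallback above.
#include <array>
#include <cstdio>

using V16 = std::array<unsigned char, 16>;

static V16 shiftLeftBytes(V16 V, int N) {  // like PSLLDQ: toward higher indices
  V16 R{};
  for (int i = 15; i >= N; --i)
    R[i] = V[i - N];
  return R;
}

static V16 shiftRightBytes(V16 V, int N) { // like PSRLDQ: toward lower indices
  V16 R{};
  for (int i = 0; i + N < 16; ++i)
    R[i] = V[i + N];
  return R;
}

static V16 byteRotate(V16 Lo, V16 Hi, int ByteRotation) {
  V16 L = shiftLeftBytes(Lo, 16 - ByteRotation);
  V16 H = shiftRightBytes(Hi, ByteRotation);
  V16 R;
  for (int i = 0; i < 16; ++i)
    R[i] = L[i] | H[i]; // POR
  return R;
}

int main() {
  V16 Lo, Hi;
  for (int i = 0; i < 16; ++i) { Lo[i] = i; Hi[i] = 16 + i; }
  V16 R = byteRotate(Lo, Hi, 5);
  for (unsigned char B : R)
    std::printf("%d ", B); // 21 22 ... 31 0 1 2 3 4
  std::printf("\n");
  return 0;
}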
11590
11591/// Try to lower a vector shuffle as a dword/qword rotation.
11592///
11593/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
11594/// rotation of the concatenation of two vectors; this routine will
11595/// try to generically lower a vector shuffle through such a pattern.
11596///
11597/// Essentially it concatenates V1 and V2, shifts right by some number of
11598/// elements, and takes the low elements as the result. Note that while this is
11599/// specified as a *right shift* because x86 is little-endian, it is a *left
11600/// rotate* of the vector lanes.
11601static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
11602 SDValue V2, ArrayRef<int> Mask,
11603 const X86Subtarget &Subtarget,
11604 SelectionDAG &DAG) {
11605 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
11606 "Only 32-bit and 64-bit elements are supported!");
11607
11608 // 128/256-bit vectors are only supported with VLX.
11609 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
11610 && "VLX required for 128/256-bit vectors");
11611
11612 SDValue Lo = V1, Hi = V2;
11613 int Rotation = matchShuffleAsRotate(Lo, Hi, Mask);
11614 if (Rotation <= 0)
11615 return SDValue();
11616
11617 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
11618 DAG.getTargetConstant(Rotation, DL, MVT::i8));
11619}
11620
11621/// Try to lower a vector shuffle as a byte shift sequence.
11622static SDValue lowerVectorShuffleAsByteShiftMask(
11623 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11624 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11625 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11626 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
11627
11628 // We need a shuffle that has zeros at one/both ends and a sequential
11629 // shuffle from one source within.
11630 unsigned ZeroLo = Zeroable.countTrailingOnes();
11631 unsigned ZeroHi = Zeroable.countLeadingOnes();
11632 if (!ZeroLo && !ZeroHi)
11633 return SDValue();
11634
11635 unsigned NumElts = Mask.size();
11636 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11637 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
11638 return SDValue();
11639
11640 unsigned Scale = VT.getScalarSizeInBits() / 8;
11641 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
11642 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
11643 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
11644 return SDValue();
11645
11646 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11647 Res = DAG.getBitcast(MVT::v16i8, Res);
11648
11649 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
11650 // inner sequential set of elements, possibly offset:
11651 // 01234567 --> zzzzzz01 --> 1zzzzzzz
11652 // 01234567 --> 4567zzzz --> zzzzz456
11653 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
11654 if (ZeroLo == 0) {
11655 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11656 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11657 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11658 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11659 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
11660 } else if (ZeroHi == 0) {
11661 unsigned Shift = Mask[ZeroLo] % NumElts;
11662 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11663 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11664 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11665 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11666 } else if (!Subtarget.hasSSSE3()) {
11667 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
11668 // by performing 3 byte shifts. Shuffle combining can kick in above that.
11669 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
11670 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11671 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11672 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11673 Shift += Mask[ZeroLo] % NumElts;
11674 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11675 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11676 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11677 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11678 } else
11679 return SDValue();
11680
11681 return DAG.getBitcast(VT, Res);
11682}
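
A standalone sketch of the three-shift trick used above, assuming a 16-byte vector modelled as std::array (names and driver are invented for illustration):

// Illustrative sketch: zero both ends of a 16-byte vector while keeping a
// contiguous inner run, using only whole-vector byte shifts - the same trick
// lowerVectorShuffleAsByteShiftMask uses to avoid an AND with a constant mask.
#include <array>
#include <cstdio>

using V16 = std::array<unsigned char, 16>;

static V16 shl(V16 V, int N) { // PSLLDQ-style shift toward higher indices
  V16 R{};
  for (int i = 15; i >= N; --i) R[i] = V[i - N];
  return R;
}
static V16 shr(V16 V, int N) { // PSRLDQ-style shift toward lower indices
  V16 R{};
  for (int i = 0; i + N < 16; ++i) R[i] = V[i + N];
  return R;
}

// Keep bytes [Lo, Lo+Len) of V in place and force every other byte to zero.
static V16 keepInnerRun(V16 V, int Lo, int Len) {
  int ZeroHi = 16 - (Lo + Len);
  V = shl(V, ZeroHi);      // push the run against the top, zeroing the low end
  V = shr(V, ZeroHi + Lo); // pull it back past zero, clearing the high end
  V = shl(V, Lo);          // reposition the run at its original offset
  return V;
}

int main() {
  V16 V;
  for (int i = 0; i < 16; ++i) V[i] = 100 + i;
  V16 R = keepInnerRun(V, 4, 6); // keep bytes 4..9, zero the rest
  for (unsigned char B : R) std::printf("%d ", B);
  std::printf("\n"); // 0 0 0 0 104 105 106 107 108 109 0 0 0 0 0 0
  return 0;
}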
11683
11684/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
11685///
11686/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
11687/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
11688/// matches elements from one of the input vectors shuffled to the left or
11689/// right with zeroable elements 'shifted in'. It handles both the strictly
11690/// bit-wise element shifts and the byte shift across an entire 128-bit double
11691/// quad word lane.
11692///
11693/// PSLL : (little-endian) left bit shift.
11694/// [ zz, 0, zz, 2 ]
11695/// [ -1, 4, zz, -1 ]
11696/// PSRL : (little-endian) right bit shift.
11697/// [ 1, zz, 3, zz]
11698/// [ -1, -1, 7, zz]
11699/// PSLLDQ : (little-endian) left byte shift
11700/// [ zz, 0, 1, 2, 3, 4, 5, 6]
11701/// [ zz, zz, -1, -1, 2, 3, 4, -1]
11702/// [ zz, zz, zz, zz, zz, zz, -1, 1]
11703/// PSRLDQ : (little-endian) right byte shift
11704/// [ 5, 6, 7, zz, zz, zz, zz, zz]
11705/// [ -1, 5, 6, 7, zz, zz, zz, zz]
11706/// [ 1, 2, -1, -1, -1, -1, zz, zz]
11707static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
11708 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
11709 int MaskOffset, const APInt &Zeroable,
11710 const X86Subtarget &Subtarget) {
11711 int Size = Mask.size();
11712 unsigned SizeInBits = Size * ScalarSizeInBits;
11713
11714 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
11715 for (int i = 0; i < Size; i += Scale)
11716 for (int j = 0; j < Shift; ++j)
11717 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
11718 return false;
11719
11720 return true;
11721 };
11722
11723 auto MatchShift = [&](int Shift, int Scale, bool Left) {
11724 for (int i = 0; i != Size; i += Scale) {
11725 unsigned Pos = Left ? i + Shift : i;
11726 unsigned Low = Left ? i : i + Shift;
11727 unsigned Len = Scale - Shift;
11728 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
11729 return -1;
11730 }
11731
11732 int ShiftEltBits = ScalarSizeInBits * Scale;
11733 bool ByteShift = ShiftEltBits > 64;
11734 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
11735 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
11736 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
11737
11738 // Normalize the scale for byte shifts to still produce an i64 element
11739 // type.
11740 Scale = ByteShift ? Scale / 2 : Scale;
11741
11742 // We need to round trip through the appropriate type for the shift.
11743 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
11744 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
11745 : MVT::getVectorVT(ShiftSVT, Size / Scale);
11746 return (int)ShiftAmt;
11747 };
11748
11749 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
11750 // keep doubling the size of the integer elements up to that. We can
11751 // then shift the elements of the integer vector by whole multiples of
11752 // their width within the elements of the larger integer vector. Test each
11753 // multiple to see if we can find a match with the moved element indices
11754 // and that the shifted in elements are all zeroable.
11755 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
11756 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
11757 for (int Shift = 1; Shift != Scale; ++Shift)
11758 for (bool Left : {true, false})
11759 if (CheckZeros(Shift, Scale, Left)) {
11760 int ShiftAmt = MatchShift(Shift, Scale, Left);
11761 if (0 < ShiftAmt)
11762 return ShiftAmt;
11763 }
11764
11765 // no match
11766 return -1;
11767}
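
For intuition, a stripped-down matcher that only handles the unscaled, one-input case (no element widening) might look like the following sketch; it is not the LLVM helper, just an illustration of the CheckZeros/MatchShift idea:

// Illustrative sketch: check whether a one-input shuffle mask (with a
// zeroable bitmap) can be produced by a single logical left or right element
// shift that shifts zeros in.
#include <cstdio>
#include <vector>

// Returns the shift amount in elements (positive = matched), or -1.
static int matchShiftMask(const std::vector<int> &Mask,
                          const std::vector<bool> &Zeroable, bool Left) {
  int Size = (int)Mask.size();
  for (int Shift = 1; Shift < Size; ++Shift) {
    bool OK = true;
    for (int i = 0; i < Size && OK; ++i) {
      bool ShiftedIn = Left ? (i < Shift) : (i >= Size - Shift);
      if (ShiftedIn)
        OK = Zeroable[i];             // shifted-in lanes must be zero
      else if (Mask[i] >= 0)
        OK = Mask[i] == (Left ? i - Shift : i + Shift); // rest is sequential
    }
    if (OK)
      return Shift;
  }
  return -1;
}

int main() {
  // PSRLDQ-style mask from the comment block: [5, 6, 7, zz, zz, zz, zz, zz].
  std::vector<int> Mask = {5, 6, 7, -1, -1, -1, -1, -1};
  std::vector<bool> Zeroable = {false, false, false, true, true, true, true, true};
  std::printf("%d\n", matchShiftMask(Mask, Zeroable, /*Left=*/false)); // 5
  return 0;
}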
11768
11769static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
11770 SDValue V2, ArrayRef<int> Mask,
11771 const APInt &Zeroable,
11772 const X86Subtarget &Subtarget,
11773 SelectionDAG &DAG) {
11774 int Size = Mask.size();
11775 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11776
11777 MVT ShiftVT;
11778 SDValue V = V1;
11779 unsigned Opcode;
11780
11781 // Try to match shuffle against V1 shift.
11782 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11783 Mask, 0, Zeroable, Subtarget);
11784
11785 // If V1 failed, try to match shuffle against V2 shift.
11786 if (ShiftAmt < 0) {
11787 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11788 Mask, Size, Zeroable, Subtarget);
11789 V = V2;
11790 }
11791
11792 if (ShiftAmt < 0)
11793 return SDValue();
11794
11795 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
11796 "Illegal integer vector type");
11797 V = DAG.getBitcast(ShiftVT, V);
11798 V = DAG.getNode(Opcode, DL, ShiftVT, V,
11799 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
11800 return DAG.getBitcast(VT, V);
11801}
11802
11803// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
11804// Remainder of lower half result is zero and upper half is all undef.
11805static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
11806 ArrayRef<int> Mask, uint64_t &BitLen,
11807 uint64_t &BitIdx, const APInt &Zeroable) {
11808 int Size = Mask.size();
11809 int HalfSize = Size / 2;
11810 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11811 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
11812
11813 // Upper half must be undefined.
11814 if (!isUndefUpperHalf(Mask))
11815 return false;
11816
11817 // Determine the extraction length from the part of the
11818 // lower half that isn't zeroable.
11819 int Len = HalfSize;
11820 for (; Len > 0; --Len)
11821 if (!Zeroable[Len - 1])
11822 break;
11823 assert(Len > 0 && "Zeroable shuffle mask");
11824
11825 // Attempt to match first Len sequential elements from the lower half.
11826 SDValue Src;
11827 int Idx = -1;
11828 for (int i = 0; i != Len; ++i) {
11829 int M = Mask[i];
11830 if (M == SM_SentinelUndef)
11831 continue;
11832 SDValue &V = (M < Size ? V1 : V2);
11833 M = M % Size;
11834
11835 // The extracted elements must start at a valid index and all mask
11836 // elements must be in the lower half.
11837 if (i > M || M >= HalfSize)
11838 return false;
11839
11840 if (Idx < 0 || (Src == V && Idx == (M - i))) {
11841 Src = V;
11842 Idx = M - i;
11843 continue;
11844 }
11845 return false;
11846 }
11847
11848 if (!Src || Idx < 0)
11849 return false;
11850
11851 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
11852 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11853 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11854 V1 = Src;
11855 return true;
11856}
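
For reference, a scalar model of what the matched EXTRQI ultimately computes on the low 64 bits (an illustrative sketch under the assumption 0 < BitLen <= 64, not the instruction's full edge-case semantics):

// Illustrative sketch: extract BitLen bits starting at BitIdx from the
// low 64 bits of the source and zero-fill the rest of the low half
// (the upper half of the vector is left undefined by the instruction).
#include <cstdint>
#include <cstdio>

static uint64_t extrq(uint64_t Src, unsigned BitLen, unsigned BitIdx) {
  uint64_t Field = Src >> BitIdx;
  if (BitLen < 64) // a shift by 64 would be undefined, so treat 64 as "keep all"
    Field &= (uint64_t(1) << BitLen) - 1;
  return Field;
}

int main() {
  // Extract the second 16-bit element (BitLen = 16, BitIdx = 16).
  uint64_t Src = 0x4444333322221111ULL;
  std::printf("%016llx\n", (unsigned long long)extrq(Src, 16, 16)); // 0000000000002222
  return 0;
}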
11857
11858// INSERTQ: Extract lowest Len elements from lower half of second source and
11859// insert over first source, starting at Idx.
11860// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
11861static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
11862 ArrayRef<int> Mask, uint64_t &BitLen,
11863 uint64_t &BitIdx) {
11864 int Size = Mask.size();
11865 int HalfSize = Size / 2;
11866 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11867
11868 // Upper half must be undefined.
11869 if (!isUndefUpperHalf(Mask))
11870 return false;
11871
11872 for (int Idx = 0; Idx != HalfSize; ++Idx) {
11873 SDValue Base;
11874
11875 // Attempt to match first source from mask before insertion point.
11876 if (isUndefInRange(Mask, 0, Idx)) {
11877 /* EMPTY */
11878 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
11879 Base = V1;
11880 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
11881 Base = V2;
11882 } else {
11883 continue;
11884 }
11885
11886 // Extend the extraction length looking to match both the insertion of
11887 // the second source and the remaining elements of the first.
11888 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
11889 SDValue Insert;
11890 int Len = Hi - Idx;
11891
11892 // Match insertion.
11893 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
11894 Insert = V1;
11895 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
11896 Insert = V2;
11897 } else {
11898 continue;
11899 }
11900
11901 // Match the remaining elements of the lower half.
11902 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
11903 /* EMPTY */
11904 } else if ((!Base || (Base == V1)) &&
11905 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
11906 Base = V1;
11907 } else if ((!Base || (Base == V2)) &&
11908 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
11909 Size + Hi)) {
11910 Base = V2;
11911 } else {
11912 continue;
11913 }
11914
11915 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11916 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11917 V1 = Base;
11918 V2 = Insert;
11919 return true;
11920 }
11921 }
11922
11923 return false;
11924}
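
Similarly, a scalar sketch of the INSERTQI bit-field insertion being matched here (illustrative only; assumes BitIdx + BitLen <= 64):

// Illustrative sketch: the lowest BitLen bits of Insert replace the bits
// [BitIdx, BitIdx+BitLen) of Base, matching the
// { A[0..Idx-1], B[0..Len-1], A[Idx+Len..] } pattern described above.
#include <cstdint>
#include <cstdio>

static uint64_t insertq(uint64_t Base, uint64_t Insert, unsigned BitLen,
                        unsigned BitIdx) {
  uint64_t FieldMask =
      (BitLen < 64) ? ((uint64_t(1) << BitLen) - 1) : ~uint64_t(0);
  uint64_t Field = (Insert & FieldMask) << BitIdx;
  uint64_t Hole = ~(FieldMask << BitIdx);
  return (Base & Hole) | Field;
}

int main() {
  // Replace the second 16-bit element of Base with the low element of Insert.
  uint64_t Base = 0x4444333322221111ULL;
  uint64_t Insert = 0x000000000000AAAAULL;
  std::printf("%016llx\n",
              (unsigned long long)insertq(Base, Insert, 16, 16)); // 44443333aaaa1111
  return 0;
}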
11925
11926/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
11927static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
11928 SDValue V2, ArrayRef<int> Mask,
11929 const APInt &Zeroable, SelectionDAG &DAG) {
11930 uint64_t BitLen, BitIdx;
11931 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
11932 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
11933 DAG.getTargetConstant(BitLen, DL, MVT::i8),
11934 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
11935
11936 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
11937 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
11938 V2 ? V2 : DAG.getUNDEF(VT),
11939 DAG.getTargetConstant(BitLen, DL, MVT::i8),
11940 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
11941
11942 return SDValue();
11943}
11944
11945/// Lower a vector shuffle as a zero or any extension.
11946///
11947/// Given a specific number of elements, element bit width, and extension
11948/// stride, produce either a zero or any extension based on the available
11949/// features of the subtarget. The extended elements are consecutive and
11950/// can start from an offset element index in the input; to
11951/// avoid excess shuffling, the offset must either be in the bottom lane
11952/// or at the start of a higher lane. All extended elements must be from
11953/// the same lane.
11954static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
11955 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
11956 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11957 assert(Scale > 1 && "Need a scale to extend.");
11958 int EltBits = VT.getScalarSizeInBits();
11959 int NumElements = VT.getVectorNumElements();
11960 int NumEltsPerLane = 128 / EltBits;
11961 int OffsetLane = Offset / NumEltsPerLane;
11962 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
11963 "Only 8, 16, and 32 bit elements can be extended.");
11964 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
11965 assert(0 <= Offset && "Extension offset must be positive.");
11966 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
11967 "Extension offset must be in the first lane or start an upper lane.");
11968
11969 // Check that an index is in same lane as the base offset.
11970 auto SafeOffset = [&](int Idx) {
11971 return OffsetLane == (Idx / NumEltsPerLane);
11972 };
11973
11974 // Shift along an input so that the offset base moves to the first element.
11975 auto ShuffleOffset = [&](SDValue V) {
11976 if (!Offset)
11977 return V;
11978
11979 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
11980 for (int i = 0; i * Scale < NumElements; ++i) {
11981 int SrcIdx = i + Offset;
11982 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
11983 }
11984 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
11985 };
11986
11987 // Found a valid a/zext mask! Try various lowering strategies based on the
11988 // input type and available ISA extensions.
11989 if (Subtarget.hasSSE41()) {
11990 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
11991 // PUNPCK will catch this in a later shuffle match.
11992 if (Offset && Scale == 2 && VT.is128BitVector())
11993 return SDValue();
11994 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
11995 NumElements / Scale);
11996 InputV = ShuffleOffset(InputV);
11997 InputV = getExtendInVec(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND, DL,
11998 ExtVT, InputV, DAG);
11999 return DAG.getBitcast(VT, InputV);
12000 }
12001
12002 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12003
12004 // For any extends we can cheat for larger element sizes and use shuffle
12005 // instructions that can fold with a load and/or copy.
12006 if (AnyExt && EltBits == 32) {
12007 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12008 -1};
12009 return DAG.getBitcast(
12010 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12011 DAG.getBitcast(MVT::v4i32, InputV),
12012 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12013 }
12014 if (AnyExt && EltBits == 16 && Scale > 2) {
12015 int PSHUFDMask[4] = {Offset / 2, -1,
12016 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12017 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12018 DAG.getBitcast(MVT::v4i32, InputV),
12019 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12020 int PSHUFWMask[4] = {1, -1, -1, -1};
12021 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12022 return DAG.getBitcast(
12023 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12024 DAG.getBitcast(MVT::v8i16, InputV),
12025 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12026 }
12027
12028 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12029 // to 64-bits.
12030 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12031 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12032 assert(VT.is128BitVector() && "Unexpected vector width!");
12033
12034 int LoIdx = Offset * EltBits;
12035 SDValue Lo = DAG.getBitcast(
12036 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12037 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12038 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12039
12040 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12041 return DAG.getBitcast(VT, Lo);
12042
12043 int HiIdx = (Offset + 1) * EltBits;
12044 SDValue Hi = DAG.getBitcast(
12045 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12046 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12047 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12048 return DAG.getBitcast(VT,
12049 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12050 }
12051
12052 // If this would require more than 2 unpack instructions to expand, use
12053 // pshufb when available. We can only use more than 2 unpack instructions
12054 // when zero extending i8 elements which also makes it easier to use pshufb.
12055 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12056 assert(NumElements == 16 && "Unexpected byte vector width!");
12057 SDValue PSHUFBMask[16];
12058 for (int i = 0; i < 16; ++i) {
12059 int Idx = Offset + (i / Scale);
12060 if ((i % Scale == 0 && SafeOffset(Idx))) {
12061 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12062 continue;
12063 }
12064 PSHUFBMask[i] =
12065 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12066 }
12067 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12068 return DAG.getBitcast(
12069 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12070 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12071 }
12072
12073 // If we are extending from an offset, ensure we start on a boundary that
12074 // we can unpack from.
12075 int AlignToUnpack = Offset % (NumElements / Scale);
12076 if (AlignToUnpack) {
12077 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12078 for (int i = AlignToUnpack; i < NumElements; ++i)
12079 ShMask[i - AlignToUnpack] = i;
12080 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12081 Offset -= AlignToUnpack;
12082 }
12083
12084 // Otherwise emit a sequence of unpacks.
12085 do {
12086 unsigned UnpackLoHi = X86ISD::UNPCKL;
12087 if (Offset >= (NumElements / 2)) {
12088 UnpackLoHi = X86ISD::UNPCKH;
12089 Offset -= (NumElements / 2);
12090 }
12091
12092 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12093 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12094 : getZeroVector(InputVT, Subtarget, DAG, DL);
12095 InputV = DAG.getBitcast(InputVT, InputV);
12096 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12097 Scale /= 2;
12098 EltBits *= 2;
12099 NumElements /= 2;
12100 } while (Scale > 1);
12101 return DAG.getBitcast(VT, InputV);
12102}
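
The trailing unpack loop can be pictured on plain bytes: interleaving with a zero vector doubles the element width each round. A small sketch (not LLVM code) for the i8 -> i32 case:

// Illustrative sketch: zero-extend the low elements of a 16-byte vector by
// repeatedly interleaving with a zero vector, the shape of the
// UNPCKL-with-zero loop above, shown for Scale = 4 (i8 -> i32).
#include <cstdio>
#include <vector>

// Interleave the low halves of A and B: A0,B0,A1,B1,...
static std::vector<unsigned char> unpackLo(const std::vector<unsigned char> &A,
                                           const std::vector<unsigned char> &B) {
  std::vector<unsigned char> R(A.size());
  for (size_t i = 0; i < A.size() / 2; ++i) {
    R[2 * i] = A[i];
    R[2 * i + 1] = B[i];
  }
  return R;
}

int main() {
  std::vector<unsigned char> V(16), Zero(16, 0);
  for (int i = 0; i < 16; ++i) V[i] = 10 + i;
  int Scale = 4;
  while (Scale > 1) { // two rounds of unpack for i8 -> i32
    V = unpackLo(V, Zero);
    Scale /= 2;
  }
  for (unsigned char B : V) std::printf("%d ", B);
  std::printf("\n"); // 10 0 0 0 11 0 0 0 12 0 0 0 13 0 0 0
  return 0;
}

In little-endian byte order each group of four bytes is the zero-extended i32 value of one original i8 element.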
12103
12104/// Try to lower a vector shuffle as a zero extension on any microarch.
12105///
12106/// This routine will try to do everything in its power to cleverly lower
12107/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12108/// check for the profitability of this lowering, it tries to aggressively
12109/// match this pattern. It will use all of the micro-architectural details it
12110/// can to emit an efficient lowering. It handles both blends with all-zero
12111/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12112/// masking out later).
12113///
12114/// The reason we have dedicated lowering for zext-style shuffles is that they
12115/// are both incredibly common and often quite performance sensitive.
12116static SDValue lowerShuffleAsZeroOrAnyExtend(
12117 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12118 const APInt &Zeroable, const X86Subtarget &Subtarget,
12119 SelectionDAG &DAG) {
12120 int Bits = VT.getSizeInBits();
12121 int NumLanes = Bits / 128;
12122 int NumElements = VT.getVectorNumElements();
12123 int NumEltsPerLane = NumElements / NumLanes;
12124 assert(VT.getScalarSizeInBits() <= 32 &&
12125 "Exceeds 32-bit integer zero extension limit");
12126 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12127
12128 // Define a helper function to check a particular ext-scale and lower to it if
12129 // valid.
12130 auto Lower = [&](int Scale) -> SDValue {
12131 SDValue InputV;
12132 bool AnyExt = true;
12133 int Offset = 0;
12134 int Matches = 0;
12135 for (int i = 0; i < NumElements; ++i) {
12136 int M = Mask[i];
12137 if (M < 0)
12138 continue; // Valid anywhere but doesn't tell us anything.
12139 if (i % Scale != 0) {
12140 // Each of the extended elements needs to be zeroable.
12141 if (!Zeroable[i])
12142 return SDValue();
12143
12144 // We no longer are in the anyext case.
12145 AnyExt = false;
12146 continue;
12147 }
12148
12149 // The base elements need to be consecutive indices into the
12150 // same input vector.
12151 SDValue V = M < NumElements ? V1 : V2;
12152 M = M % NumElements;
12153 if (!InputV) {
12154 InputV = V;
12155 Offset = M - (i / Scale);
12156 } else if (InputV != V)
12157 return SDValue(); // Flip-flopping inputs.
12158
12159 // Offset must start in the lowest 128-bit lane or at the start of an
12160 // upper lane.
12161 // FIXME: Is it ever worth allowing a negative base offset?
12162 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12163 (Offset % NumEltsPerLane) == 0))
12164 return SDValue();
12165
12166 // If we are offsetting, all referenced entries must come from the same
12167 // lane.
12168 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12169 return SDValue();
12170
12171 if ((M % NumElements) != (Offset + (i / Scale)))
12172 return SDValue(); // Non-consecutive strided elements.
12173 Matches++;
12174 }
12175
12176 // If we fail to find an input, we have a zero-shuffle which should always
12177 // have already been handled.
12178 // FIXME: Maybe handle this here in case during blending we end up with one?
12179 if (!InputV)
12180 return SDValue();
12181
12182 // If we are offsetting, don't extend if we only match a single input, we
12183 // can always do better by using a basic PSHUF or PUNPCK.
12184 if (Offset != 0 && Matches < 2)
12185 return SDValue();
12186
12187 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
12188 InputV, Mask, Subtarget, DAG);
12189 };
12190
12191 // The widest scale possible for extending is to a 64-bit integer.
12192 assert(Bits % 64 == 0 &&
12193 "The number of bits in a vector must be divisible by 64 on x86!");
12194 int NumExtElements = Bits / 64;
12195
12196 // Each iteration, try extending the elements half as much, but into twice as
12197 // many elements.
12198 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12199 assert(NumElements % NumExtElements == 0 &&
12200 "The input vector size must be divisible by the extended size.");
12201 if (SDValue V = Lower(NumElements / NumExtElements))
12202 return V;
12203 }
12204
12205 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12206 if (Bits != 128)
12207 return SDValue();
12208
12209 // Returns one of the source operands if the shuffle can be reduced to a
12210 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12211 auto CanZExtLowHalf = [&]() {
12212 for (int i = NumElements / 2; i != NumElements; ++i)
12213 if (!Zeroable[i])
12214 return SDValue();
12215 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12216 return V1;
12217 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12218 return V2;
12219 return SDValue();
12220 };
12221
12222 if (SDValue V = CanZExtLowHalf()) {
12223 V = DAG.getBitcast(MVT::v2i64, V);
12224 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12225 return DAG.getBitcast(VT, V);
12226 }
12227
12228 // No viable ext lowering found.
12229 return SDValue();
12230}
12231
12232/// Try to get a scalar value for a specific element of a vector.
12233///
12234/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12235static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12236 SelectionDAG &DAG) {
12237 MVT VT = V.getSimpleValueType();
12238 MVT EltVT = VT.getVectorElementType();
12239 V = peekThroughBitcasts(V);
12240
12241 // If the bitcasts shift the element size, we can't extract an equivalent
12242 // element from it.
12243 MVT NewVT = V.getSimpleValueType();
12244 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12245 return SDValue();
12246
12247 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12248 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12249 // Ensure the scalar operand is the same size as the destination.
12250 // FIXME: Add support for scalar truncation where possible.
12251 SDValue S = V.getOperand(Idx);
12252 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12253 return DAG.getBitcast(EltVT, S);
12254 }
12255
12256 return SDValue();
12257}
12258
12259/// Helper to test for a load that can be folded with x86 shuffles.
12260///
12261/// This is particularly important because the set of instructions varies
12262/// significantly based on whether the operand is a load or not.
12263static bool isShuffleFoldableLoad(SDValue V) {
12264 V = peekThroughBitcasts(V);
12265 return ISD::isNON_EXTLoad(V.getNode());
12266}
12267
12268/// Try to lower insertion of a single element into a zero vector.
12269///
12270/// This is a common pattern for which we have especially efficient lowerings
12271/// across all subtarget feature sets.
12272static SDValue lowerShuffleAsElementInsertion(
12273 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12274 const APInt &Zeroable, const X86Subtarget &Subtarget,
12275 SelectionDAG &DAG) {
12276 MVT ExtVT = VT;
12277 MVT EltVT = VT.getVectorElementType();
12278
12279 int V2Index =
12280 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12281 Mask.begin();
12282 bool IsV1Zeroable = true;
12283 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12284 if (i != V2Index && !Zeroable[i]) {
12285 IsV1Zeroable = false;
12286 break;
12287 }
12288
12289 // Check for a single input from a SCALAR_TO_VECTOR node.
12290 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12291 // all the smarts here sunk into that routine. However, the current
12292 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12293 // vector shuffle lowering is dead.
12294 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12295 DAG);
12296 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12297 // We need to zext the scalar if it is smaller than an i32.
12298 V2S = DAG.getBitcast(EltVT, V2S);
12299 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
12300 // Using zext to expand a narrow element won't work for non-zero
12301 // insertions.
12302 if (!IsV1Zeroable)
12303 return SDValue();
12304
12305 // Zero-extend directly to i32.
12306 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12307 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12308 }
12309 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12310 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12311 EltVT == MVT::i16) {
12312 // Either not inserting from the low element of the input or the input
12313 // element size is too small to use VZEXT_MOVL to clear the high bits.
12314 return SDValue();
12315 }
12316
12317 if (!IsV1Zeroable) {
12318 // If V1 can't be treated as a zero vector we have fewer options to lower
12319 // this. We can't support integer vectors or non-zero targets cheaply, and
12320 // the V1 elements can't be permuted in any way.
12321 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12322 if (!VT.isFloatingPoint() || V2Index != 0)
12323 return SDValue();
12324 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
12325 V1Mask[V2Index] = -1;
12326 if (!isNoopShuffleMask(V1Mask))
12327 return SDValue();
12328 if (!VT.is128BitVector())
12329 return SDValue();
12330
12331 // Otherwise, use MOVSD or MOVSS.
12332 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
12333 "Only two types of floating point element types to handle!");
12334 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
12335 ExtVT, V1, V2);
12336 }
12337
12338 // This lowering only works for the low element with floating point vectors.
12339 if (VT.isFloatingPoint() && V2Index != 0)
12340 return SDValue();
12341
12342 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12343 if (ExtVT != VT)
12344 V2 = DAG.getBitcast(VT, V2);
12345
12346 if (V2Index != 0) {
12347 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12348 // the desired position. Otherwise it is more efficient to do a vector
12349 // shift left. We know that we can do a vector shift left because all
12350 // the inputs are zero.
12351 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
12352 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12353 V2Shuffle[V2Index] = 0;
12354 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12355 } else {
12356 V2 = DAG.getBitcast(MVT::v16i8, V2);
12357 V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12358 DAG.getTargetConstant(
12359 V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
12360 V2 = DAG.getBitcast(VT, V2);
12361 }
12362 }
12363 return V2;
12364}
12365
12366/// Try to lower broadcast of a single - truncated - integer element,
12367/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12368///
12369/// This assumes we have AVX2.
12370static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12371 int BroadcastIdx,
12372 const X86Subtarget &Subtarget,
12373 SelectionDAG &DAG) {
12374 assert(Subtarget.hasAVX2() &&
12375 "We can only lower integer broadcasts with AVX2!");
12376
12377 EVT EltVT = VT.getVectorElementType();
12378 EVT V0VT = V0.getValueType();
12379
12380 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12381 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12382
12383 EVT V0EltVT = V0VT.getVectorElementType();
12384 if (!V0EltVT.isInteger())
12385 return SDValue();
12386
12387 const unsigned EltSize = EltVT.getSizeInBits();
12388 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12389
12390 // This is only a truncation if the original element type is larger.
12391 if (V0EltSize <= EltSize)
12392 return SDValue();
12393
12394 assert(((V0EltSize % EltSize) == 0) &&
12395 "Scalar type sizes must all be powers of 2 on x86!");
12396
12397 const unsigned V0Opc = V0.getOpcode();
12398 const unsigned Scale = V0EltSize / EltSize;
12399 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12400
12401 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12402 V0Opc != ISD::BUILD_VECTOR)
12403 return SDValue();
12404
12405 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12406
12407 // If we're extracting non-least-significant bits, shift so we can truncate.
12408 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12409 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12410 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12411 if (const int OffsetIdx = BroadcastIdx % Scale)
12412 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12413 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12414
12415 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12416 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12417}
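// Illustrative sketch (hedged, standalone example; not code from this file):
// the scalar arithmetic behind the SRL + TRUNCATE nodes built above, assuming
// a 64-bit source element broadcast as 16-bit lanes. BroadcastIdx picks which
// 16-bit slice of the wide element is wanted; a right shift by the slice
// offset followed by a truncation exposes it, and OffsetIdx * EltSize is
// always < 64 so the shift stays well defined.
#include <cstdint>

static uint16_t truncBroadcastSlice(uint64_t WideElt, unsigned BroadcastIdx) {
  const unsigned EltSize = 16;          // bits per destination lane (assumed)
  const unsigned Scale = 64 / EltSize;  // destination lanes per source element
  const unsigned OffsetIdx = BroadcastIdx % Scale;
  return static_cast<uint16_t>(WideElt >> (OffsetIdx * EltSize));
}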
12418
12419/// Test whether this can be lowered with a single SHUFPS instruction.
12420///
12421/// This is used to disable more specialized lowerings when the shufps lowering
12422/// will happen to be efficient.
12423static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12424 // This routine only handles 128-bit shufps.
12425 assert(Mask.size() == 4 && "Unsupported mask size!");
12426 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12427 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12428 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12429 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12430
12431 // To lower with a single SHUFPS we need to have the low half and high half
12432 // each requiring a single input.
12433 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12434 return false;
12435 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12436 return false;
12437
12438 return true;
12439}
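// Illustrative sketch (assumed standalone helper, not from the LLVM tree): the
// same low-half/high-half test written against a plain array. A mask such as
// {0, 1, 4, 5} passes (each half reads a single input), while {0, 4, 1, 5}
// fails because both halves mix V1 and V2 elements.
#include <array>

static bool halvesEachUseOneInput(const std::array<int, 4> &M) {
  auto SameSide = [](int A, int B) {
    return A < 0 || B < 0 || (A < 4) == (B < 4); // undef (-1) matches anything
  };
  return SameSide(M[0], M[1]) && SameSide(M[2], M[3]);
}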
12440
12441/// If we are extracting two 128-bit halves of a vector and shuffling the
12442/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12443/// multi-shuffle lowering.
12444static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12445 SDValue N1, ArrayRef<int> Mask,
12446 SelectionDAG &DAG) {
12447 EVT VT = N0.getValueType();
12448 assert((VT.is128BitVector() &&
12449         (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12450        "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12451
12452 // Check that both sources are extracts of the same source vector.
12453 if (!N0.hasOneUse() || !N1.hasOneUse() ||
12454 N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12455 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12456 N0.getOperand(0) != N1.getOperand(0))
12457 return SDValue();
12458
12459 SDValue WideVec = N0.getOperand(0);
12460 EVT WideVT = WideVec.getValueType();
12461 if (!WideVT.is256BitVector() || !isa<ConstantSDNode>(N0.getOperand(1)) ||
12462 !isa<ConstantSDNode>(N1.getOperand(1)))
12463 return SDValue();
12464
12465 // Match extracts of each half of the wide source vector. Commute the shuffle
12466 // if the extract of the low half is N1.
12467 unsigned NumElts = VT.getVectorNumElements();
12468 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
12469 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12470 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12471 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12472 ShuffleVectorSDNode::commuteMask(NewMask);
12473 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12474 return SDValue();
12475
12476 // Final bailout: if the mask is simple, we are better off using an extract
12477 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12478 // because that avoids a constant load from memory.
12479 if (NumElts == 4 &&
12480 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
12481 return SDValue();
12482
12483 // Extend the shuffle mask with undef elements.
12484 NewMask.append(NumElts, -1);
12485
12486 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12487 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
12488 NewMask);
12489 // This is free: ymm -> xmm.
12490 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
12491 DAG.getIntPtrConstant(0, DL));
12492}
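// Illustrative sketch (assumed example, not from this file): the mask rewrite
// performed above. A 4-lane mask over (extract X, 0) and (extract X, 4)
// already addresses lanes 0..7 of the 256-bit source X, so the wide shuffle
// can reuse the mask as-is and pad the upper lanes with undef (-1), e.g.
// {0, 7, 1, 6} becomes {0, 7, 1, 6, -1, -1, -1, -1}.
#include <vector>

static std::vector<int> widenExtractShuffleMask(std::vector<int> Mask,
                                                unsigned NumElts) {
  Mask.insert(Mask.end(), NumElts, -1); // upper lanes of the ymm shuffle
  return Mask;
}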
12493
12494/// Try to lower broadcast of a single element.
12495///
12496/// For convenience, this code also bundles all of the subtarget feature set
12497/// filtering. While a little annoying to re-dispatch on type here, there isn't
12498/// a convenient way to factor it out.
12499static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
12500 SDValue V2, ArrayRef<int> Mask,
12501 const X86Subtarget &Subtarget,
12502 SelectionDAG &DAG) {
12503 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
12504 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
12505 (Subtarget.hasAVX2() && VT.isInteger())))
12506 return SDValue();
12507
12508 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
12509 // we can only broadcast from a register with AVX2.
12510 unsigned NumElts = Mask.size();
12511 unsigned NumEltBits = VT.getScalarSizeInBits();
12512 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
12513 ? X86ISD::MOVDDUP
12514 : X86ISD::VBROADCAST;
12515 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
12516
12517 // Check that the mask is a broadcast.
12518 int BroadcastIdx = -1;
12519 for (int i = 0; i != (int)NumElts; ++i) {
12520 SmallVector<int, 8> BroadcastMask(NumElts, i);
12521 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
12522 BroadcastIdx = i;
12523 break;
12524 }
12525 }
12526
12527 if (BroadcastIdx < 0)
12528 return SDValue();
12529 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
12530                                           "a sorted mask where the broadcast "
12531                                           "comes from V1.");
12532
12533 // Go up the chain of (vector) values to find a scalar load that we can
12534 // combine with the broadcast.
12535 int BitOffset = BroadcastIdx * NumEltBits;
12536 SDValue V = V1;
12537 for (;;) {
12538 switch (V.getOpcode()) {
12539 case ISD::BITCAST: {
12540 V = V.getOperand(0);
12541 continue;
12542 }
12543 case ISD::CONCAT_VECTORS: {
12544 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
12545 int OpIdx = BitOffset / OpBitWidth;
12546 V = V.getOperand(OpIdx);
12547 BitOffset %= OpBitWidth;
12548 continue;
12549 }
12550 case ISD::INSERT_SUBVECTOR: {
12551 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
12552 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
12553 if (!ConstantIdx)
12554 break;
12555
12556 int EltBitWidth = VOuter.getScalarValueSizeInBits();
12557 int Idx = (int)ConstantIdx->getZExtValue();
12558 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
12559 int BeginOffset = Idx * EltBitWidth;
12560 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
12561 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
12562 BitOffset -= BeginOffset;
12563 V = VInner;
12564 } else {
12565 V = VOuter;
12566 }
12567 continue;
12568 }
12569 }
12570 break;
12571 }
12572 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12573 BroadcastIdx = BitOffset / NumEltBits;
12574
12575 // Do we need to bitcast the source to retrieve the original broadcast index?
12576 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
12577
12578 // Check if this is a broadcast of a scalar. We special case lowering
12579 // for scalars so that we can more effectively fold with loads.
12580 // If the original value has a larger element type than the shuffle, the
12581 // broadcast element is in essence truncated. Make that explicit to ease
12582 // folding.
12583 if (BitCastSrc && VT.isInteger())
12584 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
12585 DL, VT, V, BroadcastIdx, Subtarget, DAG))
12586 return TruncBroadcast;
12587
12588 MVT BroadcastVT = VT;
12589
12590 // Also check the simpler case, where we can directly reuse the scalar.
12591 if (!BitCastSrc &&
12592 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
12593 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
12594 V = V.getOperand(BroadcastIdx);
12595
12596 // If we can't broadcast from a register, check that the input is a load.
12597 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
12598 return SDValue();
12599 } else if (MayFoldLoad(V) && cast<LoadSDNode>(V)->isSimple()) {
12600 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
12601 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
12602 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
12603 Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
12604 ? X86ISD::MOVDDUP
12605 : Opcode;
12606 }
12607
12608 // If we are broadcasting a load that is only used by the shuffle
12609 // then we can reduce the vector load to the broadcasted scalar load.
12610 LoadSDNode *Ld = cast<LoadSDNode>(V);
12611 SDValue BaseAddr = Ld->getOperand(1);
12612 EVT SVT = BroadcastVT.getScalarType();
12613 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
12614 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
12615 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
12616 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
12617 DAG.getMachineFunction().getMachineMemOperand(
12618 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12619 DAG.makeEquivalentMemoryOrdering(Ld, V);
12620 } else if (!BroadcastFromReg) {
12621 // We can't broadcast from a vector register.
12622 return SDValue();
12623 } else if (BitOffset != 0) {
12624 // We can only broadcast from the zero-element of a vector register,
12625 // but it can be advantageous to broadcast from the zero-element of a
12626 // subvector.
12627 if (!VT.is256BitVector() && !VT.is512BitVector())
12628 return SDValue();
12629
12630 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
12631 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12632 return SDValue();
12633
12634 // Only broadcast the zero-element of a 128-bit subvector.
12635 if ((BitOffset % 128) != 0)
12636 return SDValue();
12637
12638 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12639        "Unexpected bit-offset");
12640 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
12641        "Unexpected vector size");
12642 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
12643 V = extract128BitVector(V, ExtractIdx, DAG, DL);
12644 }
12645
12646 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
12647 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
12648 DAG.getBitcast(MVT::f64, V));
12649
12650 // Bitcast back to the same scalar type as BroadcastVT.
12651 if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) {
12652 assert(NumEltBits == BroadcastVT.getScalarSizeInBits() &&
12653        "Unexpected vector element size");
12654 MVT ExtVT;
12655 if (V.getValueType().isVector()) {
12656 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
12657 ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
12658 } else {
12659 ExtVT = BroadcastVT.getScalarType();
12660 }
12661 V = DAG.getBitcast(ExtVT, V);
12662 }
12663
12664 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
12665 if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) {
12666 V = DAG.getBitcast(MVT::f64, V);
12667 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
12668 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
12669 }
12670
12671 // We only support broadcasting from 128-bit vectors to minimize the
12672 // number of patterns we need to deal with in isel. So extract down to
12673 // 128-bits, removing as many bitcasts as possible.
12674 if (V.getValueSizeInBits() > 128) {
12675 MVT ExtVT = V.getSimpleValueType().getScalarType();
12676 ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits());
12677 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
12678 V = DAG.getBitcast(ExtVT, V);
12679 }
12680
12681 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
12682}
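// Illustrative sketch (simplified assumption; the code above goes through
// isShuffleEquivalent and also accepts equivalent V1/V2 sources): a broadcast
// mask is one whose defined entries all name the same element, with undef
// (-1) lanes compatible with any splat.
#include <vector>

static int findBroadcastIdx(const std::vector<int> &Mask) {
  int Idx = -1;
  for (int M : Mask) {
    if (M < 0)
      continue;      // undef lane, compatible with any splat
    if (Idx < 0)
      Idx = M;       // first defined lane fixes the broadcast element
    else if (M != Idx)
      return -1;     // two different sources: not a broadcast
  }
  return Idx;
}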
12683
12684// Check for whether we can use INSERTPS to perform the shuffle. We only use
12685// INSERTPS when the V1 elements are already in the correct locations
12686// because otherwise we can just always use two SHUFPS instructions which
12687// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
12688// perform INSERTPS if a single V1 element is out of place and all V2
12689// elements are zeroable.
12690static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
12691 unsigned &InsertPSMask,
12692 const APInt &Zeroable,
12693 ArrayRef<int> Mask, SelectionDAG &DAG) {
12694 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
12695 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
12696 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12697
12698 // Attempt to match INSERTPS with one element from VA or VB being
12699 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
12700 // are updated.
12701 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
12702 ArrayRef<int> CandidateMask) {
12703 unsigned ZMask = 0;
12704 int VADstIndex = -1;
12705 int VBDstIndex = -1;
12706 bool VAUsedInPlace = false;
12707
12708 for (int i = 0; i < 4; ++i) {
12709 // Synthesize a zero mask from the zeroable elements (includes undefs).
12710 if (Zeroable[i]) {
12711 ZMask |= 1 << i;
12712 continue;
12713 }
12714
12715 // Flag if we use any VA inputs in place.
12716 if (i == CandidateMask[i]) {
12717 VAUsedInPlace = true;
12718 continue;
12719 }
12720
12721 // We can only insert a single non-zeroable element.
12722 if (VADstIndex >= 0 || VBDstIndex >= 0)
12723 return false;
12724
12725 if (CandidateMask[i] < 4) {
12726 // VA input out of place for insertion.
12727 VADstIndex = i;
12728 } else {
12729 // VB input for insertion.
12730 VBDstIndex = i;
12731 }
12732 }
12733
12734 // Don't bother if we have no (non-zeroable) element for insertion.
12735 if (VADstIndex < 0 && VBDstIndex < 0)
12736 return false;
12737
12738 // Determine element insertion src/dst indices. The src index is from the
12739 // start of the inserted vector, not the start of the concatenated vector.
12740 unsigned VBSrcIndex = 0;
12741 if (VADstIndex >= 0) {
12742 // If we have a VA input out of place, we use VA as the V2 element
12743 // insertion and don't use the original V2 at all.
12744 VBSrcIndex = CandidateMask[VADstIndex];
12745 VBDstIndex = VADstIndex;
12746 VB = VA;
12747 } else {
12748 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
12749 }
12750
12751 // If no V1 inputs are used in place, then the result is created only from
12752 // the zero mask and the V2 insertion - so remove V1 dependency.
12753 if (!VAUsedInPlace)
12754 VA = DAG.getUNDEF(MVT::v4f32);
12755
12756 // Update V1, V2 and InsertPSMask accordingly.
12757 V1 = VA;
12758 V2 = VB;
12759
12760 // Insert the V2 element into the desired position.
12761 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
12762 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
12763 return true;
12764 };
12765
12766 if (matchAsInsertPS(V1, V2, Mask))
12767 return true;
12768
12769 // Commute and try again.
12770 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
12771 ShuffleVectorSDNode::commuteMask(CommutedMask);
12772 if (matchAsInsertPS(V2, V1, CommutedMask))
12773 return true;
12774
12775 return false;
12776}
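// Illustrative sketch (encoding recap with an assumed helper name): the
// INSERTPS immediate assembled above packs the source lane of the inserted
// operand into bits 7:6, the destination lane into bits 5:4, and the
// zeroed-lane mask into bits 3:0.
#include <cstdint>

static uint8_t encodeInsertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                 unsigned ZMask) {
  return static_cast<uint8_t>(((SrcIdx & 3) << 6) | ((DstIdx & 3) << 4) |
                              (ZMask & 0xF));
}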
12777
12778static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
12779 ArrayRef<int> Mask, const APInt &Zeroable,
12780 SelectionDAG &DAG) {
12781 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12782 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12783
12784 // Attempt to match the insertps pattern.
12785 unsigned InsertPSMask;
12786 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
12787 return SDValue();
12788
12789 // Insert the V2 element into the desired position.
12790 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
12791 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
12792}
12793
12794/// Try to lower a shuffle as a permute of the inputs followed by an
12795/// UNPCK instruction.
12796///
12797/// This specifically targets cases where we end up with alternating between
12798/// the two inputs, and so can permute them into something that feeds a single
12799/// UNPCK instruction. Note that this routine only targets integer vectors
12800/// because for floating point vectors we have a generalized SHUFPS lowering
12801/// strategy that handles everything that doesn't *exactly* match an unpack,
12802/// making this clever lowering unnecessary.
12803static SDValue lowerShuffleAsPermuteAndUnpack(
12804 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12805 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12806 assert(!VT.isFloatingPoint() &&
12807        "This routine only supports integer vectors.");
12808 assert(VT.is128BitVector() &&
12809        "This routine only works on 128-bit vectors.");
12810 assert(!V2.isUndef() &&
12811        "This routine should only be used when blending two inputs.");
12812 assert(Mask.size() >= 2 && "Single element masks are invalid.");
12813
12814 int Size = Mask.size();
12815
12816 int NumLoInputs =
12817 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
12818 int NumHiInputs =
12819 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
12820
12821 bool UnpackLo = NumLoInputs >= NumHiInputs;
12822
12823 auto TryUnpack = [&](int ScalarSize, int Scale) {
12824 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
12825 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
12826
12827 for (int i = 0; i < Size; ++i) {
12828 if (Mask[i] < 0)
12829 continue;
12830
12831 // Each element of the unpack contains Scale elements from this mask.
12832 int UnpackIdx = i / Scale;
12833
12834 // We only handle the case where V1 feeds the first slots of the unpack.
12835 // We rely on canonicalization to ensure this is the case.
12836 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
12837 return SDValue();
12838
12839 // Setup the mask for this input. The indexing is tricky as we have to
12840 // handle the unpack stride.
12841 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
12842 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
12843 Mask[i] % Size;
12844 }
12845
12846 // If we will have to shuffle both inputs to use the unpack, check whether
12847 // we can just unpack first and shuffle the result. If so, skip this unpack.
12848 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
12849 !isNoopShuffleMask(V2Mask))
12850 return SDValue();
12851
12852 // Shuffle the inputs into place.
12853 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
12854 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
12855
12856 // Cast the inputs to the type we will use to unpack them.
12857 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
12858 V1 = DAG.getBitcast(UnpackVT, V1);
12859 V2 = DAG.getBitcast(UnpackVT, V2);
12860
12861 // Unpack the inputs and cast the result back to the desired type.
12862 return DAG.getBitcast(
12863 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
12864 UnpackVT, V1, V2));
12865 };
12866
12867 // We try each unpack from the largest to the smallest to try and find one
12868 // that fits this mask.
12869 int OrigScalarSize = VT.getScalarSizeInBits();
12870 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
12871 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
12872 return Unpack;
12873
12874 // If we're shuffling with a zero vector then we're better off not doing
12875 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
12876 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
12877 ISD::isBuildVectorAllZeros(V2.getNode()))
12878 return SDValue();
12879
12880 // If none of the unpack-rooted lowerings worked (or were profitable) try an
12881 // initial unpack.
12882 if (NumLoInputs == 0 || NumHiInputs == 0) {
12883 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
12884        "We have to have *some* inputs!");
12885 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
12886
12887 // FIXME: We could consider the total complexity of the permute of each
12888 // possible unpacking. Or at the least we should consider how many
12889 // half-crossings are created.
12890 // FIXME: We could consider commuting the unpacks.
12891
12892 SmallVector<int, 32> PermMask((unsigned)Size, -1);
12893 for (int i = 0; i < Size; ++i) {
12894 if (Mask[i] < 0)
12895 continue;
12896
12897 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
12898
12899 PermMask[i] =
12900 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
12901 }
12902 return DAG.getVectorShuffle(
12903 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
12904 DL, VT, V1, V2),
12905 DAG.getUNDEF(VT), PermMask);
12906 }
12907
12908 return SDValue();
12909}
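// Illustrative sketch (assumed helper, not from this file): the "ideal" shape
// this lowering works towards. Slot i of a 4-lane UNPCKL takes V1[i/2] for
// even i and V2[i/2] for odd i, so the mask {0, 4, 1, 5} needs no pre-permute,
// while a mask like {1, 4, 0, 5} first permutes V1 so its elements land in the
// even unpack slots.
#include <array>

static bool isUnpackLoMaskV4(const std::array<int, 4> &Mask) {
  for (int i = 0; i < 4; ++i)
    if (Mask[i] >= 0 && Mask[i] != (i % 2 == 0 ? i / 2 : 4 + i / 2))
      return false; // undef (-1) lanes are left unconstrained
  return true;
}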
12910
12911/// Handle lowering of 2-lane 64-bit floating point shuffles.
12912///
12913/// This is the basis function for the 2-lane 64-bit shuffles as we have full
12914/// support for floating point shuffles but not integer shuffles. These
12915/// instructions will incur a domain crossing penalty on some chips though so
12916/// it is better to avoid lowering through this for integer vectors where
12917/// possible.
12918static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12919 const APInt &Zeroable, SDValue V1, SDValue V2,
12920 const X86Subtarget &Subtarget,
12921 SelectionDAG &DAG) {
12922 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12923 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12924 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12925
12926 if (V2.isUndef()) {
12927 // Check for being able to broadcast a single element.
12928 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
12929 Mask, Subtarget, DAG))
12930 return Broadcast;
12931
12932 // Straight shuffle of a single input vector. Simulate this by using the
12933 // single input as both of the "inputs" to this instruction.
12934 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
12935
12936 if (Subtarget.hasAVX()) {
12937 // If we have AVX, we can use VPERMILPS which will allow folding a load
12938 // into the shuffle.
12939 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
12940 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12941 }
12942
12943 return DAG.getNode(
12944 X86ISD::SHUFP, DL, MVT::v2f64,
12945 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12946 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12947 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12948 }
12949 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12950 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12951 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12952 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12953
12954 if (Subtarget.hasAVX2())
12955 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12956 return Extract;
12957
12958 // When loading a scalar and then shuffling it into a vector we can often do
12959 // the insertion cheaply.
12960 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12961 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12962 return Insertion;
12963 // Try inverting the insertion since for v2 masks it is easy to do and we
12964 // can't reliably sort the mask one way or the other.
12965 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
12966 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
12967 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12968 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12969 return Insertion;
12970
12971 // Try to use one of the special instruction patterns to handle two common
12972 // blend patterns if a zero-blend above didn't work.
12973 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
12974 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
12975 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
12976 // We can either use a special instruction to load over the low double or
12977 // to move just the low double.
12978 return DAG.getNode(
12979 X86ISD::MOVSD, DL, MVT::v2f64, V2,
12980 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
12981
12982 if (Subtarget.hasSSE41())
12983 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
12984 Zeroable, Subtarget, DAG))
12985 return Blend;
12986
12987 // Use dedicated unpack instructions for masks that match their pattern.
12988 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
12989 return V;
12990
12991 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
12992 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
12993 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12994}
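// Illustrative sketch (encoding recap with an assumed helper): the 2-bit
// SHUFPD immediate built in both paths above. Bit 0 selects which element of
// the first source fills lane 0 and bit 1 selects which element of the second
// source fills lane 1; the two-input path rebases Mask[1] by subtracting 2 so
// it indexes into V2 before the comparison.
#include <cstdint>

static uint8_t encodeShufPDImm(int Lane0SrcElt, int Lane1SrcElt) {
  return static_cast<uint8_t>((Lane0SrcElt == 1) | ((Lane1SrcElt == 1) << 1));
}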
12995
12996/// Handle lowering of 2-lane 64-bit integer shuffles.
12997///
12998/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
12999/// the integer unit to minimize domain crossing penalties. However, for blends
13000/// it falls back to the floating point shuffle operation with appropriate bit
13001/// casting.
13002static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13003 const APInt &Zeroable, SDValue V1, SDValue V2,
13004 const X86Subtarget &Subtarget,
13005 SelectionDAG &DAG) {
13006 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13007 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13008 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13009
13010 if (V2.isUndef()) {
13011 // Check for being able to broadcast a single element.
13012 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13013 Mask, Subtarget, DAG))
13014 return Broadcast;
13015
13016 // Straight shuffle of a single input vector. For everything from SSE2
13017 // onward this has a single fast instruction with no scary immediates.
13018 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13019 V1 = DAG.getBitcast(MVT::v4i32, V1);
13020 int WidenedMask[4] = {
13021 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
13022 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
13023 return DAG.getBitcast(
13024 MVT::v2i64,
13025 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13026 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13027 }
13028 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13029 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13030 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13031 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13032
13033 if (Subtarget.hasAVX2())
13034 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13035 return Extract;
13036
13037 // Try to use shift instructions.
13038 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
13039 Zeroable, Subtarget, DAG))
13040 return Shift;
13041
13042 // When loading a scalar and then shuffling it into a vector we can often do
13043 // the insertion cheaply.
13044 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13045 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13046 return Insertion;
13047 // Try inverting the insertion since for v2 masks it is easy to do and we
13048 // can't reliably sort the mask one way or the other.
13049 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13050 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13051 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13052 return Insertion;
13053
13054 // We have different paths for blend lowering, but they all must use the
13055 // *exact* same predicate.
13056 bool IsBlendSupported = Subtarget.hasSSE41();
13057 if (IsBlendSupported)
13058 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13059 Zeroable, Subtarget, DAG))
13060 return Blend;
13061
13062 // Use dedicated unpack instructions for masks that match their pattern.
13063 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
13064 return V;
13065
13066 // Try to use byte rotation instructions.
13067 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13068 if (Subtarget.hasSSSE3()) {
13069 if (Subtarget.hasVLX())
13070 if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask,
13071 Subtarget, DAG))
13072 return Rotate;
13073
13074 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13075 Subtarget, DAG))
13076 return Rotate;
13077 }
13078
13079 // If we have direct support for blends, we should lower by decomposing into
13080 // a permute. That will be faster than the domain cross.
13081 if (IsBlendSupported)
13082 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask,
13083 Subtarget, DAG);
13084
13085 // We implement this with SHUFPD which is pretty lame because it will likely
13086 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13087 // However, all the alternatives are still more cycles and newer chips don't
13088 // have this problem. It would be really nice if x86 had better shuffles here.
13089 V1 = DAG.getBitcast(MVT::v2f64, V1);
13090 V2 = DAG.getBitcast(MVT::v2f64, V2);
13091 return DAG.getBitcast(MVT::v2i64,
13092 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13093}
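// Illustrative sketch (assumed helper): the mask widening used in the
// single-input path above. A v2i64 shuffle index i becomes the v4i32 pair
// {2*i, 2*i + 1}, so {1, 0} widens to the PSHUFD mask {2, 3, 0, 1}; undef
// entries are clamped to 0 first, matching the std::max(Mask[k], 0) calls.
#include <algorithm>
#include <array>

static std::array<int, 4> widenV2ToV4Mask(const std::array<int, 2> &Mask) {
  int Lo = std::max(Mask[0], 0), Hi = std::max(Mask[1], 0);
  return {Lo * 2, Lo * 2 + 1, Hi * 2, Hi * 2 + 1};
}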
13094
13095/// Lower a vector shuffle using the SHUFPS instruction.
13096///
13097/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13098/// It makes no assumptions about whether this is the *best* lowering; it simply
13099/// uses it.
13100static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13101 ArrayRef<int> Mask, SDValue V1,
13102 SDValue V2, SelectionDAG &DAG) {
13103 SDValue LowV = V1, HighV = V2;
13104 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
13105
13106 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13107
13108 if (NumV2Elements == 1) {
13109 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13110
13111 // Compute the index adjacent to V2Index and in the same half by toggling
13112 // the low bit.
13113 int V2AdjIndex = V2Index ^ 1;
13114
13115 if (Mask[V2AdjIndex] < 0) {
13116 // Handles all the cases where we have a single V2 element and an undef.
13117 // This will only ever happen in the high lanes because we commute the
13118 // vector otherwise.
13119 if (V2Index < 2)
13120 std::swap(LowV, HighV);
13121 NewMask[V2Index] -= 4;
13122 } else {
13123 // Handle the case where the V2 element ends up adjacent to a V1 element.
13124 // To make this work, blend them together as the first step.
13125 int V1Index = V2AdjIndex;
13126 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13127 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13128 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13129
13130 // Now proceed to reconstruct the final blend as we have the necessary
13131 // high or low half formed.
13132 if (V2Index < 2) {
13133 LowV = V2;
13134 HighV = V1;
13135 } else {
13136 HighV = V2;
13137 }
13138 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13139 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13140 }
13141 } else if (NumV2Elements == 2) {
13142 if (Mask[0] < 4 && Mask[1] < 4) {
13143 // Handle the easy case where we have V1 in the low lanes and V2 in the
13144 // high lanes.
13145 NewMask[2] -= 4;
13146 NewMask[3] -= 4;
13147 } else if (Mask[2] < 4 && Mask[3] < 4) {
13148 // We also handle the reversed case because this utility may get called
13149 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13150 // arrange things in the right direction.
13151 NewMask[0] -= 4;
13152 NewMask[1] -= 4;
13153 HighV = V1;
13154 LowV = V2;
13155 } else {
13156 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13157 // trying to place elements directly, just blend them and set up the final
13158 // shuffle to place them.
13159
13160 // The first two blend mask elements are for V1, the second two are for
13161 // V2.
13162 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13163 Mask[2] < 4 ? Mask[2] : Mask[3],
13164 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13165 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13166 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13167 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13168
13169 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13170 // a blend.
13171 LowV = HighV = V1;
13172 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13173 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13174 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13175 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13176 }
13177 }
13178 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13179 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13180}
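// Illustrative sketch (assumed reimplementation of what
// getV4X86ShuffleImm8ForMask produces): each of the four lanes contributes two
// bits to the immediate. For SHUFPS, lanes 0-1 of the result select from the
// first operand and lanes 2-3 from the second, which is why the code above
// funnels everything into a NewMask whose entries are all in the 0..3 range.
#include <array>
#include <cstdint>

static uint8_t encodeV4ShuffleImm(const std::array<int, 4> &Mask) {
  uint8_t Imm = 0;
  for (int i = 0; i < 4; ++i)
    Imm |= static_cast<uint8_t>((Mask[i] < 0 ? 0 : (Mask[i] & 3)) << (2 * i));
  return Imm;
}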
13181
13182/// Lower 4-lane 32-bit floating point shuffles.
13183///
13184/// Uses instructions exclusively from the floating point unit to minimize
13185/// domain crossing penalties, as these are sufficient to implement all v4f32
13186/// shuffles.
13187static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13188 const APInt &Zeroable, SDValue V1, SDValue V2,
13189 const X86Subtarget &Subtarget,
13190 SelectionDAG &DAG) {
13191 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13192 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13193 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13194
13195 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13196
13197 if (NumV2Elements == 0) {
13198 // Check for being able to broadcast a single element.
13199 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13200 Mask, Subtarget, DAG))
13201 return Broadcast;
13202
13203 // Use even/odd duplicate instructions for masks that match their pattern.
13204 if (Subtarget.hasSSE3()) {
13205 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
13206 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13207 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
13208 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13209 }
13210
13211 if (Subtarget.hasAVX()) {
13212 // If we have AVX, we can use VPERMILPS which will allow folding a load
13213 // into the shuffle.
13214 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13215 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13216 }
13217
13218 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13219 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13220 if (!Subtarget.hasSSE2()) {
13221 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
13222 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13223 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
13224 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13225 }
13226
13227 // Otherwise, use a straight shuffle of a single input vector. We pass the
13228 // input vector to both operands to simulate this with a SHUFPS.
13229 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13230 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13231 }
13232
13233 if (Subtarget.hasAVX2())
13234 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13235 return Extract;
13236
13237 // There are special ways we can lower some single-element blends. However, we
13238 // have custom ways we can lower more complex single-element blends below that
13239 // we defer to if both this and BLENDPS fail to match, so restrict this to
13240 // when the V2 input is targeting element 0 of the mask -- that is the fast
13241 // case here.
13242 if (NumV2Elements == 1 && Mask[0] >= 4)
13243 if (SDValue V = lowerShuffleAsElementInsertion(
13244 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13245 return V;
13246
13247 if (Subtarget.hasSSE41()) {
13248 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13249 Zeroable, Subtarget, DAG))
13250 return Blend;
13251
13252 // Use INSERTPS if we can complete the shuffle efficiently.
13253 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13254 return V;
13255
13256 if (!isSingleSHUFPSMask(Mask))
13257 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13258 V2, Mask, DAG))
13259 return BlendPerm;
13260 }
13261
13262 // Use low/high mov instructions. These are only valid in SSE1 because
13263 // otherwise they are widened to v2f64 and never get here.
13264 if (!Subtarget.hasSSE2()) {
13265 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
13266 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13267 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
13268 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13269 }
13270
13271 // Use dedicated unpack instructions for masks that match their pattern.
13272 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
13273 return V;
13274
13275 // Otherwise fall back to a SHUFPS lowering strategy.
13276 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13277}
13278
13279/// Lower 4-lane i32 vector shuffles.
13280///
13281/// We try to handle these with integer-domain shuffles where we can, but for
13282/// blends we use the floating point domain blend instructions.
13283static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13284 const APInt &Zeroable, SDValue V1, SDValue V2,
13285 const X86Subtarget &Subtarget,
13286 SelectionDAG &DAG) {
13287 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13288 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13289 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13290
13291 // Whenever we can lower this as a zext, that instruction is strictly faster
13292 // than any alternative. It also allows us to fold memory operands into the
13293 // shuffle in many cases.
13294 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13295 Zeroable, Subtarget, DAG))
13296 return ZExt;
13297
13298 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13299
13300 if (NumV2Elements == 0) {
13301 // Try to use broadcast unless the mask only has one non-undef element.
13302 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13303 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13304 Mask, Subtarget, DAG))
13305 return Broadcast;
13306 }
13307
13308 // Straight shuffle of a single input vector. For everything from SSE2
13309 // onward this has a single fast instruction with no scary immediates.
13310 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13311 // but we aren't actually going to use the UNPCK instruction because doing
13312 // so prevents folding a load into this instruction or making a copy.
13313 const int UnpackLoMask[] = {0, 0, 1, 1};
13314 const int UnpackHiMask[] = {2, 2, 3, 3};
13315 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
13316 Mask = UnpackLoMask;
13317 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
13318 Mask = UnpackHiMask;
13319
13320 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13321 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13322 }
13323
13324 if (Subtarget.hasAVX2())
13325 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13326 return Extract;
13327
13328 // Try to use shift instructions.
13329 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
13330 Zeroable, Subtarget, DAG))
13331 return Shift;
13332
13333 // There are special ways we can lower some single-element blends.
13334 if (NumV2Elements == 1)
13335 if (SDValue V = lowerShuffleAsElementInsertion(
13336 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13337 return V;
13338
13339 // We have different paths for blend lowering, but they all must use the
13340 // *exact* same predicate.
13341 bool IsBlendSupported = Subtarget.hasSSE41();
13342 if (IsBlendSupported)
13343 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13344 Zeroable, Subtarget, DAG))
13345 return Blend;
13346
13347 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13348 Zeroable, Subtarget, DAG))
13349 return Masked;
13350
13351 // Use dedicated unpack instructions for masks that match their pattern.
13352 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
13353 return V;
13354
13355 // Try to use byte rotation instructions.
13356  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13357 if (Subtarget.hasSSSE3()) {
13358 if (Subtarget.hasVLX())
13359 if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask,
13360 Subtarget, DAG))
13361 return Rotate;
13362
13363 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13364 Subtarget, DAG))
13365 return Rotate;
13366 }
13367
13368 // Assume that a single SHUFPS is faster than an alternative sequence of
13369 // multiple instructions (even if the CPU has a domain penalty).
13370 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13371 if (!isSingleSHUFPSMask(Mask)) {
13372 // If we have direct support for blends, we should lower by decomposing into
13373 // a permute. That will be faster than the domain cross.
13374 if (IsBlendSupported)
13375 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask,
13376 Subtarget, DAG);
13377
13378 // Try to lower by permuting the inputs into an unpack instruction.
13379 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13380 Mask, Subtarget, DAG))
13381 return Unpack;
13382 }
13383
13384 // We implement this with SHUFPS because it can blend from two vectors.
13385 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13386 // up the inputs, bypassing domain shift penalties that we would incur if we
13387 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13388 // relevant.
13389 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13390 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13391 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13392 return DAG.getBitcast(MVT::v4i32, ShufPS);
13393}
13394
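The tail of the v4i32 path relies on the fact that a shuffle only moves whole 32-bit lanes, so bitcasting to v4f32, shuffling there, and bitcasting back preserves every lane's bit pattern. A minimal standalone sketch of the generic two-input v4 shuffle semantics used throughout this file (mask entries 0..3 select from V1, 4..7 from V2, -1 is don't-care), in plain C++17 with no LLVM headers; shuffleV4 is a name made up for this sketch:

#include <array>
#include <cstdint>
#include <cstdio>

using V4 = std::array<uint32_t, 4>;

// Generic two-operand, 4-lane shuffle: 0..3 pick from V1, 4..7 from V2.
static V4 shuffleV4(const V4 &V1, const V4 &V2, std::array<int, 4> Mask) {
  V4 Out{};
  for (int i = 0; i < 4; ++i) {
    int M = Mask[i];
    if (M < 0)
      Out[i] = 0; // undef lane; any value would do
    else
      Out[i] = M < 4 ? V1[M] : V2[M - 4];
  }
  return Out;
}

int main() {
  V4 A{10, 11, 12, 13}, B{20, 21, 22, 23};
  V4 R = shuffleV4(A, B, {0, 4, 1, 5}); // an "unpack low"-style mask
  std::printf("%u %u %u %u\n", (unsigned)R[0], (unsigned)R[1], (unsigned)R[2],
              (unsigned)R[3]); // prints: 10 20 11 21
  return 0;
}
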
13395/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13396/// shuffle lowering, and the most complex part.
13397///
13398/// The lowering strategy is to try to form pairs of input lanes which are
13399/// targeted at the same half of the final vector, and then use a dword shuffle
13400/// to place them onto the right half, and finally unpack the paired lanes into
13401/// their final position.
13402///
13403/// The exact breakdown of how to form these dword pairs and align them on the
13404/// correct sides is really tricky. See the comments within the function for
13405/// more of the details.
13406///
13407/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13408/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13409/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13410/// vector, form the analogous 128-bit 8-element Mask.
13411static SDValue lowerV8I16GeneralSingleInputShuffle(
13412 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13413 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13414  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13415 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13416
13417  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13418 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13419 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13420
13421 // Attempt to directly match PSHUFLW or PSHUFHW.
13422 if (isUndefOrInRange(LoMask, 0, 4) &&
13423 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13424 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13425 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13426 }
13427 if (isUndefOrInRange(HiMask, 4, 8) &&
13428 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13429 for (int i = 0; i != 4; ++i)
13430 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13431 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13432 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13433 }
13434
13435 SmallVector<int, 4> LoInputs;
13436 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13437 array_pod_sort(LoInputs.begin(), LoInputs.end());
13438 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
13439 SmallVector<int, 4> HiInputs;
13440 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13441 array_pod_sort(HiInputs.begin(), HiInputs.end());
13442 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
13443 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13444 int NumHToL = LoInputs.size() - NumLToL;
13445 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13446 int NumHToH = HiInputs.size() - NumLToH;
13447 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13448 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13449 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13450 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13451
13452 // If we are shuffling values from one half - check how many different DWORD
13453 // pairs we need to create. If only 1 or 2 then we can perform this as a
13454 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13455 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13456 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13457 V = DAG.getNode(ShufWOp, DL, VT, V,
13458 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13459 V = DAG.getBitcast(PSHUFDVT, V);
13460 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13461 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13462 return DAG.getBitcast(VT, V);
13463 };
13464
13465 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13466 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13467 SmallVector<std::pair<int, int>, 4> DWordPairs;
13468 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13469
13470 // Collect the different DWORD pairs.
13471 for (int DWord = 0; DWord != 4; ++DWord) {
13472 int M0 = Mask[2 * DWord + 0];
13473 int M1 = Mask[2 * DWord + 1];
13474 M0 = (M0 >= 0 ? M0 % 4 : M0);
13475 M1 = (M1 >= 0 ? M1 % 4 : M1);
13476 if (M0 < 0 && M1 < 0)
13477 continue;
13478
13479 bool Match = false;
13480 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13481 auto &DWordPair = DWordPairs[j];
13482 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13483 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13484 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13485 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13486 PSHUFDMask[DWord] = DOffset + j;
13487 Match = true;
13488 break;
13489 }
13490 }
13491 if (!Match) {
13492 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13493 DWordPairs.push_back(std::make_pair(M0, M1));
13494 }
13495 }
13496
13497 if (DWordPairs.size() <= 2) {
13498 DWordPairs.resize(2, std::make_pair(-1, -1));
13499 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13500 DWordPairs[1].first, DWordPairs[1].second};
13501 if ((NumHToL + NumHToH) == 0)
13502 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13503 if ((NumLToL + NumLToH) == 0)
13504 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13505 }
13506 }
13507
13508 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13509 // such inputs we can swap two of the dwords across the half mark and end up
13510 // with <=2 inputs to each half in each half. Once there, we can fall through
13511 // to the generic code below. For example:
13512 //
13513 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13514 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13515 //
13516 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13517 // and an existing 2-into-2 on the other half. In this case we may have to
13518 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13519 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13520 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13521 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13522 // half than the one we target for fixing) will be fixed when we re-enter this
13523// path. We will also combine away any resulting sequence of PSHUFD
13524// instructions into a single instruction. Here is an example of the tricky case:
13525 //
13526 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13527 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13528 //
13529 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13530 //
13531 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13532 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13533 //
13534 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13535 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13536 //
13537 // The result is fine to be handled by the generic logic.
13538 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13539 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13540 int AOffset, int BOffset) {
13541    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
13542           "Must call this with A having 3 or 1 inputs from the A half.");
13543    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
13544           "Must call this with B having 1 or 3 inputs from the B half.");
13545    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
13546           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
13547
13548 bool ThreeAInputs = AToAInputs.size() == 3;
13549
13550 // Compute the index of dword with only one word among the three inputs in
13551 // a half by taking the sum of the half with three inputs and subtracting
13552 // the sum of the actual three inputs. The difference is the remaining
13553 // slot.
13554 int ADWord = 0, BDWord = 0;
13555 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
13556 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
13557 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
13558 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
13559 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
13560 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
13561 int TripleNonInputIdx =
13562 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13563 TripleDWord = TripleNonInputIdx / 2;
13564
13565 // We use xor with one to compute the adjacent DWord to whichever one the
13566 // OneInput is in.
13567 OneInputDWord = (OneInput / 2) ^ 1;
13568
13569 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13570 // and BToA inputs. If there is also such a problem with the BToB and AToB
13571 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13572 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13573 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13574 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
13575      // Compute how many inputs will be flipped by swapping these DWords. We
13576      // need to balance this to ensure we don't form a 3-1 shuffle in the
13577      // other half.
13578
13579 int NumFlippedAToBInputs =
13580 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
13581 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
13582 int NumFlippedBToBInputs =
13583 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
13584 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
13585 if ((NumFlippedAToBInputs == 1 &&
13586 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
13587 (NumFlippedBToBInputs == 1 &&
13588 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
13589 // We choose whether to fix the A half or B half based on whether that
13590 // half has zero flipped inputs. At zero, we may not be able to fix it
13591 // with that half. We also bias towards fixing the B half because that
13592 // will more commonly be the high half, and we have to bias one way.
13593 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
13594 ArrayRef<int> Inputs) {
13595 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
13596 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
13597 // Determine whether the free index is in the flipped dword or the
13598 // unflipped dword based on where the pinned index is. We use this bit
13599 // in an xor to conditionally select the adjacent dword.
13600 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
13601 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13602 if (IsFixIdxInput == IsFixFreeIdxInput)
13603 FixFreeIdx += 1;
13604 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13605          assert(IsFixIdxInput != IsFixFreeIdxInput &&
13606                 "We need to be changing the number of flipped inputs!");
13607 int PSHUFHalfMask[] = {0, 1, 2, 3};
13608 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
13609 V = DAG.getNode(
13610 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
13611 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
13612 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13613
13614 for (int &M : Mask)
13615 if (M >= 0 && M == FixIdx)
13616 M = FixFreeIdx;
13617 else if (M >= 0 && M == FixFreeIdx)
13618 M = FixIdx;
13619 };
13620 if (NumFlippedBToBInputs != 0) {
13621 int BPinnedIdx =
13622 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
13623 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
13624 } else {
13625          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
13626 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
13627 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
13628 }
13629 }
13630 }
13631
13632 int PSHUFDMask[] = {0, 1, 2, 3};
13633 PSHUFDMask[ADWord] = BDWord;
13634 PSHUFDMask[BDWord] = ADWord;
13635 V = DAG.getBitcast(
13636 VT,
13637 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13638 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13639
13640 // Adjust the mask to match the new locations of A and B.
13641 for (int &M : Mask)
13642 if (M >= 0 && M/2 == ADWord)
13643 M = 2 * BDWord + M % 2;
13644 else if (M >= 0 && M/2 == BDWord)
13645 M = 2 * ADWord + M % 2;
13646
13647 // Recurse back into this routine to re-compute state now that this isn't
13648 // a 3 and 1 problem.
13649 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
13650 };
13651 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
13652 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
13653 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
13654 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
13655
13656 // At this point there are at most two inputs to the low and high halves from
13657 // each half. That means the inputs can always be grouped into dwords and
13658 // those dwords can then be moved to the correct half with a dword shuffle.
13659 // We use at most one low and one high word shuffle to collect these paired
13660 // inputs into dwords, and finally a dword shuffle to place them.
13661 int PSHUFLMask[4] = {-1, -1, -1, -1};
13662 int PSHUFHMask[4] = {-1, -1, -1, -1};
13663 int PSHUFDMask[4] = {-1, -1, -1, -1};
13664
13665 // First fix the masks for all the inputs that are staying in their
13666 // original halves. This will then dictate the targets of the cross-half
13667 // shuffles.
13668 auto fixInPlaceInputs =
13669 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
13670 MutableArrayRef<int> SourceHalfMask,
13671 MutableArrayRef<int> HalfMask, int HalfOffset) {
13672 if (InPlaceInputs.empty())
13673 return;
13674 if (InPlaceInputs.size() == 1) {
13675 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13676 InPlaceInputs[0] - HalfOffset;
13677 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
13678 return;
13679 }
13680 if (IncomingInputs.empty()) {
13681 // Just fix all of the in place inputs.
13682 for (int Input : InPlaceInputs) {
13683 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13684 PSHUFDMask[Input / 2] = Input / 2;
13685 }
13686 return;
13687 }
13688
13689        assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
13690 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13691 InPlaceInputs[0] - HalfOffset;
13692 // Put the second input next to the first so that they are packed into
13693 // a dword. We find the adjacent index by toggling the low bit.
13694 int AdjIndex = InPlaceInputs[0] ^ 1;
13695 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13696 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
13697 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
13698 };
13699 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
13700 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
13701
13702 // Now gather the cross-half inputs and place them into a free dword of
13703 // their target half.
13704 // FIXME: This operation could almost certainly be simplified dramatically to
13705 // look more like the 3-1 fixing operation.
13706 auto moveInputsToRightHalf = [&PSHUFDMask](
13707 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
13708 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
13709 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
13710 int DestOffset) {
13711 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
13712 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
13713 };
13714 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
13715 int Word) {
13716 int LowWord = Word & ~1;
13717 int HighWord = Word | 1;
13718 return isWordClobbered(SourceHalfMask, LowWord) ||
13719 isWordClobbered(SourceHalfMask, HighWord);
13720 };
13721
13722 if (IncomingInputs.empty())
13723 return;
13724
13725 if (ExistingInputs.empty()) {
13726 // Map any dwords with inputs from them into the right half.
13727 for (int Input : IncomingInputs) {
13728 // If the source half mask maps over the inputs, turn those into
13729 // swaps and use the swapped lane.
13730 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13731 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13732 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13733 Input - SourceOffset;
13734 // We have to swap the uses in our half mask in one sweep.
13735 for (int &M : HalfMask)
13736 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13737 M = Input;
13738 else if (M == Input)
13739 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13740 } else {
13741            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13742                       Input - SourceOffset &&
13743                   "Previous placement doesn't match!");
13744 }
13745 // Note that this correctly re-maps both when we do a swap and when
13746 // we observe the other side of the swap above. We rely on that to
13747 // avoid swapping the members of the input list directly.
13748 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13749 }
13750
13751 // Map the input's dword into the correct half.
13752 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13753 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13754 else
13755          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13756                     Input / 2 &&
13757                 "Previous placement doesn't match!");
13758 }
13759
13760 // And just directly shift any other-half mask elements to be same-half
13761 // as we will have mirrored the dword containing the element into the
13762 // same position within that half.
13763 for (int &M : HalfMask)
13764 if (M >= SourceOffset && M < SourceOffset + 4) {
13765 M = M - SourceOffset + DestOffset;
13766          assert(M >= 0 && "This should never wrap below zero!");
13767 }
13768 return;
13769 }
13770
13771 // Ensure we have the input in a viable dword of its current half. This
13772 // is particularly tricky because the original position may be clobbered
13773 // by inputs being moved and *staying* in that half.
13774 if (IncomingInputs.size() == 1) {
13775 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13776 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13777 SourceOffset;
13778 SourceHalfMask[InputFixed - SourceOffset] =
13779 IncomingInputs[0] - SourceOffset;
13780 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
13781 InputFixed);
13782 IncomingInputs[0] = InputFixed;
13783 }
13784 } else if (IncomingInputs.size() == 2) {
13785 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
13786 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13787 // We have two non-adjacent or clobbered inputs we need to extract from
13788 // the source half. To do this, we need to map them into some adjacent
13789 // dword slot in the source mask.
13790 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
13791 IncomingInputs[1] - SourceOffset};
13792
13793 // If there is a free slot in the source half mask adjacent to one of
13794 // the inputs, place the other input in it. We use (Index XOR 1) to
13795 // compute an adjacent index.
13796 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
13797 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
13798 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
13799 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13800 InputsFixed[1] = InputsFixed[0] ^ 1;
13801 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
13802 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
13803 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
13804 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
13805 InputsFixed[0] = InputsFixed[1] ^ 1;
13806 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
13807 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
13808 // The two inputs are in the same DWord but it is clobbered and the
13809 // adjacent DWord isn't used at all. Move both inputs to the free
13810 // slot.
13811 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
13812 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
13813 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
13814 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
13815 } else {
13816 // The only way we hit this point is if there is no clobbering
13817 // (because there are no off-half inputs to this half) and there is no
13818 // free slot adjacent to one of the inputs. In this case, we have to
13819 // swap an input with a non-input.
13820 for (int i = 0; i < 4; ++i)
13821            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
13822                   "We can't handle any clobbers here!");
13823          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
13824                 "Cannot have adjacent inputs here!");
13825
13826 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13827 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
13828
13829 // We also have to update the final source mask in this case because
13830 // it may need to undo the above swap.
13831 for (int &M : FinalSourceHalfMask)
13832 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
13833 M = InputsFixed[1] + SourceOffset;
13834 else if (M == InputsFixed[1] + SourceOffset)
13835 M = (InputsFixed[0] ^ 1) + SourceOffset;
13836
13837 InputsFixed[1] = InputsFixed[0] ^ 1;
13838 }
13839
13840 // Point everything at the fixed inputs.
13841 for (int &M : HalfMask)
13842 if (M == IncomingInputs[0])
13843 M = InputsFixed[0] + SourceOffset;
13844 else if (M == IncomingInputs[1])
13845 M = InputsFixed[1] + SourceOffset;
13846
13847 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
13848 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
13849 }
13850 } else {
13851      llvm_unreachable("Unhandled input size!");
13852 }
13853
13854 // Now hoist the DWord down to the right half.
13855 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
13856    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
13857 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
13858 for (int &M : HalfMask)
13859 for (int Input : IncomingInputs)
13860 if (M == Input)
13861 M = FreeDWord * 2 + Input % 2;
13862 };
13863 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
13864 /*SourceOffset*/ 4, /*DestOffset*/ 0);
13865 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
13866 /*SourceOffset*/ 0, /*DestOffset*/ 4);
13867
13868 // Now enact all the shuffles we've computed to move the inputs into their
13869 // target half.
13870 if (!isNoopShuffleMask(PSHUFLMask))
13871 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13872 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
13873 if (!isNoopShuffleMask(PSHUFHMask))
13874 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13875 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
13876 if (!isNoopShuffleMask(PSHUFDMask))
13877 V = DAG.getBitcast(
13878 VT,
13879 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13880 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13881
13882 // At this point, each half should contain all its inputs, and we can then
13883 // just shuffle them into their final position.
13884  assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
13885         "Failed to lift all the high half inputs to the low mask!");
13886  assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
13887         "Failed to lift all the low half inputs to the high mask!");
13888
13889 // Do a half shuffle for the low mask.
13890 if (!isNoopShuffleMask(LoMask))
13891 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13892 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13893
13894 // Do a half shuffle with the high mask after shifting its values down.
13895 for (int &M : HiMask)
13896 if (M >= 0)
13897 M -= 4;
13898 if (!isNoopShuffleMask(HiMask))
13899 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13900 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13901
13902 return V;
13903}
13904
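A minimal standalone model of the three permutes this routine composes, in plain C++17 with no LLVM headers; pshuflw, pshufhw and pshufd below are scalar stand-ins for the instruction semantics, not the DAG nodes. It checks one concrete case: the single-input mask {0,2,0,2,0,2,0,2} is realizable as PSHUFLW [0,2,_,_] followed by PSHUFD [0,0,0,0], which is the shape of the DWord-pairs fast path described in the routine:

#include <array>
#include <cassert>
#include <cstdio>

using V8 = std::array<int, 8>; // eight 16-bit lanes, modeled as ints
using M4 = std::array<int, 4>; // a 2-bit-per-lane immediate, modeled as ints

static V8 pshuflw(V8 V, M4 M) { // permute the low 4 words, keep the high 4
  V8 R = V;
  for (int i = 0; i < 4; ++i)
    R[i] = V[M[i]];
  return R;
}
static V8 pshufhw(V8 V, M4 M) { // permute the high 4 words, keep the low 4
  V8 R = V;
  for (int i = 0; i < 4; ++i)
    R[4 + i] = V[4 + M[i]];
  return R;
}
static V8 pshufd(V8 V, M4 M) { // permute 32-bit lanes, i.e. word pairs
  V8 R{};
  for (int d = 0; d < 4; ++d) {
    R[2 * d + 0] = V[2 * M[d] + 0];
    R[2 * d + 1] = V[2 * M[d] + 1];
  }
  return R;
}

int main() {
  V8 In{100, 101, 102, 103, 104, 105, 106, 107};
  int Mask[8] = {0, 2, 0, 2, 0, 2, 0, 2};
  // Gather the (0,2) word pair into dword 0, then broadcast that dword.
  V8 Out = pshufd(pshuflw(In, {0, 2, 0, 0}), {0, 0, 0, 0});
  V8 Want;
  for (int i = 0; i < 8; ++i)
    Want[i] = In[Mask[i]];
  assert(Out == Want);
  // The high-half analogue: PSHUFHW [1,0,3,2] swaps adjacent high words.
  V8 HiSwapped = pshufhw(In, {1, 0, 3, 2});
  assert(HiSwapped[4] == 105 && HiSwapped[5] == 104);
  std::printf("ok\n");
  return 0;
}
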
13905/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
13906/// blend if only one input is used.
13907static SDValue lowerShuffleAsBlendOfPSHUFBs(
13908 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13909 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
13910  assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
13911         "Lane crossing shuffle masks not supported");
13912
13913 int NumBytes = VT.getSizeInBits() / 8;
13914 int Size = Mask.size();
13915 int Scale = NumBytes / Size;
13916
13917 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13918 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13919 V1InUse = false;
13920 V2InUse = false;
13921
13922 for (int i = 0; i < NumBytes; ++i) {
13923 int M = Mask[i / Scale];
13924 if (M < 0)
13925 continue;
13926
13927 const int ZeroMask = 0x80;
13928 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
13929 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
13930 if (Zeroable[i / Scale])
13931 V1Idx = V2Idx = ZeroMask;
13932
13933 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
13934 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
13935 V1InUse |= (ZeroMask != V1Idx);
13936 V2InUse |= (ZeroMask != V2Idx);
13937 }
13938
13939 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
13940 if (V1InUse)
13941 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
13942 DAG.getBuildVector(ShufVT, DL, V1Mask));
13943 if (V2InUse)
13944 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
13945 DAG.getBuildVector(ShufVT, DL, V2Mask));
13946
13947 // If we need shuffled inputs from both, blend the two.
13948 SDValue V;
13949 if (V1InUse && V2InUse)
13950 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
13951 else
13952 V = V1InUse ? V1 : V2;
13953
13954 // Cast the result back to the correct type.
13955 return DAG.getBitcast(VT, V);
13956}
13957
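A minimal standalone model of the PSHUFB-plus-OR blend built above, in plain C++17 with no LLVM headers. Each PSHUFB reads only from its own 16-byte operand, and a control byte with the high bit set (0x80) writes a zero, so shuffling V1 and V2 with complementary zeroing controls and OR-ing the two results yields the blend:

#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>

using B16 = std::array<uint8_t, 16>;

// Scalar model of PSHUFB: the high bit of a control byte zeroes the lane,
// otherwise the low 4 bits index into the source.
static B16 pshufb(const B16 &V, const B16 &Ctl) {
  B16 R{};
  for (int i = 0; i < 16; ++i)
    R[i] = (Ctl[i] & 0x80) ? 0 : V[Ctl[i] & 0x0F];
  return R;
}

int main() {
  B16 V1{}, V2{}, C1{}, C2{}, Blend{};
  for (int i = 0; i < 16; ++i) {
    V1[i] = uint8_t(i);
    V2[i] = uint8_t(0xA0 + i);
  }
  // Interleave the low bytes: even lanes from V1, odd lanes from V2. Each
  // control zeroes the lanes owned by the other input.
  for (int i = 0; i < 16; ++i) {
    C1[i] = (i % 2 == 0) ? uint8_t(i / 2) : uint8_t(0x80);
    C2[i] = (i % 2 == 1) ? uint8_t(i / 2) : uint8_t(0x80);
  }
  B16 S1 = pshufb(V1, C1), S2 = pshufb(V2, C2);
  for (int i = 0; i < 16; ++i)
    Blend[i] = S1[i] | S2[i];
  assert(Blend[0] == 0x00 && Blend[1] == 0xA0 && Blend[2] == 0x01 &&
         Blend[3] == 0xA1);
  std::printf("ok\n");
  return 0;
}
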
13958/// Generic lowering of 8-lane i16 shuffles.
13959///
13960/// This handles both single-input shuffles and combined shuffle/blends with
13961/// two inputs. The single input shuffles are immediately delegated to
13962/// a dedicated lowering routine.
13963///
13964/// The blends are lowered in one of three fundamental ways. If there are few
13965/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
13966/// of the input is significantly cheaper when lowered as an interleaving of
13967/// the two inputs, try to interleave them. Otherwise, blend the low and high
13968/// halves of the inputs separately (making them have relatively few inputs)
13969/// and then concatenate them.
13970static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13971 const APInt &Zeroable, SDValue V1, SDValue V2,
13972 const X86Subtarget &Subtarget,
13973 SelectionDAG &DAG) {
13974  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13975  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13976  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13977
13978 // Whenever we can lower this as a zext, that instruction is strictly faster
13979 // than any alternative.
13980 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
13981 Zeroable, Subtarget, DAG))
13982 return ZExt;
13983
13984 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
13985
13986 if (NumV2Inputs == 0) {
13987 // Try to use shift instructions.
13988 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
13989 Zeroable, Subtarget, DAG))
13990 return Shift;
13991
13992 // Check for being able to broadcast a single element.
13993 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
13994 Mask, Subtarget, DAG))
13995 return Broadcast;
13996
13997 // Use dedicated unpack instructions for masks that match their pattern.
13998 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13999 return V;
14000
14001 // Use dedicated pack instructions for masks that match their pattern.
14002 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
14003 Subtarget))
14004 return V;
14005
14006 // Try to use byte rotation instructions.
14007 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14008 Subtarget, DAG))
14009 return Rotate;
14010
14011 // Make a copy of the mask so it can be modified.
14012 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
14013 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14014 Subtarget, DAG);
14015 }
14016
14017  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14018         "All single-input shuffles should be canonicalized to be V1-input "
14019         "shuffles.");
14020
14021 // Try to use shift instructions.
14022 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
14023 Zeroable, Subtarget, DAG))
14024 return Shift;
14025
14026 // See if we can use SSE4A Extraction / Insertion.
14027 if (Subtarget.hasSSE4A())
14028 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14029 Zeroable, DAG))
14030 return V;
14031
14032 // There are special ways we can lower some single-element blends.
14033 if (NumV2Inputs == 1)
14034 if (SDValue V = lowerShuffleAsElementInsertion(
14035 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14036 return V;
14037
14038 // We have different paths for blend lowering, but they all must use the
14039 // *exact* same predicate.
14040 bool IsBlendSupported = Subtarget.hasSSE41();
14041 if (IsBlendSupported)
14042 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14043 Zeroable, Subtarget, DAG))
14044 return Blend;
14045
14046 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14047 Zeroable, Subtarget, DAG))
14048 return Masked;
14049
14050 // Use dedicated unpack instructions for masks that match their pattern.
14051 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
14052 return V;
14053
14054 // Use dedicated pack instructions for masks that match their pattern.
14055 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
14056 Subtarget))
14057 return V;
14058
14059 // Try to use byte rotation instructions.
14060 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14061 Subtarget, DAG))
14062 return Rotate;
14063
14064 if (SDValue BitBlend =
14065 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14066 return BitBlend;
14067
14068 // Try to use byte shift instructions to mask.
14069 if (SDValue V = lowerVectorShuffleAsByteShiftMask(
14070 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14071 return V;
14072
14073 // Try to lower by permuting the inputs into an unpack instruction.
14074 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14075 Mask, Subtarget, DAG))
14076 return Unpack;
14077
14078 // If we can't directly blend but can use PSHUFB, that will be better as it
14079 // can both shuffle and set up the inefficient blend.
14080 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14081 bool V1InUse, V2InUse;
14082 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14083 Zeroable, DAG, V1InUse, V2InUse);
14084 }
14085
14086 // We can always bit-blend if we have to so the fallback strategy is to
14087 // decompose into single-input permutes and blends.
14088 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
14089 Mask, Subtarget, DAG);
14090}
14091
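A minimal standalone sketch of the fallback named in the last comment: decompose a two-input shuffle into one single-input permute per operand plus a per-lane blend. This is plain C++17 with no LLVM headers and illustrates only the idea, not the exact choices lowerShuffleAsDecomposedShuffleBlend makes:

#include <array>
#include <cassert>
#include <cstdio>

using V8 = std::array<int, 8>;

int main() {
  // Two-input v8 mask: entries 0..7 pick from V1, 8..15 from V2.
  std::array<int, 8> Mask{2, 9, 0, 11, 6, 13, 4, 15};
  V8 V1{0, 1, 2, 3, 4, 5, 6, 7}, V2{10, 11, 12, 13, 14, 15, 16, 17};

  // Split into a per-operand permute plus a per-lane select.
  std::array<int, 8> V1Mask{}, V2Mask{};
  std::array<bool, 8> UseV2{};
  for (int i = 0; i < 8; ++i) {
    bool FromV2 = Mask[i] >= 8;
    UseV2[i] = FromV2;
    V1Mask[i] = FromV2 ? i : Mask[i];     // lanes the blend discards keep i
    V2Mask[i] = FromV2 ? Mask[i] - 8 : i;
  }

  V8 P1{}, P2{}, Out{}, Want{};
  for (int i = 0; i < 8; ++i) {
    P1[i] = V1[V1Mask[i]]; // single-input permute of V1
    P2[i] = V2[V2Mask[i]]; // single-input permute of V2
  }
  for (int i = 0; i < 8; ++i)
    Out[i] = UseV2[i] ? P2[i] : P1[i]; // the blend step
  for (int i = 0; i < 8; ++i)
    Want[i] = Mask[i] < 8 ? V1[Mask[i]] : V2[Mask[i] - 8];
  assert(Out == Want);
  std::printf("ok\n");
  return 0;
}
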
14092/// Check whether a compaction lowering can be done by dropping even
14093/// elements and compute how many times even elements must be dropped.
14094///
14095/// This handles shuffles which take every Nth element where N is a power of
14096/// two. Example shuffle masks:
14097///
14098/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
14099/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
14100/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
14101/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
14102/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
14103/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
14104///
14105/// Any of these lanes can of course be undef.
14106///
14107/// This routine only supports N <= 3.
14108/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
14109/// for larger N.
14110///
14111/// \returns N above, or the number of times even elements must be dropped if
14112/// there is such a number. Otherwise returns zero.
14113static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
14114 bool IsSingleInput) {
14115 // The modulus for the shuffle vector entries is based on whether this is
14116 // a single input or not.
14117 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
14118  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
14119         "We should only be called with masks with a power-of-2 size!");
14120
14121 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
14122
14123 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
14124 // and 2^3 simultaneously. This is because we may have ambiguity with
14125 // partially undef inputs.
14126 bool ViableForN[3] = {true, true, true};
14127
14128 for (int i = 0, e = Mask.size(); i < e; ++i) {
14129 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
14130 // want.
14131 if (Mask[i] < 0)
14132 continue;
14133
14134 bool IsAnyViable = false;
14135 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
14136 if (ViableForN[j]) {
14137 uint64_t N = j + 1;
14138
14139 // The shuffle mask must be equal to (i * 2^N) % M.
14140 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
14141 IsAnyViable = true;
14142 else
14143 ViableForN[j] = false;
14144 }
14145 // Early exit if we exhaust the possible powers of two.
14146 if (!IsAnyViable)
14147 break;
14148 }
14149
14150 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
14151 if (ViableForN[j])
14152 return j + 1;
14153
14154 // Return 0 as there is no viable power of two.
14155 return 0;
14156}
14157
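A minimal standalone restatement of the check this helper performs, run against one of the example masks from its comment (the N = 2 single-input pattern 0, 4, 8, 12 repeated across 16 lanes). Plain C++17, no LLVM headers; droppingEvenElementsFactor is a name made up for this sketch:

#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>

// Return the smallest N in 1..3 such that every defined mask entry equals
// (i * 2^N) modulo the shuffle modulus, or 0 if no such N exists.
static int droppingEvenElementsFactor(const std::array<int, 16> &Mask,
                                      bool IsSingleInput) {
  uint64_t ModMask = uint64_t(Mask.size() * (IsSingleInput ? 1 : 2)) - 1;
  for (uint64_t N = 1; N <= 3; ++N) {
    bool Viable = true;
    for (uint64_t i = 0; i < Mask.size(); ++i)
      if (Mask[i] >= 0 && uint64_t(Mask[i]) != ((i << N) & ModMask)) {
        Viable = false;
        break;
      }
    if (Viable)
      return int(N);
  }
  return 0;
}

int main() {
  std::array<int, 16> M{0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12};
  assert(droppingEvenElementsFactor(M, /*IsSingleInput=*/true) == 2);
  std::printf("ok\n");
  return 0;
}
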
14158static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14159 ArrayRef<int> Mask, SDValue V1,
14160 SDValue V2, SelectionDAG &DAG) {
14161 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
14162 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
14163
14164 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
14165 if (V2.isUndef())
14166 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
14167
14168 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
14169}
14170
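A minimal standalone sketch of the lane selection these permute nodes model: a single-source VPERMV yields result[i] = V1[Idx[i]], while the three-operand form indexes the concatenation of both sources, so indices below the element count read V1 and the rest read V2. Plain C++17, no LLVM headers; permv3 is a name made up for this sketch:

#include <array>
#include <cassert>
#include <cstddef>
#include <cstdio>

// Two-source variable permute: indices 0..N-1 read V1, N..2N-1 read V2.
template <std::size_t N>
static std::array<int, N> permv3(const std::array<int, N> &V1,
                                 const std::array<int, N> &V2,
                                 const std::array<int, N> &Idx) {
  std::array<int, N> R{};
  for (std::size_t i = 0; i < N; ++i)
    R[i] = std::size_t(Idx[i]) < N ? V1[Idx[i]] : V2[Idx[i] - N];
  return R;
}

int main() {
  std::array<int, 4> A{1, 2, 3, 4}, B{5, 6, 7, 8};
  std::array<int, 4> R = permv3<4>(A, B, {0, 4, 3, 7});
  assert((R == std::array<int, 4>{1, 5, 4, 8}));
  std::printf("ok\n");
  return 0;
}
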
14171/// Generic lowering of v16i8 shuffles.
14172///
14173/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14174/// detect any complexity reducing interleaving. If that doesn't help, it uses
14175/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14176/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14177/// back together.
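As an illustration of the spread/shuffle/pack shape just described, here is a minimal standalone sketch using the simplest variant: zero-extend bytes to words (as PUNPCKLBW/PUNPCKHBW against a zero vector would), shuffle in the i16 domain, then narrow back (as PACKUSWB would). It shows only the shape of the strategy, not the exact interleaving the real lowering picks; plain C++17, no LLVM headers:

#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  std::array<uint8_t, 16> In{};
  for (int i = 0; i < 16; ++i)
    In[i] = uint8_t(i + 1);

  // Spread: each 8-byte half becomes 8 zero-extended 16-bit words.
  std::array<uint16_t, 8> LoW{}, HiW{};
  for (int i = 0; i < 8; ++i) {
    LoW[i] = In[i];
    HiW[i] = In[8 + i];
  }

  // Shuffle in the i16 domain: reverse each half (any v8i16 shuffle works).
  std::array<uint16_t, 8> LoS{}, HiS{};
  for (int i = 0; i < 8; ++i) {
    LoS[i] = LoW[7 - i];
    HiS[i] = HiW[7 - i];
  }

  // Pack back to bytes with unsigned saturation; a no-op for in-range values.
  std::array<uint8_t, 16> Out{};
  for (int i = 0; i < 8; ++i) {
    Out[i] = uint8_t(LoS[i] > 255 ? 255 : LoS[i]);
    Out[8 + i] = uint8_t(HiS[i] > 255 ? 255 : HiS[i]);
  }
  assert(Out[0] == 8 && Out[7] == 1 && Out[8] == 16 && Out[15] == 9);
  std::printf("ok\n");
  return 0;
}
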
14178static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14179 const APInt &Zeroable, SDValue V1, SDValue V2,
14180 const X86Subtarget &Subtarget,
14181 SelectionDAG &DAG) {
14182  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14183  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14184  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14185
14186 // Try to use shift instructions.
14187 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
14188 Zeroable, Subtarget, DAG))
14189 return Shift;
14190
14191 // Try to use byte rotation instructions.
14192 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14193 Subtarget, DAG))
14194 return Rotate;
14195
14196 // Use dedicated pack instructions for masks that match their pattern.
14197 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
14198 Subtarget))
14199 return V;
14200
14201 // Try to use a zext lowering.
14202 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14203 Zeroable, Subtarget, DAG))
14204 return ZExt;
14205
14206 // See if we can use SSE4A Extraction / Insertion.
14207 if (Subtarget.hasSSE4A())
14208 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14209 Zeroable, DAG))
14210 return V;
14211
14212 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14213
14214 // For single-input shuffles, there are some nicer lowering tricks we can use.
14215 if (NumV2Elements == 0) {
14216 // Check for being able to broadcast a single element.
14217 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14218 Mask, Subtarget, DAG))
14219 return Broadcast;
14220
14221 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14222 return V;
14223
14224 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14225 // Notably, this handles splat and partial-splat shuffles more efficiently.
14226 // However, it only makes sense if the pre-duplication shuffle simplifies
14227 // things significantly. Currently, this means we need to be able to
14228 // express the pre-duplication shuffle as an i16 shuffle.
14229 //
14230 // FIXME: We should check for other patterns which can be widened into an
14231 // i16 shuffle as well.
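// Editorial illustration (assumption, not from the original source): a
// partial byte splat such as Mask = <5,5,5,5,5,5,5,5,-1,...,-1> passes
// canWidenViaDuplication below because every adjacent byte pair refers to
// the same (or an undef) source byte, so it can be expressed as an i16
// shuffle that duplicates i16 element 2 before the unpack.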
14232 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14233 for (int i = 0; i < 16; i += 2)
14234 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14235 return false;
14236
14237 return true;
14238 };
14239 auto tryToWidenViaDuplication = [&]() -> SDValue {
14240 if (!canWidenViaDuplication(Mask))
14241 return SDValue();
14242 SmallVector<int, 4> LoInputs;
14243 copy_if(Mask, std::back_inserter(LoInputs),
14244 [](int M) { return M >= 0 && M < 8; });
14245 array_pod_sort(LoInputs.begin(), LoInputs.end());
14246 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
14247 LoInputs.end());
14248 SmallVector<int, 4> HiInputs;
14249 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14250 array_pod_sort(HiInputs.begin(), HiInputs.end());
14251 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
14252 HiInputs.end());
14253
14254 bool TargetLo = LoInputs.size() >= HiInputs.size();
14255 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14256 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14257
14258 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14259 SmallDenseMap<int, int, 8> LaneMap;
14260 for (int I : InPlaceInputs) {
14261 PreDupI16Shuffle[I/2] = I/2;
14262 LaneMap[I] = I;
14263 }
14264 int j = TargetLo ? 0 : 4, je = j + 4;
14265 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14266 // Check if j is already a shuffle of this input. This happens when
14267 // there are two adjacent bytes after we move the low one.
14268 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14269 // If we haven't yet mapped the input, search for a slot into which
14270 // we can map it.
14271 while (j < je && PreDupI16Shuffle[j] >= 0)
14272 ++j;
14273
14274 if (j == je)
14275 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14276 return SDValue();
14277
14278 // Map this input with the i16 shuffle.
14279 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14280 }
14281
14282 // Update the lane map based on the mapping we ended up with.
14283 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14284 }
14285 V1 = DAG.getBitcast(
14286 MVT::v16i8,
14287 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14288 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14289
14290 // Unpack the bytes to form the i16s that will be shuffled into place.
14291 bool EvenInUse = false, OddInUse = false;
14292 for (int i = 0; i < 16; i += 2) {
14293 EvenInUse |= (Mask[i + 0] >= 0);
14294 OddInUse |= (Mask[i + 1] >= 0);
14295 if (EvenInUse && OddInUse)
14296 break;
14297 }
14298 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14299 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14300 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14301
14302 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14303 for (int i = 0; i < 16; ++i)
14304 if (Mask[i] >= 0) {
14305 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14306 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14307 if (PostDupI16Shuffle[i / 2] < 0)
14308 PostDupI16Shuffle[i / 2] = MappedMask;
14309 else
14310 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14311 "Conflicting entries in the original shuffle!");
14312 }
14313 return DAG.getBitcast(
14314 MVT::v16i8,
14315 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14316 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14317 };
14318 if (SDValue V = tryToWidenViaDuplication())
14319 return V;
14320 }
14321
14322 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14323 Zeroable, Subtarget, DAG))
14324 return Masked;
14325
14326 // Use dedicated unpack instructions for masks that match their pattern.
14327 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14328 return V;
14329
14330 // Try to use byte shift instructions to mask.
14331 if (SDValue V = lowerVectorShuffleAsByteShiftMask(
14332 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14333 return V;
14334
14335 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14336 // with PSHUFB. It is important to do this before we attempt to generate any
14337 // blends but after all of the single-input lowerings. If the single input
14338 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14339 // want to preserve that and we can DAG combine any longer sequences into
14340 // a PSHUFB in the end. But once we start blending from multiple inputs,
14341 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14342 // and there are *very* few patterns that would actually be faster than the
14343 // PSHUFB approach because of its ability to zero lanes.
14344 //
14345 // FIXME: The only exceptions to the above are blends which are exact
14346 // interleavings with direct instructions supporting them. We currently don't
14347 // handle those well here.
14348 if (Subtarget.hasSSSE3()) {
14349 bool V1InUse = false;
14350 bool V2InUse = false;
14351
14352 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14353 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14354
14355 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14356 // do so. This avoids using them to handle blends-with-zero which is
14357 // important as a single pshufb is significantly faster for that.
14358 if (V1InUse && V2InUse) {
14359 if (Subtarget.hasSSE41())
14360 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14361 Zeroable, Subtarget, DAG))
14362 return Blend;
14363
14364 // We can use an unpack to do the blending rather than an or in some
14365 // cases. Even though the or may be (very minorly) more efficient, we
14366 // prefer this lowering because there are common cases where part of
14367 // the complexity of the shuffles goes away when we do the final blend as
14368 // an unpack.
14369 // FIXME: It might be worth trying to detect if the unpack-feeding
14370 // shuffles will both be pshufb, in which case we shouldn't bother with
14371 // this.
14372 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14373 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14374 return Unpack;
14375
14376 // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
14377 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
14378 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
14379
14380 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14381 // PALIGNR will be cheaper than the second PSHUFB+OR.
14382 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14383 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14384 return V;
14385 }
14386
14387 return PSHUFB;
14388 }
14389
14390 // There are special ways we can lower some single-element blends.
14391 if (NumV2Elements == 1)
14392 if (SDValue V = lowerShuffleAsElementInsertion(
14393 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14394 return V;
14395
14396 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14397 return Blend;
14398
14399 // Check whether a compaction lowering can be done. This handles shuffles
14400 // which take every Nth element for some even N. See the helper function for
14401 // details.
14402 //
14403 // We special case these as they can be particularly efficiently handled with
14404 // the PACKUSWB instruction on x86 and they show up in common patterns of
14405 // rearranging bytes to truncate wide elements.
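// Editorial illustration (not part of the original source): for the
// two-input byte mask <0,2,4,...,30> (every second byte of the concatenated
// inputs), canLowerByDroppingEvenElements returns 1, and the code below
// masks both operands with <0xFF,0x00,0xFF,0x00,...> and emits a single
// X86ISD::PACKUS to produce the compacted result.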
14406 bool IsSingleInput = V2.isUndef();
14407 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
14408 // NumEvenDrops is the power of two stride of the elements. Another way of
14409 // thinking about it is that we need to drop the even elements this many
14410 // times to get the original input.
14411
14412 // First we need to zero all the dropped bytes.
14413 assert(NumEvenDrops <= 3 &&
14414 "No support for dropping even elements more than 3 times.");
14415 SmallVector<SDValue, 16> ByteClearOps(16, DAG.getConstant(0, DL, MVT::i8));
14416 for (unsigned i = 0; i != 16; i += 1 << NumEvenDrops)
14417 ByteClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i8);
14418 SDValue ByteClearMask = DAG.getBuildVector(MVT::v16i8, DL, ByteClearOps);
14419 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
14420 if (!IsSingleInput)
14421 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
14422
14423 // Now pack things back together.
14424 V1 = DAG.getBitcast(MVT::v8i16, V1);
14425 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
14426 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
14427 for (int i = 1; i < NumEvenDrops; ++i) {
14428 Result = DAG.getBitcast(MVT::v8i16, Result);
14429 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14430 }
14431
14432 return Result;
14433 }
14434
14435 // Handle multi-input cases by blending single-input shuffles.
14436 if (NumV2Elements > 0)
14437 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask,
14438 Subtarget, DAG);
14439
14440 // The fallback path for single-input shuffles widens this into two v8i16
14441 // vectors with unpacks, shuffles those, and then pulls them back together
14442 // with a pack.
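// Editorial illustration (not from the original source): for the byte
// reversal Mask = <15,14,...,1,0>, the fallback below zero-extends bytes
// 0-7 and 8-15 of V into two v8i16 halves via UNPCKL/UNPCKH, reverses each
// half with a v8i16 shuffle, and truncates back to v16i8 with PACKUS, which
// is safe because the high byte of every i16 lane is already zero.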
14443 SDValue V = V1;
14444
14445 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14446 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14447 for (int i = 0; i < 16; ++i)
14448 if (Mask[i] >= 0)
14449 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
14450
14451 SDValue VLoHalf, VHiHalf;
14452 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
14453 // them out and avoid using UNPCK{L,H} to extract the elements of V as
14454 // i16s.
14455 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
14456 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
14457 // Use a mask to drop the high bytes.
14458 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
14459 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
14460 DAG.getConstant(0x00FF, DL, MVT::v8i16));
14461
14462 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
14463 VHiHalf = DAG.getUNDEF(MVT::v8i16);
14464
14465 // Squash the masks to point directly into VLoHalf.
14466 for (int &M : LoBlendMask)
14467 if (M >= 0)
14468 M /= 2;
14469 for (int &M : HiBlendMask)
14470 if (M >= 0)
14471 M /= 2;
14472 } else {
14473 // Otherwise just unpack the low half of V into VLoHalf and the high half into
14474 // VHiHalf so that we can blend them as i16s.
14475 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
14476
14477 VLoHalf = DAG.getBitcast(
14478 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
14479 VHiHalf = DAG.getBitcast(
14480 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
14481 }
14482
14483 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
14484 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
14485
14486 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
14487}
14488
14489/// Dispatching routine to lower various 128-bit x86 vector shuffles.
14490///
14491/// This routine breaks down the specific type of 128-bit shuffle and
14492/// dispatches to the lowering routines accordingly.
14493static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14494 MVT VT, SDValue V1, SDValue V2,
14495 const APInt &Zeroable,
14496 const X86Subtarget &Subtarget,
14497 SelectionDAG &DAG) {
14498 switch (VT.SimpleTy) {
14499 case MVT::v2i64:
14500 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14501 case MVT::v2f64:
14502 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14503 case MVT::v4i32:
14504 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14505 case MVT::v4f32:
14506 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14507 case MVT::v8i16:
14508 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14509 case MVT::v16i8:
14510 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14511
14512 default:
14513 llvm_unreachable("Unimplemented!");
14514 }
14515}
14516
14517/// Generic routine to split vector shuffle into half-sized shuffles.
14518///
14519/// This routine just extracts two subvectors, shuffles them independently, and
14520/// then concatenates them back together. This should work effectively with all
14521/// AVX vector shuffle types.
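///
/// Editorial illustration (assumption, not from the original source): a
/// v8f32 two-input shuffle splits into the four v4f32 halves
/// LoV1/HiV1/LoV2/HiV2; each half of the result is built as an (at most)
/// 4-way blend of those halves, and the two results are rejoined with
/// CONCAT_VECTORS.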
14522static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
14523 SDValue V2, ArrayRef<int> Mask,
14524 SelectionDAG &DAG) {
14525 assert(VT.getSizeInBits() >= 256 &&
14526 "Only for 256-bit or wider vector shuffles!");
14527 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
14528 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
14529
14530 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
14531 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
14532
14533 int NumElements = VT.getVectorNumElements();
14534 int SplitNumElements = NumElements / 2;
14535 MVT ScalarVT = VT.getVectorElementType();
14536 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
14537
14538 // Rather than splitting build-vectors, just build two narrower build
14539 // vectors. This helps shuffling with splats and zeros.
14540 auto SplitVector = [&](SDValue V) {
14541 V = peekThroughBitcasts(V);
14542
14543 MVT OrigVT = V.getSimpleValueType();
14544 int OrigNumElements = OrigVT.getVectorNumElements();
14545 int OrigSplitNumElements = OrigNumElements / 2;
14546 MVT OrigScalarVT = OrigVT.getVectorElementType();
14547 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
14548
14549 SDValue LoV, HiV;
14550
14551 auto *BV = dyn_cast<BuildVectorSDNode>(V);
14552 if (!BV) {
14553 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
14554 DAG.getIntPtrConstant(0, DL));
14555 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
14556 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
14557 } else {
14558
14559 SmallVector<SDValue, 16> LoOps, HiOps;
14560 for (int i = 0; i < OrigSplitNumElements; ++i) {
14561 LoOps.push_back(BV->getOperand(i));
14562 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
14563 }
14564 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
14565 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
14566 }
14567 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
14568 DAG.getBitcast(SplitVT, HiV));
14569 };
14570
14571 SDValue LoV1, HiV1, LoV2, HiV2;
14572 std::tie(LoV1, HiV1) = SplitVector(V1);
14573 std::tie(LoV2, HiV2) = SplitVector(V2);
14574
14575 // Now create two 4-way blends of these half-width vectors.
14576 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
14577 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
14578 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14579 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14580 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14581 for (int i = 0; i < SplitNumElements; ++i) {
14582 int M = HalfMask[i];
14583 if (M >= NumElements) {
14584 if (M >= NumElements + SplitNumElements)
14585 UseHiV2 = true;
14586 else
14587 UseLoV2 = true;
14588 V2BlendMask[i] = M - NumElements;
14589 BlendMask[i] = SplitNumElements + i;
14590 } else if (M >= 0) {
14591 if (M >= SplitNumElements)
14592 UseHiV1 = true;
14593 else
14594 UseLoV1 = true;
14595 V1BlendMask[i] = M;
14596 BlendMask[i] = i;
14597 }
14598 }
14599
14600 // Because the lowering happens after all combining takes place, we need to
14601 // manually combine these blend masks as much as possible so that we create
14602 // a minimal number of high-level vector shuffle nodes.
14603
14604 // First try just blending the halves of V1 or V2.
14605 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
14606 return DAG.getUNDEF(SplitVT);
14607 if (!UseLoV2 && !UseHiV2)
14608 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14609 if (!UseLoV1 && !UseHiV1)
14610 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14611
14612 SDValue V1Blend, V2Blend;
14613 if (UseLoV1 && UseHiV1) {
14614 V1Blend =
14615 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14616 } else {
14617 // We only use half of V1 so map the usage down into the final blend mask.
14618 V1Blend = UseLoV1 ? LoV1 : HiV1;
14619 for (int i = 0; i < SplitNumElements; ++i)
14620 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
14621 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14622 }
14623 if (UseLoV2 && UseHiV2) {
14624 V2Blend =
14625 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14626 } else {
14627 // We only use half of V2 so map the usage down into the final blend mask.
14628 V2Blend = UseLoV2 ? LoV2 : HiV2;
14629 for (int i = 0; i < SplitNumElements; ++i)
14630 if (BlendMask[i] >= SplitNumElements)
14631 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
14632 }
14633 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
14634 };
14635 SDValue Lo = HalfBlend(LoMask);
14636 SDValue Hi = HalfBlend(HiMask);
14637 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
14638}
14639
14640/// Either split a vector in halves or decompose the shuffles and the
14641/// blend.
14642///
14643/// This is provided as a good fallback for many lowerings of non-single-input
14644/// shuffles with more than one 128-bit lane. In those cases, we want to select
14645/// between splitting the shuffle into 128-bit components and stitching those
14646/// back together vs. extracting the single-input shuffles and blending those
14647/// results.
14648static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
14649 SDValue V2, ArrayRef<int> Mask,
14650 const X86Subtarget &Subtarget,
14651 SelectionDAG &DAG) {
14652 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
14653 "shuffles as it could then recurse on itself.");
14654 int Size = Mask.size();
14655
14656 // If this can be modeled as a broadcast of two elements followed by a blend,
14657 // prefer that lowering. This is especially important because broadcasts can
14658 // often fold with memory operands.
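// Editorial illustration (assumption, not from the original source): for a
// v8f32 shuffle with Mask = <0,0,0,0,8,8,8,8>, both inputs are
// single-element broadcasts (element 0 of V1 and element 0 of V2), so
// DoBothBroadcast() returns true and the decomposed broadcast-then-blend
// lowering below is preferred.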
14659 auto DoBothBroadcast = [&] {
14660 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
14661 for (int M : Mask)
14662 if (M >= Size) {
14663 if (V2BroadcastIdx < 0)
14664 V2BroadcastIdx = M - Size;
14665 else if (M - Size != V2BroadcastIdx)
14666 return false;
14667 } else if (M >= 0) {
14668 if (V1BroadcastIdx < 0)
14669 V1BroadcastIdx = M;
14670 else if (M != V1BroadcastIdx)
14671 return false;
14672 }
14673 return true;
14674 };
14675 if (DoBothBroadcast())
14676 return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
14677 Subtarget, DAG);
14678
14679 // If the inputs all stem from a single 128-bit lane of each input, then we
14680 // split them rather than blending because the split will decompose to
14681 // unusually few instructions.
14682 int LaneCount = VT.getSizeInBits() / 128;
14683 int LaneSize = Size / LaneCount;
14684 SmallBitVector LaneInputs[2];
14685 LaneInputs[0].resize(LaneCount, false);
14686 LaneInputs[1].resize(LaneCount, false);
14687 for (int i = 0; i < Size; ++i)
14688 if (Mask[i] >= 0)
14689 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
14690 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
14691 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
14692
14693 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
14694 // that the decomposed single-input shuffles don't end up here.
14695 return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget,
14696 DAG);
14697}
14698
14699/// Lower a vector shuffle crossing multiple 128-bit lanes as
14700/// a lane permutation followed by a per-lane permutation.
14701///
14702/// This is mainly for cases where we can have non-repeating permutes
14703/// in each lane.
14704///
14705/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
14706/// we should investigate merging them.
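///
/// Editorial illustration (not from the original source): for a v8f32
/// single-input shuffle with Mask = <7,6,5,4,3,2,1,0>, each destination
/// lane draws from exactly one source lane, so this lowers to a lane
/// permute with LaneMask = <4,5,6,7,0,1,2,3> followed by the in-lane
/// permute PermMask = <3,2,1,0,7,6,5,4>.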
14707static SDValue lowerShuffleAsLanePermuteAndPermute(
14708 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14709 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
14710 int NumElts = VT.getVectorNumElements();
14711 int NumLanes = VT.getSizeInBits() / 128;
14712 int NumEltsPerLane = NumElts / NumLanes;
14713
14714 SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
14715 SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);
14716
14717 for (int i = 0; i != NumElts; ++i) {
14718 int M = Mask[i];
14719 if (M < 0)
14720 continue;
14721
14722 // Ensure that each lane comes from a single source lane.
14723 int SrcLane = M / NumEltsPerLane;
14724 int DstLane = i / NumEltsPerLane;
14725 if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane))
14726 return SDValue();
14727 SrcLaneMask[DstLane] = SrcLane;
14728
14729 PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
14730 }
14731
14732 // Make sure we set all elements of the lane mask, to avoid undef propagation.
14733 SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
14734 for (int DstLane = 0; DstLane != NumLanes; ++DstLane) {
14735 int SrcLane = SrcLaneMask[DstLane];
14736 if (0 <= SrcLane)
14737 for (int j = 0; j != NumEltsPerLane; ++j) {
14738 LaneMask[(DstLane * NumEltsPerLane) + j] =
14739 (SrcLane * NumEltsPerLane) + j;
14740 }
14741 }
14742
14743 // If we're only shuffling a single lowest lane and the rest are identity
14744 // then don't bother.
14745 // TODO - isShuffleMaskInputInPlace could be extended to something like this.
14746 int NumIdentityLanes = 0;
14747 bool OnlyShuffleLowestLane = true;
14748 for (int i = 0; i != NumLanes; ++i) {
14749 if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane,
14750 i * NumEltsPerLane))
14751 NumIdentityLanes++;
14752 else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes)
14753 OnlyShuffleLowestLane = false;
14754 }
14755 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
14756 return SDValue();
14757
14758 SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask);
14759 return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
14760}
14761
14762/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
14763/// source with a lane permutation.
14764///
14765/// This lowering strategy results in four instructions in the worst case for a
14766/// single-input cross lane shuffle which is lower than any other fully general
14767/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
14768/// shuffle pattern should be handled prior to trying this lowering.
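///
/// Editorial illustration (assumption, not from the original source): for a
/// single-input v8f32 shuffle whose mask crosses both 128-bit lanes, the
/// code below bitcasts to v4f64, swaps the lanes with the mask <2,3,0,1>,
/// and then shuffles V1 against the flipped copy with a purely in-lane
/// mask.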
14769static SDValue lowerShuffleAsLanePermuteAndShuffle(
14770 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14771 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
14772 // FIXME: This should probably be generalized for 512-bit vectors as well.
14773 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
14774 int Size = Mask.size();
14775 int LaneSize = Size / 2;
14776
14777 // If there are only inputs from one 128-bit lane, splitting will in fact be
14778 // less expensive. The flags track whether the given lane contains an element
14779 // that crosses to another lane.
14780 if (!Subtarget.hasAVX2()) {
14781 bool LaneCrossing[2] = {false, false};
14782 for (int i = 0; i < Size; ++i)
14783 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
14784 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
14785 if (!LaneCrossing[0] || !LaneCrossing[1])
14786 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
14787 } else {
14788 bool LaneUsed[2] = {false, false};
14789 for (int i = 0; i < Size; ++i)
14790 if (Mask[i] >= 0)
14791 LaneUsed[(Mask[i] / LaneSize)] = true;
14792 if (!LaneUsed[0] || !LaneUsed[1])
14793 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
14794 }
14795
14796 // TODO - we could support shuffling V2 in the Flipped input.
14797 assert(V2.isUndef() &&
14798 "This last part of this routine only works on single input shuffles");
14799
14800 SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
14801 for (int i = 0; i < Size; ++i) {
14802 int &M = InLaneMask[i];
14803 if (M < 0)
14804 continue;
14805 if (((M % Size) / LaneSize) != (i / LaneSize))
14806 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
14807 }
14808 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
14809 "In-lane shuffle mask expected");
14810
14811 // Flip the lanes, and shuffle the results which should now be in-lane.
14812 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
14813 SDValue Flipped = DAG.getBitcast(PVT, V1);
14814 Flipped =
14815 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
14816 Flipped = DAG.getBitcast(VT, Flipped);
14817 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
14818}
14819
14820/// Handle lowering 2-lane 128-bit shuffles.
14821static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
14822 SDValue V2, ArrayRef<int> Mask,
14823 const APInt &Zeroable,
14824 const X86Subtarget &Subtarget,
14825 SelectionDAG &DAG) {
14826 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
14827 if (Subtarget.hasAVX2() && V2.isUndef())
14828 return SDValue();
14829
14830 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
14831
14832 SmallVector<int, 4> WidenedMask;
14833 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
14834 return SDValue();
14835
14836 bool IsLowZero = (Zeroable & 0x3) == 0x3;
14837 bool IsHighZero = (Zeroable & 0xc) == 0xc;
14838
14839 // Try to use an insert into a zero vector.
14840 if (WidenedMask[0] == 0 && IsHighZero) {
14841 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14842 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
14843 DAG.getIntPtrConstant(0, DL));
14844 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
14845 getZeroVector(VT, Subtarget, DAG, DL), LoV,
14846 DAG.getIntPtrConstant(0, DL));
14847 }
14848
14849 // TODO: If minimizing size and one of the inputs is a zero vector and the
14850 // zero vector has only one use, we could use a VPERM2X128 to save the
14851 // instruction bytes needed to explicitly generate the zero vector.
14852
14853 // Blends are faster and handle all the non-lane-crossing cases.
14854 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
14855 Subtarget, DAG))
14856 return Blend;
14857
14858 // If either input operand is a zero vector, use VPERM2X128 because its mask
14859 // allows us to replace the zero input with an implicit zero.
14860 if (!IsLowZero && !IsHighZero) {
14861 // Check for patterns which can be matched with a single insert of a 128-bit
14862 // subvector.
14863 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
14864 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
14865
14866 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
14867 // this will likely become vinsertf128 which can't fold a 256-bit memop.
14868 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
14869 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14870 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
14871 OnlyUsesV1 ? V1 : V2,
14872 DAG.getIntPtrConstant(0, DL));
14873 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
14874 DAG.getIntPtrConstant(2, DL));
14875 }
14876 }
14877
14878 // Try to use SHUF128 if possible.
14879 if (Subtarget.hasVLX()) {
14880 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
14881 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
14882 ((WidenedMask[1] % 2) << 1);
14883 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
14884 DAG.getTargetConstant(PermMask, DL, MVT::i8));
14885 }
14886 }
14887 }
14888
14889 // Otherwise form a 128-bit permutation. After accounting for undefs,
14890 // convert the 64-bit shuffle mask selection values into 128-bit
14891 // selection bits by dividing the indexes by 2 and shifting into positions
14892 // defined by a vperm2*128 instruction's immediate control byte.
14893
14894 // The immediate permute control byte looks like this:
14895 // [1:0] - select 128 bits from sources for low half of destination
14896 // [2] - ignore
14897 // [3] - zero low half of destination
14898 // [5:4] - select 128 bits from sources for high half of destination
14899 // [6] - ignore
14900 // [7] - zero high half of destination
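//
// Editorial worked example (not from the original source): for a v4i64
// shuffle with Mask = <2,3,6,7>, WidenedMask is <1,3>, so PermMask becomes
// (1 << 0) | (3 << 4) = 0x31, selecting the high 128 bits of V1 for the low
// destination half and the high 128 bits of V2 for the high half.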
14901
14902 assert((WidenedMask[0] >= 0 || IsLowZero) &&
14903 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
14904
14905 unsigned PermMask = 0;
14906 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
14907 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
14908
14909 // Check the immediate mask and replace unused sources with undef.
14910 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
14911 V1 = DAG.getUNDEF(VT);
14912 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
14913 V2 = DAG.getUNDEF(VT);
14914
14915 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
14916 DAG.getTargetConstant(PermMask, DL, MVT::i8));
14917}
14918
14919/// Lower a vector shuffle by first fixing the 128-bit lanes and then
14920/// shuffling each lane.
14921///
14922/// This attempts to create a repeated lane shuffle where each lane uses one
14923/// or two of the lanes of the inputs. The lanes of the input vectors are
14924/// shuffled in one or two independent shuffles to get the lanes into the
14925/// position needed by the final shuffle.
14926static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
14927 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14928 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14929 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
14930
14931 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
14932 return SDValue();
14933
14934 int NumElts = Mask.size();
14935 int NumLanes = VT.getSizeInBits() / 128;
14936 int NumLaneElts = 128 / VT.getScalarSizeInBits();
14937 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
14938 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
14939
14940 // First pass will try to fill in the RepeatMask from lanes that need two
14941 // sources.
14942 for (int Lane = 0; Lane != NumLanes; ++Lane) {
14943 int Srcs[2] = {-1, -1};
14944 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
14945 for (int i = 0; i != NumLaneElts; ++i) {
14946 int M = Mask[(Lane * NumLaneElts) + i];
14947 if (M < 0)
14948 continue;
14949 // Determine which of the possible input lanes (NumLanes from each source)
14950 // this element comes from. Assign that as one of the sources for this
14951 // lane. We can assign up to 2 sources for this lane. If we run out
14952 // of sources we can't do anything.
14953 int LaneSrc = M / NumLaneElts;
14954 int Src;
14955 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
14956 Src = 0;
14957 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
14958 Src = 1;
14959 else
14960 return SDValue();
14961
14962 Srcs[Src] = LaneSrc;
14963 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
14964 }
14965
14966 // If this lane has two sources, see if it fits with the repeat mask so far.
14967 if (Srcs[1] < 0)
14968 continue;
14969
14970 LaneSrcs[Lane][0] = Srcs[0];
14971 LaneSrcs[Lane][1] = Srcs[1];
14972
14973 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
14974 assert(M1.size() == M2.size() && "Unexpected mask size");
14975 for (int i = 0, e = M1.size(); i != e; ++i)
14976 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
14977 return false;
14978 return true;
14979 };
14980
14981 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
14982 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
14983 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
14984 int M = Mask[i];
14985 if (M < 0)
14986 continue;
14987 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
14988 "Unexpected mask element");
14989 MergedMask[i] = M;
14990 }
14991 };
14992
14993 if (MatchMasks(InLaneMask, RepeatMask)) {
14994 // Merge this lane mask into the final repeat mask.
14995 MergeMasks(InLaneMask, RepeatMask);
14996 continue;
14997 }
14998
14999 // Didn't find a match. Swap the operands and try again.
15000 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15001 ShuffleVectorSDNode::commuteMask(InLaneMask);
15002
15003 if (MatchMasks(InLaneMask, RepeatMask)) {
15004 // Merge this lane mask into the final repeat mask.
15005 MergeMasks(InLaneMask, RepeatMask);
15006 continue;
15007 }
15008
15009 // Couldn't find a match with the operands in either order.
15010 return SDValue();
15011 }
15012
15013 // Now handle any lanes with only one source.
15014 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15015 // If this lane has already been processed, skip it.
15016 if (LaneSrcs[Lane][0] >= 0)
15017 continue;
15018
15019 for (int i = 0; i != NumLaneElts; ++i) {
15020 int M = Mask[(Lane * NumLaneElts) + i];
15021 if (M < 0)
15022 continue;
15023
15024 // If RepeatMask isn't defined yet we can define it ourselves.
15025 if (RepeatMask[i] < 0)
15026 RepeatMask[i] = M % NumLaneElts;
15027
15028 if (RepeatMask[i] < NumElts) {
15029 if (RepeatMask[i] != M % NumLaneElts)
15030 return SDValue();
15031 LaneSrcs[Lane][0] = M / NumLaneElts;
15032 } else {
15033 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15034 return SDValue();
15035 LaneSrcs[Lane][1] = M / NumLaneElts;
15036 }
15037 }
15038
15039 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15040 return SDValue();
15041 }
15042
15043 SmallVector<int, 16> NewMask(NumElts, -1);
15044 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15045 int Src = LaneSrcs[Lane][0];
15046 for (int i = 0; i != NumLaneElts; ++i) {
15047 int M = -1;
15048 if (Src >= 0)
15049 M = Src * NumLaneElts + i;
15050 NewMask[Lane * NumLaneElts + i] = M;
15051 }
15052 }
15053 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15054 // Ensure we didn't get back the shuffle we started with.
15055 // FIXME: This is a hack to make up for some splat handling code in
15056 // getVectorShuffle.
15057 if (isa<ShuffleVectorSDNode>(NewV1) &&
15058 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15059 return SDValue();
15060
15061 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15062 int Src = LaneSrcs[Lane][1];
15063 for (int i = 0; i != NumLaneElts; ++i) {
15064 int M = -1;
15065 if (Src >= 0)
15066 M = Src * NumLaneElts + i;
15067 NewMask[Lane * NumLaneElts + i] = M;
15068 }
15069 }
15070 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15071 // Ensure we didn't get back the shuffle we started with.
15072 // FIXME: This is a hack to make up for some splat handling code in
15073 // getVectorShuffle.
15074 if (isa<ShuffleVectorSDNode>(NewV2) &&
15075 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15076 return SDValue();
15077
15078 for (int i = 0; i != NumElts; ++i) {
15079 NewMask[i] = RepeatMask[i % NumLaneElts];
15080 if (NewMask[i] < 0)
15081 continue;
15082
15083 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15084 }
15085 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15086}
15087
15088/// If the input shuffle mask results in a vector that is undefined in all upper
15089/// or lower half elements and that mask accesses only 2 halves of the
15090/// shuffle's operands, return true. A mask of half the width with mask indexes
15091/// adjusted to access the extracted halves of the original shuffle operands is
15092/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15093/// lower half of each input operand is accessed.
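///
/// Editorial worked example (not from the original source): for the v8
/// mask <0,1,8,9,u,u,u,u>, the upper half of the result is undef, so this
/// returns true with HalfMask = <0,1,4,5>, HalfIdx1 = 0 (lower half of V1)
/// and HalfIdx2 = 2 (lower half of V2).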
15094static bool
15095getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15096 int &HalfIdx1, int &HalfIdx2) {
15097 assert((Mask.size() == HalfMask.size() * 2) &&
15098 "Expected input mask to be twice as long as output");
15099
15100 // Exactly one half of the result must be undef to allow narrowing.
15101 bool UndefLower = isUndefLowerHalf(Mask);
15102 bool UndefUpper = isUndefUpperHalf(Mask);
15103 if (UndefLower == UndefUpper)
15104 return false;
15105
15106 unsigned HalfNumElts = HalfMask.size();
15107 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15108 HalfIdx1 = -1;
15109 HalfIdx2 = -1;
15110 for (unsigned i = 0; i != HalfNumElts; ++i) {
15111 int M = Mask[i + MaskIndexOffset];
15112 if (M < 0) {
15113 HalfMask[i] = M;
15114 continue;
15115 }
15116
15117 // Determine which of the 4 half vectors this element is from.
15118 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15119 int HalfIdx = M / HalfNumElts;
15120
15121 // Determine the element index into its half vector source.
15122 int HalfElt = M % HalfNumElts;
15123
15124 // We can shuffle with up to 2 half vectors, set the new 'half'
15125 // shuffle mask accordingly.
15126 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15127 HalfMask[i] = HalfElt;
15128 HalfIdx1 = HalfIdx;
15129 continue;
15130 }
15131 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15132 HalfMask[i] = HalfElt + HalfNumElts;
15133 HalfIdx2 = HalfIdx;
15134 continue;
15135 }
15136
15137 // Too many half vectors referenced.
15138 return false;
15139 }
15140
15141 return true;
15142}
15143
15144/// Given the output values from getHalfShuffleMask(), create a half width
15145/// shuffle of extracted vectors followed by an insert back to full width.
15146static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15147 ArrayRef<int> HalfMask, int HalfIdx1,
15148 int HalfIdx2, bool UndefLower,
15149 SelectionDAG &DAG, bool UseConcat = false) {
15150 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15151 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15152
15153 MVT VT = V1.getSimpleValueType();
15154 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15155 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15156
15157 auto getHalfVector = [&](int HalfIdx) {
15158 if (HalfIdx < 0)
15159 return DAG.getUNDEF(HalfVT);
15160 SDValue V = (HalfIdx < 2 ? V1 : V2);
15161 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15162 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15163 DAG.getIntPtrConstant(HalfIdx, DL));
15164 };
15165
15166 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15167 SDValue Half1 = getHalfVector(HalfIdx1);
15168 SDValue Half2 = getHalfVector(HalfIdx2);
15169 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15170 if (UseConcat) {
15171 SDValue Op0 = V;
15172 SDValue Op1 = DAG.getUNDEF(HalfVT);
15173 if (UndefLower)
15174 std::swap(Op0, Op1);
15175 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15176 }
15177
15178 unsigned Offset = UndefLower ? HalfNumElts : 0;
15179 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15180 DAG.getIntPtrConstant(Offset, DL));
15181}
15182
15183/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15184/// This allows for fast cases such as subvector extraction/insertion
15185/// or shuffling smaller vector types which can lower more efficiently.
15186static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15187 SDValue V2, ArrayRef<int> Mask,
15188 const X86Subtarget &Subtarget,
15189 SelectionDAG &DAG) {
15190 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15191 "Expected 256-bit or 512-bit vector");
15192
15193 bool UndefLower = isUndefLowerHalf(Mask);
15194 if (!UndefLower && !isUndefUpperHalf(Mask))
15195 return SDValue();
15196
15197 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15198 "Completely undef shuffle mask should have been simplified already");
15199
15200 // Upper half is undef and lower half is whole upper subvector.
15201 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15202 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15203 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15204 if (!UndefLower &&
15205 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15206 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15207 DAG.getIntPtrConstant(HalfNumElts, DL));
15208 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15209 DAG.getIntPtrConstant(0, DL));
15210 }
15211
15212 // Lower half is undef and upper half is whole lower subvector.
15213 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15214 if (UndefLower &&
15215 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15216 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15217 DAG.getIntPtrConstant(0, DL));
15218 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15219 DAG.getIntPtrConstant(HalfNumElts, DL));
15220 }
15221
15222 int HalfIdx1, HalfIdx2;
15223 SmallVector<int, 8> HalfMask(HalfNumElts);
15224 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15225 return SDValue();
15226
15227 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15228
15229 // Only shuffle the halves of the inputs when useful.
15230 unsigned NumLowerHalves =
15231 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15232 unsigned NumUpperHalves =
15233 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15234 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15235
15236 // Determine the larger pattern of undef/halves, then decide if it's worth
15237 // splitting the shuffle based on subtarget capabilities and types.
15238 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15239 if (!UndefLower) {
15240 // XXXXuuuu: no insert is needed.
15241 // Always extract lowers when setting lower - these are all free subreg ops.
15242 if (NumUpperHalves == 0)
15243 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15244 UndefLower, DAG);
15245
15246 if (NumUpperHalves == 1) {
15247 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15248 if (Subtarget.hasAVX2()) {
15249 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
15250 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15251 !is128BitUnpackShuffleMask(HalfMask) &&
15252 (!isSingleSHUFPSMask(HalfMask) ||
15253 Subtarget.hasFastVariableShuffle()))
15254 return SDValue();
15255 // If this is a unary shuffle (assume that the 2nd operand is
15256 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15257 // are better off extracting the upper half of 1 operand and using a
15258 // narrow shuffle.
15259 if (EltWidth == 64 && V2.isUndef())
15260 return SDValue();
15261 }
15262 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15263 if (Subtarget.hasAVX512() && VT.is512BitVector())
15264 return SDValue();
15265 // Extract + narrow shuffle is better than the wide alternative.
15266 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15267 UndefLower, DAG);
15268 }
15269
15270 // Don't extract both uppers, instead shuffle and then extract.
15271 assert(NumUpperHalves == 2 && "Half vector count went wrong");
15272 return SDValue();
15273 }
15274
15275 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
15276 if (NumUpperHalves == 0) {
15277 // AVX2 has efficient 64-bit element cross-lane shuffles.
15278 // TODO: Refine to account for unary shuffle, splat, and other masks?
15279 if (Subtarget.hasAVX2() && EltWidth == 64)
15280 return SDValue();
15281 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15282 if (Subtarget.hasAVX512() && VT.is512BitVector())
15283 return SDValue();
15284 // Narrow shuffle + insert is better than the wide alternative.
15285 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15286 UndefLower, DAG);
15287 }
15288
15289 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
15290 return SDValue();
15291}
15292
15293/// Test whether the specified input (0 or 1) is in-place blended by the
15294/// given mask.
15295///
15296/// This returns true if the elements from a particular input are already in the
15297/// slot required by the given mask and require no permutation.
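/// For example, Mask = {0, 5, 2, 7} leaves both inputs in place: input 0
/// already supplies elements 0 and 2, and input 1 already supplies elements
/// 1 and 3, each in its destination slot.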
15298static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
15299 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
15300 int Size = Mask.size();
15301 for (int i = 0; i < Size; ++i)
15302 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
15303 return false;
15304
15305 return true;
15306}
15307
15308/// Handle case where shuffle sources are coming from the same 128-bit lane and
15309/// every lane can be represented as the same repeating mask - allowing us to
15310/// shuffle the sources with the repeating shuffle and then permute the result
15311/// to the destination lanes.
15312static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
15313 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15314 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15315 int NumElts = VT.getVectorNumElements();
15316 int NumLanes = VT.getSizeInBits() / 128;
15317 int NumLaneElts = NumElts / NumLanes;
15318
15319 // On AVX2 we may be able to just shuffle the lowest elements and then
15320 // broadcast the result.
15321 if (Subtarget.hasAVX2()) {
15322 for (unsigned BroadcastSize : {16, 32, 64}) {
15323 if (BroadcastSize <= VT.getScalarSizeInBits())
15324 continue;
15325 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
15326
15327 // Attempt to match a repeating pattern every NumBroadcastElts,
15328 // accounting for UNDEFs, but only referencing the lowest 128-bit
15329 // lane of the inputs.
15330 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
15331 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15332 for (int j = 0; j != NumBroadcastElts; ++j) {
15333 int M = Mask[i + j];
15334 if (M < 0)
15335 continue;
15336 int &R = RepeatMask[j];
15337 if (0 != ((M % NumElts) / NumLaneElts))
15338 return false;
15339 if (0 <= R && R != M)
15340 return false;
15341 R = M;
15342 }
15343 return true;
15344 };
15345
15346 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
15347 if (!FindRepeatingBroadcastMask(RepeatMask))
15348 continue;
15349
15350 // Shuffle the (lowest) repeated elements in place for broadcast.
15351 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
15352
15353 // Shuffle the actual broadcast.
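// Build a mask that repeats the lowest NumBroadcastElts elements across the
// whole vector, so the result can subsequently be matched as a broadcast.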
15354 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
15355 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15356 for (int j = 0; j != NumBroadcastElts; ++j)
15357 BroadcastMask[i + j] = j;
15358 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
15359 BroadcastMask);
15360 }
15361 }
15362
15363 // Bail if the shuffle mask doesn't cross 128-bit lanes.
15364 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
15365 return SDValue();
15366
15367 // Bail if we already have a repeated lane shuffle mask.
15368 SmallVector<int, 8> RepeatedShuffleMask;
15369 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
15370 return SDValue();
15371
15372 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
15373 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
15374 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
15375 int NumSubLanes = NumLanes * SubLaneScale;
15376 int NumSubLaneElts = NumLaneElts / SubLaneScale;
15377
15378 // Check that all the sources are coming from the same lane and see if we can
15379 // form a repeating shuffle mask (local to each sub-lane). At the same time,
15380 // determine the source sub-lane for each destination sub-lane.
15381 int TopSrcSubLane = -1;
15382 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
15383 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
15384 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
15385 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
15386
15387 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
15388 // Extract the sub-lane mask, check that it all comes from the same lane
15389 // and normalize the mask entries to come from the first lane.
15390 int SrcLane = -1;
15391 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
15392 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15393 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
15394 if (M < 0)
15395 continue;
15396 int Lane = (M % NumElts) / NumLaneElts;
15397 if ((0 <= SrcLane) && (SrcLane != Lane))
15398 return SDValue();
15399 SrcLane = Lane;
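// Normalize the index into the first lane while keeping the operand bit:
// values >= NumElts still denote elements taken from V2.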
15400 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
15401 SubLaneMask[Elt] = LocalM;
15402 }
15403
15404 // Whole sub-lane is UNDEF.
15405 if (SrcLane < 0)
15406 continue;
15407
15408 // Attempt to match against the candidate repeated sub-lane masks.
15409 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
15410 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
15411 for (int i = 0; i != NumSubLaneElts; ++i) {
15412 if (M1[i] < 0 || M2[i] < 0)
15413 continue;
15414 if (M1[i] != M2[i])
15415 return false;
15416 }
15417 return true;
15418 };
15419
15420 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
15421 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
15422 continue;
15423
15424 // Merge the sub-lane mask into the matching repeated sub-lane mask.
15425 for (int i = 0; i != NumSubLaneElts; ++i) {
15426 int M = SubLaneMask[i];
15427 if (M < 0)
15428 continue;
15429 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
15430 "Unexpected mask element");
15431 RepeatedSubLaneMask[i] = M;
15432 }
15433
15434 // Track the top most source sub-lane - by setting the remaining to UNDEF
15435 // we can greatly simplify shuffle matching.
15436 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
15437 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
15438 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
15439 break;
15440 }
15441
15442 // Bail if we failed to find a matching repeated sub-lane mask.
15443 if (Dst2SrcSubLanes[DstSubLane] < 0)
15444 return SDValue();
15445 }
15446 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
15447 "Unexpected source lane");
15448
15449 // Create a repeating shuffle mask for the entire vector.
15450 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
15451 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
15452 int Lane = SubLane / SubLaneScale;
15453 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
15454 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15455 int M = RepeatedSubLaneMask[Elt];
15456 if (M < 0)
15457 continue;
15458 int Idx = (SubLane * NumSubLaneElts) + Elt;
15459 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
15460 }
15461 }
15462 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
15463
15464 // Shuffle each source sub-lane to its destination.
15465 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
15466 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
15467 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
15468 if (SrcSubLane < 0)
15469 continue;
15470 for (int j = 0; j != NumSubLaneElts; ++j)
15471 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
15472 }
15473
15474 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
15475 SubLaneMask);
15476}
15477
15478static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
15479 bool &ForceV1Zero, bool &ForceV2Zero,
15480 unsigned &ShuffleImm, ArrayRef<int> Mask,
15481 const APInt &Zeroable) {
15482 int NumElts = VT.getVectorNumElements();
15483 assert(VT.getScalarSizeInBits() == 64 &&
15484 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
15485 "Unexpected data type for VSHUFPD");
15486 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
15487 "Illegal shuffle mask");
15488
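// ZeroLane[0]/ZeroLane[1] record whether every even/odd result element is
// zeroable, in which case the corresponding source operand can be forced to
// an all-zeros vector below.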
15489 bool ZeroLane[2] = { true, true };
15490 for (int i = 0; i < NumElts; ++i)
15491 ZeroLane[i & 1] &= Zeroable[i];
15492
15493 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
15494 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
15495 ShuffleImm = 0;
15496 bool ShufpdMask = true;
15497 bool CommutableMask = true;
15498 for (int i = 0; i < NumElts; ++i) {
15499 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
15500 continue;
15501 if (Mask[i] < 0)
15502 return false;
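// Val is the base of the allowed source pair when even result elements come
// from V1 and odd ones from V2 (the native SHUFPD operand order); CommutVal
// covers the operand-swapped case.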
15503 int Val = (i & 6) + NumElts * (i & 1);
15504 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
15505 if (Mask[i] < Val || Mask[i] > Val + 1)
15506 ShufpdMask = false;
15507 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
15508 CommutableMask = false;
15509 ShuffleImm |= (Mask[i] % 2) << i;
15510 }
15511
15512 if (!ShufpdMask && !CommutableMask)
15513 return false;
15514
15515 if (!ShufpdMask && CommutableMask)
15516 std::swap(V1, V2);
15517
15518 ForceV1Zero = ZeroLane[0];
15519 ForceV2Zero = ZeroLane[1];
15520 return true;
15521}
15522
15523static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
15524 SDValue V2, ArrayRef<int> Mask,
15525 const APInt &Zeroable,
15526 const X86Subtarget &Subtarget,
15527 SelectionDAG &DAG) {
15528 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
15529 "Unexpected data type for VSHUFPD");
15530
15531 unsigned Immediate = 0;
15532 bool ForceV1Zero = false, ForceV2Zero = false;
15533 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
15534 Mask, Zeroable))
15535 return SDValue();
15536
15537 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
15538 if (ForceV1Zero)
15539 V1 = getZeroVector(VT, Subtarget, DAG, DL);
15540 if (ForceV2Zero)
15541 V2 = getZeroVector(VT, Subtarget, DAG, DL);
15542
15543 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15544 DAG.getTargetConstant(Immediate, DL, MVT::i8));
15545}
15546
15547 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
15548 // by zeroable elements in the remaining 24 elements. Turn this into two
15549 // vmovqb instructions shuffled together.
15550static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
15551 SDValue V1, SDValue V2,
15552 ArrayRef<int> Mask,
15553 const APInt &Zeroable,
15554 SelectionDAG &DAG) {
15555 assert(VT == MVT::v32i8 && "Unexpected type!");
15556
15557 // The first 8 indices should be every 8th element.
15558 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
15559 return SDValue();
15560
15561 // Remaining elements need to be zeroable.
15562 if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
15563 return SDValue();
15564
15565 V1 = DAG.getBitcast(MVT::v4i64, V1);
15566 V2 = DAG.getBitcast(MVT::v4i64, V2);
15567
15568 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
15569 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
15570
15571 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
15572 // the upper bits of the result using an unpckldq.
15573 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
15574 { 0, 1, 2, 3, 16, 17, 18, 19,
15575 4, 5, 6, 7, 20, 21, 22, 23 });
15576 // Insert the unpckldq into a zero vector to widen to v32i8.
15577 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
15578 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
15579 DAG.getIntPtrConstant(0, DL));
15580}
15581
15582
15583/// Handle lowering of 4-lane 64-bit floating point shuffles.
15584///
15585/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
15586/// isn't available.
15587static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15588 const APInt &Zeroable, SDValue V1, SDValue V2,
15589 const X86Subtarget &Subtarget,
15590 SelectionDAG &DAG) {
15591 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
15592 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
15593 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15594
15595 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
15596 Subtarget, DAG))
15597 return V;
15598
15599 if (V2.isUndef()) {
15600 // Check for being able to broadcast a single element.
15601 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
15602 Mask, Subtarget, DAG))
15603 return Broadcast;
15604
15605 // Use low duplicate instructions for masks that match their pattern.
15606 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
15607 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
15608
15609 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
15610 // Non-half-crossing single input shuffles can be lowered with an
15611 // interleaved permutation.
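// VPERMILPD uses one immediate bit per element: bit i selects the high (1)
// or low (0) double within element i's 128-bit lane.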
15612 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
15613 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
15614 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
15615 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
15616 }
15617
15618 // With AVX2 we have direct support for this permutation.
15619 if (Subtarget.hasAVX2())
15620 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
15621 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15622
15623 // Try to create an in-lane repeating shuffle mask and then shuffle the
15624 // results into the target lanes.
15625 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15626 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15627 return V;
15628
15629 // Try to permute the lanes and then use a per-lane permute.
15630 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
15631 Mask, DAG, Subtarget))
15632 return V;
15633
15634 // Otherwise, fall back.
15635 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
15636 DAG, Subtarget);
15637 }
15638
15639 // Use dedicated unpack instructions for masks that match their pattern.
15640 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
15641 return V;
15642
15643 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
15644 Zeroable, Subtarget, DAG))
15645 return Blend;
15646
15647 // Check if the blend happens to exactly fit that of SHUFPD.
15648 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
15649 Zeroable, Subtarget, DAG))
15650 return Op;
15651
15652 // If we have one input in place, then we can permute the other input and
15653 // blend the result.
15654 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
15655 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
15656 Subtarget, DAG);
15657
15658 // Try to create an in-lane repeating shuffle mask and then shuffle the
15659 // results into the target lanes.
15660 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15661 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15662 return V;
15663
15664 // Try to simplify this by merging 128-bit lanes to enable a lane-based
15665 // shuffle. However, if we have AVX2 and either input is already in place,
15666 // we will be able to shuffle the other input even across lanes in a single
15667 // instruction, so skip this pattern.
15668 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
15669 isShuffleMaskInputInPlace(1, Mask))))
15670 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
15671 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15672 return V;
15673
15674 // If we have VLX support, we can use VEXPAND.
15675 if (Subtarget.hasVLX())
15676 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
15677 DAG, Subtarget))
15678 return V;
15679
15680 // If we have AVX2 then we always want to lower with a blend because at v4 we
15681 // can fully permute the elements.
15682 if (Subtarget.hasAVX2())
15683 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
15684 Subtarget, DAG);
15685
15686 // Otherwise fall back on generic lowering.
15687 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
15688 Subtarget, DAG);
15689}
15690
15691/// Handle lowering of 4-lane 64-bit integer shuffles.
15692///
15693/// This routine is only called when we have AVX2 and thus a reasonable
15694 /// instruction set for v4i64 shuffling.
15695static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15696 const APInt &Zeroable, SDValue V1, SDValue V2,
15697 const X86Subtarget &Subtarget,
15698 SelectionDAG &DAG) {
15699 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
15700 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
15701 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15702 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
15703
15704 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
15705 Subtarget, DAG))
15706 return V;
15707
15708 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
15709 Zeroable, Subtarget, DAG))
15710 return Blend;
15711
15712 // Check for being able to broadcast a single element.
15713 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
15714 Subtarget, DAG))
15715 return Broadcast;
15716
15717 if (V2.isUndef()) {
15718 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
15719 // can use lower latency instructions that will operate on both lanes.
15720 SmallVector<int, 2> RepeatedMask;
15721 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
15722 SmallVector<int, 4> PSHUFDMask;
15723 scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
15724 return DAG.getBitcast(
15725 MVT::v4i64,
15726 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
15727 DAG.getBitcast(MVT::v8i32, V1),
15728 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15729 }
15730
15731 // AVX2 provides a direct instruction for permuting a single input across
15732 // lanes.
15733 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
15734 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15735 }
15736
15737 // Try to use shift instructions.
15738 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
15739 Zeroable, Subtarget, DAG))
15740 return Shift;
15741
15742 // If we have VLX support, we can use VALIGN or VEXPAND.
15743 if (Subtarget.hasVLX()) {
15744 if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask,
15745 Subtarget, DAG))
15746 return Rotate;
15747
15748 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
15749 DAG, Subtarget))
15750 return V;
15751 }
15752
15753 // Try to use PALIGNR.
15754 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
15755 Subtarget, DAG))
15756 return Rotate;
15757
15758 // Use dedicated unpack instructions for masks that match their pattern.
15759 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
15760 return V;
15761
15762 // If we have one input in place, then we can permute the other input and
15763 // blend the result.
15764 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
15765 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
15766 Subtarget, DAG);
15767
15768 // Try to create an in-lane repeating shuffle mask and then shuffle the
15769 // results into the target lanes.
15770 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15771 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
15772 return V;
15773
15774 // Try to simplify this by merging 128-bit lanes to enable a lane-based
15775 // shuffle. However, if we have AVX2 and either input is already in place,
15776 // we will be able to shuffle the other input even across lanes in a single
15777 // instruction, so skip this pattern.
15778 if (!isShuffleMaskInputInPlace(0, Mask) &&
15779 !isShuffleMaskInputInPlace(1, Mask))
15780 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
15781 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
15782 return Result;
15783
15784 // Otherwise fall back on generic blend lowering.
15785 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
15786 Subtarget, DAG);
15787}
15788
15789/// Handle lowering of 8-lane 32-bit floating point shuffles.
15790///
15791/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
15792/// isn't available.
15793static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15794 const APInt &Zeroable, SDValue V1, SDValue V2,
15795 const X86Subtarget &Subtarget,
15796 SelectionDAG &DAG) {
15797 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
15798 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
15799 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15800
15801 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
15802 Zeroable, Subtarget, DAG))
15803 return Blend;
15804
15805 // Check for being able to broadcast a single element.
15806 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
15807 Subtarget, DAG))
15808 return Broadcast;
15809
15810 // If the shuffle mask is repeated in each 128-bit lane, we have many more
15811 // options to efficiently lower the shuffle.
15812 SmallVector<int, 4> RepeatedMask;
15813 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
15814 assert(RepeatedMask.size() == 4 &&
15815 "Repeated masks must be half the mask width!");
15816
15817 // Use even/odd duplicate instructions for masks that match their pattern.
15818 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
15819 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
15820 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
15821 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
15822
15823 if (V2.isUndef())
15824 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
15825 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
15826
15827 // Use dedicated unpack instructions for masks that match their pattern.
15828 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
15829 return V;
15830
15831 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
15832 // have already handled any direct blends.
15833 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
15834 }
15835
15836 // Try to create an in-lane repeating shuffle mask and then shuffle the
15837 // results into the target lanes.
15838 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15839 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
15840 return V;
15841
15842 // If we have a single-input shuffle with different shuffle patterns in the
15843 // two 128-bit lanes, use a variable shuffle mask with VPERMILPS.
15844 if (V2.isUndef()) {
15845 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
15846 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
15847 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
15848
15849 if (Subtarget.hasAVX2())
15850 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
15851
15852 // Otherwise, fall back.
15853 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
15854 DAG, Subtarget);
15855 }
15856
15857 // Try to simplify this by merging 128-bit lanes to enable a lane-based
15858 // shuffle.
15859 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
15860 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
15861 return Result;
15862
15863 // If we have VLX support, we can use VEXPAND.
15864 if (Subtarget.hasVLX())
15865 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
15866 DAG, Subtarget))
15867 return V;
15868
15869 // For non-AVX512, if the mask is of 16-bit elements within each lane then try
15870 // to split, since after the split we get more efficient code using vpunpcklwd
15871 // and vpunpckhwd instructions instead of vblend.
15872 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
15873 if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
15874 Subtarget, DAG))
15875 return V;
15876
15877 // If we have AVX2 then we always want to lower with a blend because at v8 we
15878 // can fully permute the elements.
15879 if (Subtarget.hasAVX2())
15880 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask,
15881 Subtarget, DAG);
15882
15883 // Otherwise fall back on generic lowering.
15884 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
15885 Subtarget, DAG);
15886}
15887
15888/// Handle lowering of 8-lane 32-bit integer shuffles.
15889///
15890/// This routine is only called when we have AVX2 and thus a reasonable
15891/// instruction set for v8i32 shuffling..
15892static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15893 const APInt &Zeroable, SDValue V1, SDValue V2,
15894 const X86Subtarget &Subtarget,
15895 SelectionDAG &DAG) {
15896 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
15897 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
15898 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15899 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
15900
15901 // Whenever we can lower this as a zext, that instruction is strictly faster
15902 // than any alternative. It also allows us to fold memory operands into the
15903 // shuffle in many cases.
15904 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
15905 Zeroable, Subtarget, DAG))
15906 return ZExt;
15907
15908 // For non-AVX512, if the mask is of 16-bit elements within each lane then try
15909 // to split, since after the split we get more efficient code than vblend by
15910 // using vpunpcklwd and vpunpckhwd instructions.
15911 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
15912 !Subtarget.hasAVX512())
15913 if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask,
15914 Subtarget, DAG))
15915 return V;
15916
15917 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
15918 Zeroable, Subtarget, DAG))
15919 return Blend;
15920
15921 // Check for being able to broadcast a single element.
15922 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
15923 Subtarget, DAG))
15924 return Broadcast;
15925
15926 // If the shuffle mask is repeated in each 128-bit lane we can use more
15927 // efficient instructions that mirror the shuffles across the two 128-bit
15928 // lanes.
15929 SmallVector<int, 4> RepeatedMask;
15930 bool Is128BitLaneRepeatedShuffle =
15931 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
15932 if (Is128BitLaneRepeatedShuffle) {
15933 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
15934 if (V2.isUndef())
15935 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
15936 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
15937
15938 // Use dedicated unpack instructions for masks that match their pattern.
15939 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
15940 return V;
15941 }
15942
15943 // Try to use shift instructions.
15944 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
15945 Zeroable, Subtarget, DAG))
15946 return Shift;
15947
15948 // If we have VLX support, we can use VALIGN or EXPAND.
15949 if (Subtarget.hasVLX()) {
15950 if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask,
15951 Subtarget, DAG))
15952 return Rotate;
15953
15954 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
15955 DAG, Subtarget))
15956 return V;
15957 }
15958
15959 // Try to use byte rotation instructions.
15960 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
15961 Subtarget, DAG))
15962 return Rotate;
15963
15964 // Try to create an in-lane repeating shuffle mask and then shuffle the
15965 // results into the target lanes.
15966 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15967 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
15968 return V;
15969
15970 // If the shuffle patterns aren't repeated but it is a single input, directly
15971 // generate a cross-lane VPERMD instruction.
15972 if (V2.isUndef()) {
15973 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
15974 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
15975 }
15976
15977 // Assume that a single SHUFPS is faster than an alternative sequence of
15978 // multiple instructions (even if the CPU has a domain penalty).
15979 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
15980 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
15981 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
15982 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
15983 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
15984 CastV1, CastV2, DAG);
15985 return DAG.getBitcast(MVT::v8i32, ShufPS);
15986 }
15987
15988 // Try to simplify this by merging 128-bit lanes to enable a lane-based
15989 // shuffle.
15990 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
15991 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
15992 return Result;
15993
15994 // Otherwise fall back on generic blend lowering.
15995 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask,
15996 Subtarget, DAG);
15997}
15998
15999/// Handle lowering of 16-lane 16-bit integer shuffles.
16000///
16001/// This routine is only called when we have AVX2 and thus a reasonable
16002 /// instruction set for v16i16 shuffling.
16003static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16004 const APInt &Zeroable, SDValue V1, SDValue V2,
16005 const X86Subtarget &Subtarget,
16006 SelectionDAG &DAG) {
16007 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16008 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16009 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16010 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16011
16012 // Whenever we can lower this as a zext, that instruction is strictly faster
16013 // than any alternative. It also allows us to fold memory operands into the
16014 // shuffle in many cases.
16015 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16016 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16017 return ZExt;
16018
16019 // Check for being able to broadcast a single element.
16020 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16021 Subtarget, DAG))
16022 return Broadcast;
16023
16024 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16025 Zeroable, Subtarget, DAG))
16026 return Blend;
16027
16028 // Use dedicated unpack instructions for masks that match their pattern.
16029 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
16030 return V;
16031
16032 // Use dedicated pack instructions for masks that match their pattern.
16033 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
16034 Subtarget))
16035 return V;
16036
16037 // Try to use shift instructions.
16038 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
16039 Zeroable, Subtarget, DAG))
16040 return Shift;
16041
16042 // Try to use byte rotation instructions.
16043 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16044 Subtarget, DAG))
16045 return Rotate;
16046
16047 // Try to create an in-lane repeating shuffle mask and then shuffle the
16048 // results into the target lanes.
16049 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16050 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16051 return V;
16052
16053 if (V2.isUndef()) {
16054 // There are no generalized cross-lane shuffle operations available on i16
16055 // element types.
16056 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
16057 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16058 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16059 return V;
16060
16061 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
16062 DAG, Subtarget);
16063 }
16064
16065 SmallVector<int, 8> RepeatedMask;
16066 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
16067 // As this is a single-input shuffle, the repeated mask should be
16068 // a strictly valid v8i16 mask that we can pass through to the v8i16
16069 // lowering to handle even the v16 case.
16070 return lowerV8I16GeneralSingleInputShuffle(
16071 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
16072 }
16073 }
16074
16075 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
16076 Zeroable, Subtarget, DAG))
16077 return PSHUFB;
16078
16079 // AVX512BWVL can lower to VPERMW.
16080 if (Subtarget.hasBWI() && Subtarget.hasVLX())
16081 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
16082
16083 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16084 // shuffle.
16085 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16086 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16087 return Result;
16088
16089 // Try to permute the lanes and then use a per-lane permute.
16090 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16091 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16092 return V;
16093
16094 // Otherwise fall back on generic lowering.
16095 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
16096 Subtarget, DAG);
16097}
16098
16099/// Handle lowering of 32-lane 8-bit integer shuffles.
16100///
16101/// This routine is only called when we have AVX2 and thus a reasonable
16102 /// instruction set for v32i8 shuffling.
16103static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16104 const APInt &Zeroable, SDValue V1, SDValue V2,
16105 const X86Subtarget &Subtarget,
16106 SelectionDAG &DAG) {
16107 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16108 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16109 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16110 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
16111
16112 // Whenever we can lower this as a zext, that instruction is strictly faster
16113 // than any alternative. It also allows us to fold memory operands into the
16114 // shuffle in many cases.
16115 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
16116 Zeroable, Subtarget, DAG))
16117 return ZExt;
16118
16119 // Check for being able to broadcast a single element.
16120 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
16121 Subtarget, DAG))
16122 return Broadcast;
16123
16124 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
16125 Zeroable, Subtarget, DAG))
16126 return Blend;
16127
16128 // Use dedicated unpack instructions for masks that match their pattern.
16129 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
16130 return V;
16131
16132 // Use dedicated pack instructions for masks that match their pattern.
16133 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
16134 Subtarget))
16135 return V;
16136
16137 // Try to use shift instructions.
16138 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
16139 Zeroable, Subtarget, DAG))
16140 return Shift;
16141
16142 // Try to use byte rotation instructions.
16143 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
16144 Subtarget, DAG))
16145 return Rotate;
16146
16147 // Try to create an in-lane repeating shuffle mask and then shuffle the
16148 // results into the target lanes.
16149 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16150 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16151 return V;
16152
16153 // There are no generalized cross-lane shuffle operations available on i8
16154 // element types.
16155 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
16156 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16157 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16158 return V;
16159
16160 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
16161 DAG, Subtarget);
16162 }
16163
16164 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
16165 Zeroable, Subtarget, DAG))
16166 return PSHUFB;
16167
16168 // AVX512VBMIVL can lower to VPERMB.
16169 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
16170 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
16171
16172 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16173 // shuffle.
16174 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16175 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16176 return Result;
16177
16178 // Try to permute the lanes and then use a per-lane permute.
16179 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16180 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16181 return V;
16182
16183 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16184 // by zeroable elements in the remaining 24 elements. Turn this into two
16185 // vmovqb instructions shuffled together.
16186 if (Subtarget.hasVLX())
16187 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
16188 Mask, Zeroable, DAG))
16189 return V;
16190
16191 // Otherwise fall back on generic lowering.
16192 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
16193 Subtarget, DAG);
16194}
16195
16196/// High-level routine to lower various 256-bit x86 vector shuffles.
16197///
16198/// This routine either breaks down the specific type of a 256-bit x86 vector
16199/// shuffle or splits it into two 128-bit shuffles and fuses the results back
16200/// together based on the available instructions.
16201static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
16202 SDValue V1, SDValue V2, const APInt &Zeroable,
16203 const X86Subtarget &Subtarget,
16204 SelectionDAG &DAG) {
16205 // If we have a single input to the zero element, insert that into V1 if we
16206 // can do so cheaply.
16207 int NumElts = VT.getVectorNumElements();
16208 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16209
16210 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16211 if (SDValue Insertion = lowerShuffleAsElementInsertion(
16212 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16213 return Insertion;
16214
16215 // Handle special cases where the lower or upper half is UNDEF.
16216 if (SDValue V =
16217 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16218 return V;
16219
16220 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16221 // can check for those subtargets here and avoid much of the subtarget
16222 // querying in the per-vector-type lowering routines. With AVX1 we have
16223 // essentially *zero* ability to manipulate a 256-bit vector with integer
16224 // types. Since we'll use floating point types there eventually, just
16225 // immediately cast everything to a float and operate entirely in that domain.
16226 if (VT.isInteger() && !Subtarget.hasAVX2()) {
16227 int ElementBits = VT.getScalarSizeInBits();
16228 if (ElementBits < 32) {
16229 // No floating point type is available; if we can't use the bit operations
16230 // for masking/blending then decompose into 128-bit vectors.
16231 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
16232 Subtarget, DAG))
16233 return V;
16234 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
16235 return V;
16236 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16237 }
16238
16239 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
16240 VT.getVectorNumElements());
16241 V1 = DAG.getBitcast(FpVT, V1);
16242 V2 = DAG.getBitcast(FpVT, V2);
16243 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
16244 }
16245
16246 switch (VT.SimpleTy) {
16247 case MVT::v4f64:
16248 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16249 case MVT::v4i64:
16250 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16251 case MVT::v8f32:
16252 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16253 case MVT::v8i32:
16254 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16255 case MVT::v16i16:
16256 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16257 case MVT::v32i8:
16258 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16259
16260 default:
16261 llvm_unreachable("Not a valid 256-bit x86 vector type!");
16262 }
16263}
16264
16265 /// Try to lower a vector shuffle as a series of 128-bit shuffles.
16266static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
16267 const APInt &Zeroable, SDValue V1, SDValue V2,
16268 const X86Subtarget &Subtarget,
16269 SelectionDAG &DAG) {
16270 assert(VT.getScalarSizeInBits() == 64 &&
16271 "Unexpected element type size for 128bit shuffle.");
16272
16273 // Handling a 256-bit vector requires VLX, and the function
16274 // lowerV2X128VectorShuffle() is most probably a better solution for it.
16275 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
16276
16277 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
16278 SmallVector<int, 4> WidenedMask;
16279 if (!canWidenShuffleElements(Mask, WidenedMask))
16280 return SDValue();
16281
16282 // Try to use an insert into a zero vector.
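// This requires the upper 256 bits of the result to be zeroable; if the
// second 128-bit chunk is zeroable as well, only the low 128 bits of V1 are
// inserted, otherwise the low 256 bits are.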
16283 if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
16284 (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
16285 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
16286 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16287 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16288 DAG.getIntPtrConstant(0, DL));
16289 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16290 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16291 DAG.getIntPtrConstant(0, DL));
16292 }
16293
16294 // Check for patterns which can be matched with a single insert of a 256-bit
16295 // subvector.
16296 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
16297 {0, 1, 2, 3, 0, 1, 2, 3});
16298 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
16299 {0, 1, 2, 3, 8, 9, 10, 11})) {
16300 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
16301 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
16302 OnlyUsesV1 ? V1 : V2,
16303 DAG.getIntPtrConstant(0, DL));
16304 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16305 DAG.getIntPtrConstant(4, DL));
16306 }
16307
16308 assert(WidenedMask.size() == 4);
16309
16310 // See if this is an insertion of the lower 128-bits of V2 into V1.
16311 bool IsInsert = true;
16312 int V2Index = -1;
16313 for (int i = 0; i < 4; ++i) {
16314 assert(WidenedMask[i] >= -1);
16315 if (WidenedMask[i] < 0)
16316 continue;
16317
16318 // Make sure all V1 subvectors are in place.
16319 if (WidenedMask[i] < 4) {
16320 if (WidenedMask[i] != i) {
16321 IsInsert = false;
16322 break;
16323 }
16324 } else {
16325 // Make sure we only have a single V2 index and it's the lowest 128 bits.
16326 if (V2Index >= 0 || WidenedMask[i] != 4) {
16327 IsInsert = false;
16328 break;
16329 }
16330 V2Index = i;
16331 }
16332 }
16333 if (IsInsert && V2Index >= 0) {
16334 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16335 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
16336 DAG.getIntPtrConstant(0, DL));
16337 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
16338 }
16339
16340 // Try to lower to vshuf64x2/vshuf32x4.
16341 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
16342 unsigned PermMask = 0;
16343 // Ensure all elements came from the same Op.
16344 for (int i = 0; i < 4; ++i) {
16345 assert(WidenedMask[i] >= -1);
16346 if (WidenedMask[i] < 0)
16347 continue;
16348
16349 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
16350 unsigned OpIndex = i / 2;
16351 if (Ops[OpIndex].isUndef())
16352 Ops[OpIndex] = Op;
16353 else if (Ops[OpIndex] != Op)
16354 return SDValue();
16355
16356 // Convert the 128-bit shuffle mask selection values into 128-bit selection
16357 // bits defined by a vshuf64x2 instruction's immediate control byte.
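// As a worked example with a hypothetical WidenedMask = {0, 1, 4, 5} (low
// halves of V1 and V2), the 2-bit fields are 0, 1, 0, 1 and PermMask = 0x44.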
16358 PermMask |= (WidenedMask[i] % 4) << (i * 2);
16359 }
16360
16361 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
16362 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16363}
16364
16365/// Handle lowering of 8-lane 64-bit floating point shuffles.
16366static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16367 const APInt &Zeroable, SDValue V1, SDValue V2,
16368 const X86Subtarget &Subtarget,
16369 SelectionDAG &DAG) {
16370 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16371 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16372 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16373
16374 if (V2.isUndef()) {
16375 // Use low duplicate instructions for masks that match their pattern.
16376 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
16377 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
16378
16379 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
16380 // Non-half-crossing single input shuffles can be lowered with an
16381 // interleaved permutation.
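// As a worked example, a hypothetical Mask = {1, 0, 3, 2, 5, 4, 7, 6} sets
// bits 0, 2, 4 and 6, giving VPERMILPMask = 0x55.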
16382 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16383 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
16384 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
16385 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
16386 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
16387 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16388 }
16389
16390 SmallVector<int, 4> RepeatedMask;
16391 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
16392 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
16393 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16394 }
16395
16396 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
16397 V2, Subtarget, DAG))
16398 return Shuf128;
16399
16400 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
16401 return Unpck;
16402
16403 // Check if the blend happens to exactly fit that of SHUFPD.
16404 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
16405 Zeroable, Subtarget, DAG))
16406 return Op;
16407
16408 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
16409 DAG, Subtarget))
16410 return V;
16411
16412 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
16413 Zeroable, Subtarget, DAG))
16414 return Blend;
16415
16416 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
16417}
16418
16419/// Handle lowering of 16-lane 32-bit floating point shuffles.
16420static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16421 const APInt &Zeroable, SDValue V1, SDValue V2,
16422 const X86Subtarget &Subtarget,
16423 SelectionDAG &DAG) {
16424 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16425 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16426 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16427
16428 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16429 // options to efficiently lower the shuffle.
16430 SmallVector<int, 4> RepeatedMask;
16431 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
16432 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16433
16434 // Use even/odd duplicate instructions for masks that match their pattern.
16435 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
16436 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
16437 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
16438 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
16439
16440 if (V2.isUndef())
16441 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
16442 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16443
16444 // Use dedicated unpack instructions for masks that match their pattern.
16445 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
16446 return V;
16447
16448 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16449 Zeroable, Subtarget, DAG))
16450 return Blend;
16451
16452 // Otherwise, fall back to a SHUFPS sequence.
16453 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
16454 }
16455
16456 // If we have a single input shuffle with different shuffle patterns in the
16457 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
16458 if (V2.isUndef() &&
16459 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
16460 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
16461 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
16462 }
16463
16464 // If we have AVX512F support, we can use VEXPAND.
16465 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
16466 V1, V2, DAG, Subtarget))
16467 return V;
16468
16469 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
16470}
16471
16472/// Handle lowering of 8-lane 64-bit integer shuffles.
16473static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16474 const APInt &Zeroable, SDValue V1, SDValue V2,
16475 const X86Subtarget &Subtarget,
16476 SelectionDAG &DAG) {
16477 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16478 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16479 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16480
16481 if (V2.isUndef()) {
16482 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16483 // can use lower latency instructions that will operate on all four
16484 // 128-bit lanes.
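// As a worked example, a hypothetical Repeated128Mask = {1, 0} scales to
// PSHUFDMask = {2, 3, 0, 1}, i.e. the 64-bit element swap expressed as a
// v16i32 PSHUFD repeated in every 128-bit lane.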
16485 SmallVector<int, 2> Repeated128Mask;
16486 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
16487 SmallVector<int, 4> PSHUFDMask;
16488 scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
16489 return DAG.getBitcast(
16490 MVT::v8i64,
16491 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
16492 DAG.getBitcast(MVT::v16i32, V1),
16493 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16494 }
16495
16496 SmallVector<int, 4> Repeated256Mask;
16497 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
16498 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
16499 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
16500 }
16501
16502 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
16503 V2, Subtarget, DAG))
16504 return Shuf128;
16505
16506 // Try to use shift instructions.
16507 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
16508 Zeroable, Subtarget, DAG))
16509 return Shift;
16510
16511 // Try to use VALIGN.
16512 if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask,
16513 Subtarget, DAG))
16514 return Rotate;
16515
16516 // Try to use PALIGNR.
16517 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
16518 Subtarget, DAG))
16519 return Rotate;
16520
16521 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
16522 return Unpck;
16523 // If we have AVX512F support, we can use VEXPAND.
16524 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
16525 DAG, Subtarget))
16526 return V;
16527
16528 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
16529 Zeroable, Subtarget, DAG))
16530 return Blend;
16531
16532 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
16533}
16534
16535/// Handle lowering of 16-lane 32-bit integer shuffles.
16536static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16537 const APInt &Zeroable, SDValue V1, SDValue V2,
16538 const X86Subtarget &Subtarget,
16539 SelectionDAG &DAG) {
16540 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16541 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16542 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16543
16544 // Whenever we can lower this as a zext, that instruction is strictly faster
16545 // than any alternative. It also allows us to fold memory operands into the
16546 // shuffle in many cases.
16547 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16548 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
16549 return ZExt;
16550
16551 // If the shuffle mask is repeated in each 128-bit lane we can use more
16552 // efficient instructions that mirror the shuffles across the four 128-bit
16553 // lanes.
16554 SmallVector<int, 4> RepeatedMask;
16555 bool Is128BitLaneRepeatedShuffle =
16556 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
16557 if (Is128BitLaneRepeatedShuffle) {
16558 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16559 if (V2.isUndef())
16560 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
16561 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16562
16563 // Use dedicated unpack instructions for masks that match their pattern.
16564 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
16565 return V;
16566 }
16567
16568 // Try to use shift instructions.
16569 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
16570 Zeroable, Subtarget, DAG))
16571 return Shift;
16572
16573 // Try to use VALIGN.
16574 if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask,
16575 Subtarget, DAG))
16576 return Rotate;
16577
16578 // Try to use byte rotation instructions.
16579 if (Subtarget.hasBWI())
16580 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
16581 Subtarget, DAG))
16582 return Rotate;
16583
16584 // Assume that a single SHUFPS is faster than using a permv shuffle.
16585 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16586 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16587 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
16588 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
16589 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
16590 CastV1, CastV2, DAG);
16591 return DAG.getBitcast(MVT::v16i32, ShufPS);
16592 }
16593 // If we have AVX512F support, we can use VEXPAND.
16594 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
16595 DAG, Subtarget))
16596 return V;
16597
16598 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
16599 Zeroable, Subtarget, DAG))
16600 return Blend;
16601 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
16602}
16603
16604/// Handle lowering of 32-lane 16-bit integer shuffles.
16605static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16606 const APInt &Zeroable, SDValue V1, SDValue V2,
16607 const X86Subtarget &Subtarget,
16608 SelectionDAG &DAG) {
16609 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
16610 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
16611 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16612 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
16613
16614 // Whenever we can lower this as a zext, that instruction is strictly faster
16615 // than any alternative. It also allows us to fold memory operands into the
16616 // shuffle in many cases.
16617 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16618 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16619 return ZExt;
16620
16621 // Use dedicated unpack instructions for masks that match their pattern.
16622 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
16623 return V;
16624
16625 // Try to use shift instructions.
16626 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
16627 Zeroable, Subtarget, DAG))
16628 return Shift;
16629
16630 // Try to use byte rotation instructions.
16631 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
16632 Subtarget, DAG))
16633 return Rotate;
16634
16635 if (V2.isUndef()) {
16636 SmallVector<int, 8> RepeatedMask;
16637 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
16638 // As this is a single-input shuffle, the repeated mask should be
16639 // a strictly valid v8i16 mask that we can pass through to the v8i16
16640 // lowering to handle even the v32 case.
16641 return lowerV8I16GeneralSingleInputShuffle(
16642 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
16643 }
16644 }
16645
16646 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
16647 Zeroable, Subtarget, DAG))
16648 return Blend;
16649
16650 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
16651 Zeroable, Subtarget, DAG))
16652 return PSHUFB;
16653
16654 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
16655}
16656
16657/// Handle lowering of 64-lane 8-bit integer shuffles.
16658static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16659 const APInt &Zeroable, SDValue V1, SDValue V2,
16660 const X86Subtarget &Subtarget,
16661 SelectionDAG &DAG) {
16662 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
16663 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
16664 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
16665 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
16666
16667 // Whenever we can lower this as a zext, that instruction is strictly faster
16668 // than any alternative. It also allows us to fold memory operands into the
16669 // shuffle in many cases.
16670 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16671 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
16672 return ZExt;
16673
16674 // Use dedicated unpack instructions for masks that match their pattern.
16675 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
16676 return V;
16677
16678 // Use dedicated pack instructions for masks that match their pattern.
16679 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
16680 Subtarget))
16681 return V;
16682
16683 // Try to use shift instructions.
16684 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
16685 Zeroable, Subtarget, DAG))
16686 return Shift;
16687
16688 // Try to use byte rotation instructions.
16689 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
16690 Subtarget, DAG))
16691 return Rotate;
16692
16693 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
16694 Zeroable, Subtarget, DAG))
16695 return PSHUFB;
16696
16697 // VBMI can use VPERMV/VPERMV3 byte shuffles.
16698 if (Subtarget.hasVBMI())
16699 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
16700
16701 // Try to create an in-lane repeating shuffle mask and then shuffle the
16702 // results into the target lanes.
16703 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16704 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
16705 return V;
16706
16707 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
16708 Zeroable, Subtarget, DAG))
16709 return Blend;
16710
16711 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16712 // shuffle.
16713 if (!V2.isUndef())
16714 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16715 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
16716 return Result;
16717
16718 // FIXME: Implement direct support for this type!
16719 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
16720}
16721
16722/// High-level routine to lower various 512-bit x86 vector shuffles.
16723///
16724/// This routine either breaks down the specific type of a 512-bit x86 vector
16725/// shuffle or splits it into two 256-bit shuffles and fuses the results back
16726/// together based on the available instructions.
16727static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
16728 MVT VT, SDValue V1, SDValue V2,
16729 const APInt &Zeroable,
16730 const X86Subtarget &Subtarget,
16731 SelectionDAG &DAG) {
16732 assert(Subtarget.hasAVX512() &&
16733 "Cannot lower 512-bit vectors w/ basic ISA!");
16734
16735 // If we have a single input to the zero element, insert that into V1 if we
16736 // can do so cheaply.
16737 int NumElts = Mask.size();
16738 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16739
16740 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16741 if (SDValue Insertion = lowerShuffleAsElementInsertion(
16742 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16743 return Insertion;
16744
16745 // Handle special cases where the lower or upper half is UNDEF.
16746 if (SDValue V =
16747 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16748 return V;
16749
16750 // Check for being able to broadcast a single element.
16751 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
16752 Subtarget, DAG))
16753 return Broadcast;
16754
16755 // Dispatch to each element type for lowering. If we don't have support for
16756 // specific element type shuffles at 512 bits, immediately split them and
16757 // lower them. Each lowering routine of a given type is allowed to assume that
16758 // the requisite ISA extensions for that element type are available.
16759 switch (VT.SimpleTy) {
16760 case MVT::v8f64:
16761 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16762 case MVT::v16f32:
16763 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16764 case MVT::v8i64:
16765 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16766 case MVT::v16i32:
16767 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16768 case MVT::v32i16:
16769 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16770 case MVT::v64i8:
16771 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16772
16773 default:
16774 llvm_unreachable("Not a valid 512-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 512-bit x86 vector type!"
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16774)
;
16775 }
16776}
16777
16778static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
16779 MVT VT, SDValue V1, SDValue V2,
16780 const X86Subtarget &Subtarget,
16781 SelectionDAG &DAG) {
16782 // Shuffle should be unary.
16783 if (!V2.isUndef())
16784 return SDValue();
16785
16786 int ShiftAmt = -1;
16787 int NumElts = Mask.size();
16788 for (int i = 0; i != NumElts; ++i) {
16789 int M = Mask[i];
16790 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
16791 "Unexpected mask index.");
16792 if (M < 0)
16793 continue;
16794
16795 // The first non-undef element determines our shift amount.
16796 if (ShiftAmt < 0) {
16797 ShiftAmt = M - i;
16798 // Need to be shifting right.
16799 if (ShiftAmt <= 0)
16800 return SDValue();
16801 }
16802 // All non-undef elements must shift by the same amount.
16803 if (ShiftAmt != M - i)
16804 return SDValue();
16805 }
16806 assert(ShiftAmt >= 0 && "All undef?");
16807
16808 // Great we found a shift right.
16809 MVT WideVT = VT;
16810 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
16811 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
16812 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
16813 DAG.getUNDEF(WideVT), V1,
16814 DAG.getIntPtrConstant(0, DL));
16815 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
16816 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
16817 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
16818 DAG.getIntPtrConstant(0, DL));
16819}
16820
16821// Determine if this shuffle can be implemented with a KSHIFT instruction.
16822// Returns the shift amount if possible or -1 if not. This is a simplified
16823// version of matchShuffleAsShift.
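// For example, a hypothetical v8i1 Mask = {2, 3, 4, 5, 6, 7, -1, -1} with the
// top two elements zeroable matches X86ISD::KSHIFTR with a shift amount of 2.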
16824static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
16825 int MaskOffset, const APInt &Zeroable) {
16826 int Size = Mask.size();
16827
16828 auto CheckZeros = [&](int Shift, bool Left) {
16829 for (int j = 0; j < Shift; ++j)
16830 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
16831 return false;
16832
16833 return true;
16834 };
16835
16836 auto MatchShift = [&](int Shift, bool Left) {
16837 unsigned Pos = Left ? Shift : 0;
16838 unsigned Low = Left ? 0 : Shift;
16839 unsigned Len = Size - Shift;
16840 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
16841 };
16842
16843 for (int Shift = 1; Shift != Size; ++Shift)
16844 for (bool Left : {true, false})
16845 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
16846 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
16847 return Shift;
16848 }
16849
16850 return -1;
16851}
16852
16853
16854// Lower vXi1 vector shuffles.
16855 // There is no dedicated instruction on AVX-512 that shuffles the masks.
16856 // The only way to shuffle the bits is to sign-extend the mask vector to a
16857 // SIMD vector, shuffle it, and then truncate it back.
16858static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
16859 MVT VT, SDValue V1, SDValue V2,
16860 const APInt &Zeroable,
16861 const X86Subtarget &Subtarget,
16862 SelectionDAG &DAG) {
16863 assert(Subtarget.hasAVX512() &&
16864 "Cannot lower 512-bit vectors w/o basic ISA!");
16865
16866 int NumElts = Mask.size();
16867
16868 // Try to recognize shuffles that are just padding a subvector with zeros.
16869 int SubvecElts = 0;
16870 int Src = -1;
16871 for (int i = 0; i != NumElts; ++i) {
16872 if (Mask[i] >= 0) {
16873 // Grab the source from the first valid mask element. All subsequent
16874 // elements must use this same source.
16875 if (Src < 0)
16876 Src = Mask[i] / NumElts;
16877 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
16878 break;
16879 }
16880
16881 ++SubvecElts;
16882 }
16883 assert(SubvecElts != NumElts && "Identity shuffle?");
16884
16885 // Clip to a power of 2.
16886 SubvecElts = PowerOf2Floor(SubvecElts);
16887
16888 // Make sure the number of zeroable bits in the top at least covers the bits
16889 // not covered by the subvector.
16890 if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
16891 assert(Src >= 0 && "Expected a source!");
16892 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
16893 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
16894 Src == 0 ? V1 : V2,
16895 DAG.getIntPtrConstant(0, DL));
16896 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16897 DAG.getConstant(0, DL, VT),
16898 Extract, DAG.getIntPtrConstant(0, DL));
16899 }
16900
16901 // Try a simple shift right with undef elements. Later we'll try with zeros.
16902 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
16903 DAG))
16904 return Shift;
16905
16906 // Try to match KSHIFTs.
16907 unsigned Offset = 0;
16908 for (SDValue V : { V1, V2 }) {
16909 unsigned Opcode;
16910 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
16911 if (ShiftAmt >= 0) {
16912 MVT WideVT = VT;
16913 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
16914 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
16915 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
16916 DAG.getUNDEF(WideVT), V,
16917 DAG.getIntPtrConstant(0, DL));
16918 // Widened right shifts need two shifts to ensure we shift in zeroes.
16919 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
16920 int WideElts = WideVT.getVectorNumElements();
16921 // Shift left to put the original vector in the MSBs of the new size.
16922 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
16923 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
16924 // Increase the shift amount to account for the left shift.
16925 ShiftAmt += WideElts - NumElts;
16926 }
16927
16928 Res = DAG.getNode(Opcode, DL, WideVT, Res,
16929 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
16930 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
16931 DAG.getIntPtrConstant(0, DL));
16932 }
16933 Offset += NumElts; // Increment for next iteration.
16934 }
16935
16936
16937
16938 MVT ExtVT;
16939 switch (VT.SimpleTy) {
16940 default:
16941 llvm_unreachable("Expected a vector of i1 elements")::llvm::llvm_unreachable_internal("Expected a vector of i1 elements"
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16941)
;
16942 case MVT::v2i1:
16943 ExtVT = MVT::v2i64;
16944 break;
16945 case MVT::v4i1:
16946 ExtVT = MVT::v4i32;
16947 break;
16948 case MVT::v8i1:
16949 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
16950 // shuffle.
16951 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
16952 break;
16953 case MVT::v16i1:
16954 // Take 512-bit type, unless we are avoiding 512-bit types and have the
16955 // 256-bit operation available.
16956 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
16957 break;
16958 case MVT::v32i1:
16959 // Take 512-bit type, unless we are avoiding 512-bit types and have the
16960 // 256-bit operation available.
16961 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
16962 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
16963 break;
16964 case MVT::v64i1:
16965 ExtVT = MVT::v64i8;
16966 break;
16967 }
16968
16969 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
16970 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
16971
16972 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
16973 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
16974 int NumElems = VT.getVectorNumElements();
16975 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
16976 (Subtarget.hasDQI() && (NumElems < 32)))
16977 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
16978 Shuffle, ISD::SETGT);
16979
16980 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
16981}
16982
16983/// Helper function that returns true if the shuffle mask should be
16984/// commuted to improve canonicalization.
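/// For example, a hypothetical 4-element mask {4, 5, 6, 3} takes three
/// elements from V2 and one from V1, so it is commuted to {0, 1, 2, 7} with
/// V1 and V2 swapped.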
16985static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
16986 int NumElements = Mask.size();
16987
16988 int NumV1Elements = 0, NumV2Elements = 0;
16989 for (int M : Mask)
16990 if (M < 0)
16991 continue;
16992 else if (M < NumElements)
16993 ++NumV1Elements;
16994 else
16995 ++NumV2Elements;
16996
16997 // Commute the shuffle as needed such that more elements come from V1 than
16998 // V2. This allows us to match the shuffle pattern strictly on how many
16999 // elements come from V1 without handling the symmetric cases.
17000 if (NumV2Elements > NumV1Elements)
17001 return true;
17002
17003 assert(NumV1Elements > 0 && "No V1 indices");
17004
17005 if (NumV2Elements == 0)
17006 return false;
17007
17008 // When the number of V1 and V2 elements is the same, try to minimize the
17009 // number of uses of V2 in the low half of the vector. When that is tied,
17010 // ensure that the sum of indices for V1 is equal to or lower than the sum of
17011 // indices for V2. When those are equal, try to ensure that the number of odd
17012 // indices for V1 is lower than the number of odd indices for V2.
17013 if (NumV1Elements == NumV2Elements) {
17014 int LowV1Elements = 0, LowV2Elements = 0;
17015 for (int M : Mask.slice(0, NumElements / 2))
17016 if (M >= NumElements)
17017 ++LowV2Elements;
17018 else if (M >= 0)
17019 ++LowV1Elements;
17020 if (LowV2Elements > LowV1Elements)
17021 return true;
17022 if (LowV2Elements == LowV1Elements) {
17023 int SumV1Indices = 0, SumV2Indices = 0;
17024 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17025 if (Mask[i] >= NumElements)
17026 SumV2Indices += i;
17027 else if (Mask[i] >= 0)
17028 SumV1Indices += i;
17029 if (SumV2Indices < SumV1Indices)
17030 return true;
17031 if (SumV2Indices == SumV1Indices) {
17032 int NumV1OddIndices = 0, NumV2OddIndices = 0;
17033 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17034 if (Mask[i] >= NumElements)
17035 NumV2OddIndices += i % 2;
17036 else if (Mask[i] >= 0)
17037 NumV1OddIndices += i % 2;
17038 if (NumV2OddIndices < NumV1OddIndices)
17039 return true;
17040 }
17041 }
17042 }
17043
17044 return false;
17045}
17046
17047/// Top-level lowering for x86 vector shuffles.
17048///
17049/// This handles decomposition, canonicalization, and lowering of all x86
17050/// vector shuffles. Most of the specific lowering strategies are encapsulated
17051/// above in helper routines. The canonicalization attempts to widen shuffles
17052/// to involve fewer lanes of wider elements, consolidate symmetric patterns
17053/// s.t. only one of the two inputs needs to be tested, etc.
17054static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
17055 SelectionDAG &DAG) {
17056 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
17057 ArrayRef<int> OrigMask = SVOp->getMask();
17058 SDValue V1 = Op.getOperand(0);
17059 SDValue V2 = Op.getOperand(1);
17060 MVT VT = Op.getSimpleValueType();
17061 int NumElements = VT.getVectorNumElements();
17062 SDLoc DL(Op);
17063 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
17064
17065 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
17066 "Can't lower MMX shuffles");
17067
17068 bool V1IsUndef = V1.isUndef();
17069 bool V2IsUndef = V2.isUndef();
17070 if (V1IsUndef && V2IsUndef)
17071 return DAG.getUNDEF(VT);
17072
17073 // When we create a shuffle node we put the UNDEF node as the second operand,
17074 // but in some cases the first operand may be transformed to UNDEF.
17075 // In this case we should just commute the node.
17076 if (V1IsUndef)
17077 return DAG.getCommutedVectorShuffle(*SVOp);
17078
17079 // Check for non-undef masks pointing at an undef vector and make the masks
17080 // undef as well. This makes it easier to match the shuffle based solely on
17081 // the mask.
17082 if (V2IsUndef &&
17083 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
17084 SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
17085 for (int &M : NewMask)
17086 if (M >= NumElements)
17087 M = -1;
17088 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17089 }
17090
17091 // Check for illegal shuffle mask element index values.
17092 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
17093 (void)MaskUpperLimit;
17094 assert(llvm::all_of(OrigMask,
17095 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17096 "Out of bounds shuffle index");
17097
17098 // We actually see shuffles that are entirely re-arrangements of a set of
17099 // zero inputs. This mostly happens while decomposing complex shuffles into
17100 // simple ones. Directly lower these as a buildvector of zeros.
17101 APInt KnownUndef, KnownZero;
17102 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
17103
17104 APInt Zeroable = KnownUndef | KnownZero;
17105 if (Zeroable.isAllOnesValue())
17106 return getZeroVector(VT, Subtarget, DAG, DL);
17107
17108 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
17109
17110 // Try to collapse shuffles into using a vector type with fewer elements but
17111 // wider element types. We cap this to not form integers or floating point
17112 // elements wider than 64 bits, but it might be interesting to form i128
17113 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
17114 SmallVector<int, 16> WidenedMask;
17115 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
17116 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
17117 // Shuffle mask widening should not interfere with a broadcast opportunity
17118 // by obfuscating the operands with bitcasts.
17119 // TODO: Avoid lowering directly from this top-level function: make this
17120 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
17121 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
17122 Subtarget, DAG))
17123 return Broadcast;
17124
17125 MVT NewEltVT = VT.isFloatingPoint()
17126 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
17127 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
17128 int NewNumElts = NumElements / 2;
17129 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
17130 // Make sure that the new vector type is legal. For example, v2f64 isn't
17131 // legal on SSE1.
17132 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
17133 if (V2IsZero) {
17134 // Modify the new Mask to take all zeros from the all-zero vector.
17135 // Choose indices that are blend-friendly.
17136 bool UsedZeroVector = false;
17137 assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
17138 "V2's non-undef elements are used?!");
17139 for (int i = 0; i != NewNumElts; ++i)
17140 if (WidenedMask[i] == SM_SentinelZero) {
17141 WidenedMask[i] = i + NewNumElts;
17142 UsedZeroVector = true;
17143 }
17144 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
17145 // some elements to be undef.
17146 if (UsedZeroVector)
17147 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
17148 }
17149 V1 = DAG.getBitcast(NewVT, V1);
17150 V2 = DAG.getBitcast(NewVT, V2);
17151 return DAG.getBitcast(
17152 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
17153 }
17154 }
17155
17156 // Commute the shuffle if it will improve canonicalization.
17157 SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
17158 if (canonicalizeShuffleMaskWithCommute(Mask)) {
17159 ShuffleVectorSDNode::commuteMask(Mask);
17160 std::swap(V1, V2);
17161 }
17162
17163 if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
17164 return V;
17165
17166 // For each vector width, delegate to a specialized lowering routine.
17167 if (VT.is128BitVector())
17168 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17169
17170 if (VT.is256BitVector())
17171 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17172
17173 if (VT.is512BitVector())
17174 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17175
17176 if (Is1BitVector)
17177 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17178
17179 llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17179)
;
17180}
17181
17182/// Try to lower a VSELECT instruction to a vector shuffle.
17183static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
17184 const X86Subtarget &Subtarget,
17185 SelectionDAG &DAG) {
17186 SDValue Cond = Op.getOperand(0);
17187 SDValue LHS = Op.getOperand(1);
17188 SDValue RHS = Op.getOperand(2);
17189 MVT VT = Op.getSimpleValueType();
17190
17191 // Only non-legal VSELECTs reach this lowering; convert those into generic
17192 // shuffles and re-use the shuffle lowering path for blends.
17193 SmallVector<int, 32> Mask;
17194 if (createShuffleMaskFromVSELECT(Mask, Cond))
17195 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
17196
17197 return SDValue();
17198}
17199
17200SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
17201 SDValue Cond = Op.getOperand(0);
17202 SDValue LHS = Op.getOperand(1);
17203 SDValue RHS = Op.getOperand(2);
17204
17205 // A vselect where all conditions and data are constants can be optimized into
17206 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
17207 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
17208 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
17209 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
17210 return SDValue();
17211
17212 // Try to lower this to a blend-style vector shuffle. This can handle all
17213 // constant condition cases.
17214 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
17215 return BlendOp;
17216
17217 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
17218 // with patterns on the mask registers on AVX-512.
17219 MVT CondVT = Cond.getSimpleValueType();
17220 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
17221 if (CondEltSize == 1)
17222 return Op;
17223
17224 // Variable blends are only legal from SSE4.1 onward.
17225 if (!Subtarget.hasSSE41())
17226 return SDValue();
17227
17228 SDLoc dl(Op);
17229 MVT VT = Op.getSimpleValueType();
17230 unsigned EltSize = VT.getScalarSizeInBits();
17231 unsigned NumElts = VT.getVectorNumElements();
17232
17233 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
17234 // into an i1 condition so that we can use the mask-based 512-bit blend
17235 // instructions.
17236 if (VT.getSizeInBits() == 512) {
17237 // Build a mask by testing the condition against zero.
17238 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
17239 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
17240 DAG.getConstant(0, dl, CondVT),
17241 ISD::SETNE);
17242 // Now return a new VSELECT using the mask.
17243 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
17244 }
17245
17246 // SEXT/TRUNC cases where the mask doesn't match the destination size.
17247 if (CondEltSize != EltSize) {
17248 // If we don't have a sign splat, rely on the expansion.
17249 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
17250 return SDValue();
17251
17252 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
17253 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
17254 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
17255 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
17256 }
17257
17258 // Only some types will be legal on some subtargets. If we can emit a legal
17259 // VSELECT-matching blend, return Op, but if we need to expand, return
17260 // a null value.
17261 switch (VT.SimpleTy) {
17262 default:
17263 // Most of the vector types have blends past SSE4.1.
17264 return Op;
17265
17266 case MVT::v32i8:
17267 // The byte blends for AVX vectors were introduced only in AVX2.
17268 if (Subtarget.hasAVX2())
17269 return Op;
17270
17271 return SDValue();
17272
17273 case MVT::v8i16:
17274 case MVT::v16i16: {
17275 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
17276 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
17277 Cond = DAG.getBitcast(CastVT, Cond);
17278 LHS = DAG.getBitcast(CastVT, LHS);
17279 RHS = DAG.getBitcast(CastVT, RHS);
17280 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
17281 return DAG.getBitcast(VT, Select);
17282 }
17283 }
17284}
17285
17286static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
17287 MVT VT = Op.getSimpleValueType();
17288 SDLoc dl(Op);
17289
17290 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
17291 return SDValue();
17292
17293 if (VT.getSizeInBits() == 8) {
17294 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
17295 Op.getOperand(0), Op.getOperand(1));
17296 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
17297 }
17298
17299 if (VT == MVT::f32) {
17300 // EXTRACTPS outputs to a GPR32 register, which will require a movd to copy
17301 // the result back to an FR32 register. It's only worth matching if the
17302 // result has a single use which is a store or a bitcast to i32. And in
17303 // the case of a store, it's not worth it if the index is a constant 0,
17304 // because a MOVSSmr can be used instead, which is smaller and faster.
17305 if (!Op.hasOneUse())
17306 return SDValue();
17307 SDNode *User = *Op.getNode()->use_begin();
17308 if ((User->getOpcode() != ISD::STORE ||
17309 isNullConstant(Op.getOperand(1))) &&
17310 (User->getOpcode() != ISD::BITCAST ||
17311 User->getValueType(0) != MVT::i32))
17312 return SDValue();
17313 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17314 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
17315 Op.getOperand(1));
17316 return DAG.getBitcast(MVT::f32, Extract);
17317 }
17318
17319 if (VT == MVT::i32 || VT == MVT::i64) {
17320 // ExtractPS/pextrq work with a constant index.
17321 if (isa<ConstantSDNode>(Op.getOperand(1)))
17322 return Op;
17323 }
17324
17325 return SDValue();
17326}
17327
17328/// Extract one bit from mask vector, like v16i1 or v8i1.
17329/// AVX-512 feature.
17330static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
17331 const X86Subtarget &Subtarget) {
17332 SDValue Vec = Op.getOperand(0);
17333 SDLoc dl(Vec);
17334 MVT VecVT = Vec.getSimpleValueType();
17335 SDValue Idx = Op.getOperand(1);
17336 MVT EltVT = Op.getSimpleValueType();
17337
17338 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
17339 "Unexpected vector type in ExtractBitFromMaskVector");
17340
17341 // A variable index can't be handled in mask registers, so
17342 // extend the vector to VR512/128.
17343 if (!isa<ConstantSDNode>(Idx)) {
17344 unsigned NumElts = VecVT.getVectorNumElements();
17345 // Extending v8i1/v16i1 to 512 bits gets better performance on KNL
17346 // than extending to 128/256 bits.
17347 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
17348 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
17349 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
17350 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
17351 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
17352 }
17353
17354 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
17355 if (IdxVal == 0) // the operation is legal
17356 return Op;
17357
17358 // Extend to natively supported kshift.
17359 unsigned NumElems = VecVT.getVectorNumElements();
17360 MVT WideVecVT = VecVT;
17361 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
17362 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
17363 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
17364 DAG.getUNDEF(WideVecVT), Vec,
17365 DAG.getIntPtrConstant(0, dl));
17366 }
17367
17368 // Use kshiftr instruction to move to the lower element.
17369 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
17370 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17371
17372 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
17373 DAG.getIntPtrConstant(0, dl));
17374}
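
The constant-index path above moves the requested mask bit down to lane 0 with KSHIFTR and then extracts element 0. A minimal scalar sketch of the same idea (an illustration, not code from this file):

#include <cstdint>

// Shift the requested bit down to position 0 (the KSHIFTR step), then read
// bit 0 (the EXTRACT_VECTOR_ELT of element 0).
bool extractMaskBit(uint16_t Mask, unsigned IdxVal) {
  uint16_t Shifted = Mask >> IdxVal; // KSHIFTR by IdxVal
  return Shifted & 1;                // extract element 0
}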
17375
17376SDValue
17377X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
17378 SelectionDAG &DAG) const {
17379 SDLoc dl(Op);
17380 SDValue Vec = Op.getOperand(0);
17381 MVT VecVT = Vec.getSimpleValueType();
17382 SDValue Idx = Op.getOperand(1);
17383
17384 if (VecVT.getVectorElementType() == MVT::i1)
17385 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
17386
17387 if (!isa<ConstantSDNode>(Idx)) {
17388 // It's more profitable to go through memory (1 cycle throughput)
17389 // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
17390 // IACA tool was used to get performance estimation
17391 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
17392 //
17393 // example : extractelement <16 x i8> %a, i32 %i
17394 //
17395 // Block Throughput: 3.00 Cycles
17396 // Throughput Bottleneck: Port5
17397 //
17398 // | Num Of | Ports pressure in cycles | |
17399 // | Uops | 0 - DV | 5 | 6 | 7 | |
17400 // ---------------------------------------------
17401 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
17402 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
17403 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
17404 // Total Num Of Uops: 4
17405 //
17406 //
17407 // Block Throughput: 1.00 Cycles
17408 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
17409 //
17410 // | | Ports pressure in cycles | |
17411 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
17412 // ---------------------------------------------------------
17413 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
17414 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
17415 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
17416 // Total Num Of Uops: 4
17417
17418 return SDValue();
17419 }
17420
17421 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
17422
17423 // If this is a 256-bit vector result, first extract the 128-bit vector and
17424 // then extract the element from the 128-bit vector.
17425 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
17426 // Get the 128-bit vector.
17427 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
17428 MVT EltVT = VecVT.getVectorElementType();
17429
17430 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
17431 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
17432
17433 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
17434 // this can be done with a mask.
17435 IdxVal &= ElemsPerChunk - 1;
17436 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
17437 DAG.getIntPtrConstant(IdxVal, dl));
17438 }
17439
17440 assert(VecVT.is128BitVector() && "Unexpected vector length");
17441
17442 MVT VT = Op.getSimpleValueType();
17443
17444 if (VT.getSizeInBits() == 16) {
17445 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
17446 // we're going to zero extend the register or fold the store (SSE41 only).
17447 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
17448 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
17449 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
17450 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17451 DAG.getBitcast(MVT::v4i32, Vec), Idx));
17452
17453 // Transform it so it matches pextrw, which produces a 32-bit result.
17454 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
17455 Op.getOperand(0), Op.getOperand(1));
17456 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
17457 }
17458
17459 if (Subtarget.hasSSE41())
17460 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
17461 return Res;
17462
17463 // TODO: We only extract a single element from v16i8; we can probably afford
17464 // to be more aggressive here before using the default approach of spilling to
17465 // stack.
17466 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
17467 // Extract either the lowest i32 or any i16, and extract the sub-byte.
17468 int DWordIdx = IdxVal / 4;
17469 if (DWordIdx == 0) {
17470 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17471 DAG.getBitcast(MVT::v4i32, Vec),
17472 DAG.getIntPtrConstant(DWordIdx, dl));
17473 int ShiftVal = (IdxVal % 4) * 8;
17474 if (ShiftVal != 0)
17475 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
17476 DAG.getConstant(ShiftVal, dl, MVT::i8));
17477 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
17478 }
17479
17480 int WordIdx = IdxVal / 2;
17481 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
17482 DAG.getBitcast(MVT::v8i16, Vec),
17483 DAG.getIntPtrConstant(WordIdx, dl));
17484 int ShiftVal = (IdxVal % 2) * 8;
17485 if (ShiftVal != 0)
17486 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
17487 DAG.getConstant(ShiftVal, dl, MVT::i8));
17488 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
17489 }
17490
17491 if (VT.getSizeInBits() == 32) {
17492 if (IdxVal == 0)
17493 return Op;
17494
17495 // SHUFPS the element to the lowest double word, then movss.
17496 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
17497 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
17498 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
17499 DAG.getIntPtrConstant(0, dl));
17500 }
17501
17502 if (VT.getSizeInBits() == 64) {
17503 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
17504 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
17505 // to match extract_elt for f64.
17506 if (IdxVal == 0)
17507 return Op;
17508
17509 // UNPCKHPD the element to the lowest double word, then movsd.
17510 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
17511 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
17512 int Mask[2] = { 1, -1 };
17513 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
17514 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
17515 DAG.getIntPtrConstant(0, dl));
17516 }
17517
17518 return SDValue();
17519}
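
For the v16i8 case above, a byte is recovered from a wider extract plus a shift and truncate. A scalar model of that chain (illustrative only; it assumes the x86 little-endian byte order within each dword):

#include <cstdint>

// Read the 32-bit chunk containing byte IdxVal, shift the byte down, truncate:
// the EXTRACT_VECTOR_ELT + SRL + TRUNCATE sequence built for DWordIdx == 0.
uint8_t extractByteViaDword(const uint32_t Dwords[4], unsigned IdxVal) {
  uint32_t Chunk = Dwords[IdxVal / 4];            // extract the containing i32
  unsigned ShiftVal = (IdxVal % 4) * 8;           // byte offset in bits
  return static_cast<uint8_t>(Chunk >> ShiftVal); // SRL + TRUNCATE
}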
17520
17521/// Insert one bit to mask vector, like v16i1 or v8i1.
17522/// AVX-512 feature.
17523static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
17524 const X86Subtarget &Subtarget) {
17525 SDLoc dl(Op);
17526 SDValue Vec = Op.getOperand(0);
17527 SDValue Elt = Op.getOperand(1);
17528 SDValue Idx = Op.getOperand(2);
17529 MVT VecVT = Vec.getSimpleValueType();
17530
17531 if (!isa<ConstantSDNode>(Idx)) {
17532 // Non-constant index. Extend source and destination,
17533 // insert element and then truncate the result.
17534 unsigned NumElts = VecVT.getVectorNumElements();
17535 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
17536 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
17537 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
17538 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
17539 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
17540 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
17541 }
17542
17543 // Copy into a k-register, extract to v1i1 and insert_subvector.
17544 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
17545
17546 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
17547 Op.getOperand(2));
17548}
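
Semantically, the constant-index path above replaces a single bit of the k-mask. A scalar sketch of what the insertion computes (not of how the INSERT_SUBVECTOR is combined later):

#include <cstdint>

// Clear the target bit, then OR in the new value shifted into place.
uint16_t insertMaskBit(uint16_t Mask, bool Elt, unsigned IdxVal) {
  Mask &= static_cast<uint16_t>(~(1u << IdxVal));
  Mask |= static_cast<uint16_t>(static_cast<unsigned>(Elt) << IdxVal);
  return Mask;
}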
17549
17550SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
17551 SelectionDAG &DAG) const {
17552 MVT VT = Op.getSimpleValueType();
17553 MVT EltVT = VT.getVectorElementType();
17554 unsigned NumElts = VT.getVectorNumElements();
17555
17556 if (EltVT == MVT::i1)
17557 return InsertBitToMaskVector(Op, DAG, Subtarget);
17558
17559 SDLoc dl(Op);
17560 SDValue N0 = Op.getOperand(0);
17561 SDValue N1 = Op.getOperand(1);
17562 SDValue N2 = Op.getOperand(2);
17563
17564 auto *N2C = dyn_cast<ConstantSDNode>(N2);
17565 if (!N2C || N2C->getAPIntValue().uge(NumElts))
17566 return SDValue();
17567 uint64_t IdxVal = N2C->getZExtValue();
17568
17569 bool IsZeroElt = X86::isZeroNode(N1);
17570 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
17571
17572 // If we are inserting an element, see if we can do this more efficiently with
17573 // a blend shuffle with a rematerializable vector than a costly integer
17574 // insertion.
17575 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
17576 16 <= EltVT.getSizeInBits()) {
17577 SmallVector<int, 8> BlendMask;
17578 for (unsigned i = 0; i != NumElts; ++i)
17579 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
17580 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
17581 : getOnesVector(VT, DAG, dl);
17582 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
17583 }
17584
17585 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
17586 // into that, and then insert the subvector back into the result.
17587 if (VT.is256BitVector() || VT.is512BitVector()) {
17588 // With a 256-bit vector, we can insert into the zero element efficiently
17589 // using a blend if we have AVX or AVX2 and the right data type.
17590 if (VT.is256BitVector() && IdxVal == 0) {
17591 // TODO: It is worthwhile to cast integer to floating point and back
17592 // and incur a domain crossing penalty if that's what we'll end up
17593 // doing anyway after extracting to a 128-bit vector.
17594 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
17595 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
17596 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
17597 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
17598 DAG.getTargetConstant(1, dl, MVT::i8));
17599 }
17600 }
17601
17602 // Get the desired 128-bit vector chunk.
17603 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
17604
17605 // Insert the element into the desired chunk.
17606 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
17607 assert(isPowerOf2_32(NumEltsIn128));
17608 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
17609 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
17610
17611 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
17612 DAG.getIntPtrConstant(IdxIn128, dl));
17613
17614 // Insert the changed part back into the bigger vector
17615 return insert128BitVector(N0, V, IdxVal, DAG, dl);
17616 }
17617 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
17618
17619 // This will be just movd/movq/movss/movsd.
17620 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode()) &&
17621 (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
17622 EltVT == MVT::i64)) {
17623 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
17624 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
17625 }
17626
17627 // Transform it so it match pinsr{b,w} which expects a GR32 as its second
17628 // argument. SSE41 required for pinsrb.
17629 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
17630 unsigned Opc;
17631 if (VT == MVT::v8i16) {
17632 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
17633 Opc = X86ISD::PINSRW;
17634 } else {
17635 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
17636 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
17637 Opc = X86ISD::PINSRB;
17638 }
17639
17640 if (N1.getValueType() != MVT::i32)
17641 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
17642 if (N2.getValueType() != MVT::i32)
17643 N2 = DAG.getIntPtrConstant(IdxVal, dl);
17644 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
17645 }
17646
17647 if (Subtarget.hasSSE41()) {
17648 if (EltVT == MVT::f32) {
17649 // Bits [7:6] of the constant are the source select. This will always be
17650 // zero here. The DAG Combiner may combine an extract_elt index into
17651 // these bits. For example (insert (extract, 3), 2) could be matched by
17652 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
17653 // Bits [5:4] of the constant are the destination select. This is the
17654 // value of the incoming immediate.
17655 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
17656 // combine either bitwise AND or insert of float 0.0 to set these bits.
17657
17658 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
17659 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
17660 // If this is an insertion of 32-bits into the low 32-bits of
17661 // a vector, we prefer to generate a blend with immediate rather
17662 // than an insertps. Blends are simpler operations in hardware and so
17663 // will always have equal or better performance than insertps.
17664 // But if optimizing for size and there's a load folding opportunity,
17665 // generate insertps because blendps does not have a 32-bit memory
17666 // operand form.
17667 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
17668 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
17669 DAG.getTargetConstant(1, dl, MVT::i8));
17670 }
17671 // Create this as a scalar to vector.
17672 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
17673 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
17674 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
17675 }
17676
17677 // PINSR* works with constant index.
17678 if (EltVT == MVT::i32 || EltVT == MVT::i64)
17679 return Op;
17680 }
17681
17682 return SDValue();
17683}
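
The f32 path above encodes the INSERTPS immediate as IdxVal << 4. A small helper showing the full immediate layout described in the comment (a sketch for illustration, not an LLVM API):

#include <cstdint>

// Bits [7:6]: source lane select; bits [5:4]: destination lane select;
// bits [3:0]: zero mask. The lowering above only sets the destination field.
uint8_t makeInsertPSImm(unsigned SrcLane, unsigned DstLane, unsigned ZeroMask) {
  return static_cast<uint8_t>(((SrcLane & 0x3) << 6) | ((DstLane & 0x3) << 4) |
                              (ZeroMask & 0xF));
}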
17684
17685static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
17686 SelectionDAG &DAG) {
17687 SDLoc dl(Op);
17688 MVT OpVT = Op.getSimpleValueType();
17689
17690 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
17691 // further combines.
17692 if (X86::isZeroNode(Op.getOperand(0)))
17693 return getZeroVector(OpVT, Subtarget, DAG, dl);
17694
17695 // If this is a 256-bit vector result, first insert into a 128-bit
17696 // vector and then insert into the 256-bit vector.
17697 if (!OpVT.is128BitVector()) {
17698 // Insert into a 128-bit vector.
17699 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
17700 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
17701 OpVT.getVectorNumElements() / SizeFactor);
17702
17703 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
17704
17705 // Insert the 128-bit vector.
17706 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
17707 }
17708 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
17709 "Expected an SSE type!");
17710
17711 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
17712 if (OpVT == MVT::v4i32)
17713 return Op;
17714
17715 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
17716 return DAG.getBitcast(
17717 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
17718}
17719
17720// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
17721// simple superregister reference or explicit instructions to insert
17722// the upper bits of a vector.
17723static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
17724 SelectionDAG &DAG) {
17725 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
17726
17727 return insert1BitVector(Op, DAG, Subtarget);
17728}
17729
17730static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
17731 SelectionDAG &DAG) {
17732 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
17733 "Only vXi1 extract_subvectors need custom lowering");
17734
17735 SDLoc dl(Op);
17736 SDValue Vec = Op.getOperand(0);
17737 SDValue Idx = Op.getOperand(1);
17738
17739 if (!isa<ConstantSDNode>(Idx))
17740 return SDValue();
17741
17742 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
17743 if (IdxVal == 0) // the operation is legal
17744 return Op;
17745
17746 MVT VecVT = Vec.getSimpleValueType();
17747 unsigned NumElems = VecVT.getVectorNumElements();
17748
17749 // Extend to natively supported kshift.
17750 MVT WideVecVT = VecVT;
17751 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
17752 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
17753 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
17754 DAG.getUNDEF(WideVecVT), Vec,
17755 DAG.getIntPtrConstant(0, dl));
17756 }
17757
17758 // Shift to the LSB.
17759 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
17760 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17761
17762 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
17763 DAG.getIntPtrConstant(0, dl));
17764}
17765
17766// Returns the appropriate wrapper opcode for a global reference.
17767unsigned X86TargetLowering::getGlobalWrapperKind(
17768 const GlobalValue *GV, const unsigned char OpFlags) const {
17769 // References to absolute symbols are never PC-relative.
17770 if (GV && GV->isAbsoluteSymbolRef())
17771 return X86ISD::Wrapper;
17772
17773 CodeModel::Model M = getTargetMachine().getCodeModel();
17774 if (Subtarget.isPICStyleRIPRel() &&
17775 (M == CodeModel::Small || M == CodeModel::Kernel))
17776 return X86ISD::WrapperRIP;
17777
17778 // GOTPCREL references must always use RIP.
17779 if (OpFlags == X86II::MO_GOTPCREL)
17780 return X86ISD::WrapperRIP;
17781
17782 return X86ISD::Wrapper;
17783}
17784
17785 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
17786 // their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
17787 // one of the above-mentioned nodes. It has to be wrapped because otherwise
17788 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
17789 // be used to form an addressing mode. These wrapped nodes will be selected
17790 // into MOV32ri.
17791SDValue
17792X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
17793 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
17794
17795 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
17796 // global base reg.
17797 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
17798
17799 auto PtrVT = getPointerTy(DAG.getDataLayout());
17800 SDValue Result = DAG.getTargetConstantPool(
17801 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
17802 SDLoc DL(CP);
17803 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
17804 // With PIC, the address is actually $g + Offset.
17805 if (OpFlag) {
17806 Result =
17807 DAG.getNode(ISD::ADD, DL, PtrVT,
17808 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
17809 }
17810
17811 return Result;
17812}
17813
17814SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
17815 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
17816
17817 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
17818 // global base reg.
17819 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
17820
17821 auto PtrVT = getPointerTy(DAG.getDataLayout());
17822 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
17823 SDLoc DL(JT);
17824 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
17825
17826 // With PIC, the address is actually $g + Offset.
17827 if (OpFlag)
17828 Result =
17829 DAG.getNode(ISD::ADD, DL, PtrVT,
17830 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
17831
17832 return Result;
17833}
17834
17835SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
17836 SelectionDAG &DAG) const {
17837 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
17838}
17839
17840SDValue
17841X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
17842 // Create the TargetBlockAddressAddress node.
17843 unsigned char OpFlags =
17844 Subtarget.classifyBlockAddressReference();
17845 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
17846 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
17847 SDLoc dl(Op);
17848 auto PtrVT = getPointerTy(DAG.getDataLayout());
17849 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
17850 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
17851
17852 // With PIC, the address is actually $g + Offset.
17853 if (isGlobalRelativeToPICBase(OpFlags)) {
17854 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
17855 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
17856 }
17857
17858 return Result;
17859}
17860
17861/// Creates target global address or external symbol nodes for calls or
17862/// other uses.
17863SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
17864 bool ForCall) const {
17865 // Unpack the global address or external symbol.
17866 const SDLoc &dl = SDLoc(Op);
17867 const GlobalValue *GV = nullptr;
17868 int64_t Offset = 0;
17869 const char *ExternalSym = nullptr;
17870 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
17871 GV = G->getGlobal();
17872 Offset = G->getOffset();
17873 } else {
17874 const auto *ES = cast<ExternalSymbolSDNode>(Op);
17875 ExternalSym = ES->getSymbol();
17876 }
17877
17878 // Calculate some flags for address lowering.
17879 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
17880 unsigned char OpFlags;
17881 if (ForCall)
17882 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
17883 else
17884 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
17885 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
17886 bool NeedsLoad = isGlobalStubReference(OpFlags);
17887
17888 CodeModel::Model M = DAG.getTarget().getCodeModel();
17889 auto PtrVT = getPointerTy(DAG.getDataLayout());
17890 SDValue Result;
17891
17892 if (GV) {
17893 // Create a target global address if this is a global. If possible, fold the
17894 // offset into the global address reference. Otherwise, ADD it on later.
17895 int64_t GlobalOffset = 0;
17896 if (OpFlags == X86II::MO_NO_FLAG &&
17897 X86::isOffsetSuitableForCodeModel(Offset, M)) {
17898 std::swap(GlobalOffset, Offset);
17899 }
17900 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
17901 } else {
17902 // If this is not a global address, this must be an external symbol.
17903 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
17904 }
17905
17906 // If this is a direct call, avoid the wrapper if we don't need to do any
17907 // loads or adds. This allows SDAG ISel to match direct calls.
17908 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
17909 return Result;
17910
17911 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
17912
17913 // With PIC, the address is actually $g + Offset.
17914 if (HasPICReg) {
17915 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
17916 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
17917 }
17918
17919 // For globals that require a load from a stub to get the address, emit the
17920 // load.
17921 if (NeedsLoad)
17922 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
17923 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
17924
17925 // If there was a non-zero offset that we didn't fold, create an explicit
17926 // addition for it.
17927 if (Offset != 0)
17928 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
17929 DAG.getConstant(Offset, dl, PtrVT));
17930
17931 return Result;
17932}
17933
17934SDValue
17935X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
17936 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
17937}
17938
17939static SDValue
17940GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
17941 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
17942 unsigned char OperandFlags, bool LocalDynamic = false) {
17943 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
17944 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
17945 SDLoc dl(GA);
17946 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
17947 GA->getValueType(0),
17948 GA->getOffset(),
17949 OperandFlags);
17950
17951 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
17952 : X86ISD::TLSADDR;
17953
17954 if (InFlag) {
17955 SDValue Ops[] = { Chain, TGA, *InFlag };
17956 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
17957 } else {
17958 SDValue Ops[] = { Chain, TGA };
17959 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
17960 }
17961
17962 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
17963 MFI.setAdjustsStack(true);
17964 MFI.setHasCalls(true);
17965
17966 SDValue Flag = Chain.getValue(1);
17967 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
17968}
17969
17970// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
17971static SDValue
17972LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
17973 const EVT PtrVT) {
17974 SDValue InFlag;
17975 SDLoc dl(GA); // ? function entry point might be better
17976 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
17977 DAG.getNode(X86ISD::GlobalBaseReg,
17978 SDLoc(), PtrVT), InFlag);
17979 InFlag = Chain.getValue(1);
17980
17981 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
17982}
17983
17984// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
17985static SDValue
17986LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
17987 const EVT PtrVT) {
17988 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
17989 X86::RAX, X86II::MO_TLSGD);
17990}
17991
17992static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
17993 SelectionDAG &DAG,
17994 const EVT PtrVT,
17995 bool is64Bit) {
17996 SDLoc dl(GA);
17997
17998 // Get the start address of the TLS block for this module.
17999 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
18000 .getInfo<X86MachineFunctionInfo>();
18001 MFI->incNumLocalDynamicTLSAccesses();
18002
18003 SDValue Base;
18004 if (is64Bit) {
18005 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
18006 X86II::MO_TLSLD, /*LocalDynamic=*/true);
18007 } else {
18008 SDValue InFlag;
18009 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18010 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
18011 InFlag = Chain.getValue(1);
18012 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
18013 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
18014 }
18015
18016 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
18017 // of Base.
18018
18019 // Build x@dtpoff.
18020 unsigned char OperandFlags = X86II::MO_DTPOFF;
18021 unsigned WrapperKind = X86ISD::Wrapper;
18022 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18023 GA->getValueType(0),
18024 GA->getOffset(), OperandFlags);
18025 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18026
18027 // Add x@dtpoff with the base.
18028 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
18029}
18030
18031// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
18032static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18033 const EVT PtrVT, TLSModel::Model model,
18034 bool is64Bit, bool isPIC) {
18035 SDLoc dl(GA);
18036
18037 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
18038 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
18039 is64Bit ? 257 : 256));
18040
18041 SDValue ThreadPointer =
18042 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
18043 MachinePointerInfo(Ptr));
18044
18045 unsigned char OperandFlags = 0;
18046 // Most TLS accesses are not RIP-relative, even on x86-64. One exception is
18047 // initial exec.
18048 unsigned WrapperKind = X86ISD::Wrapper;
18049 if (model == TLSModel::LocalExec) {
18050 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
18051 } else if (model == TLSModel::InitialExec) {
18052 if (is64Bit) {
18053 OperandFlags = X86II::MO_GOTTPOFF;
18054 WrapperKind = X86ISD::WrapperRIP;
18055 } else {
18056 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
18057 }
18058 } else {
18059 llvm_unreachable("Unexpected model");
18060 }
18061
18062 // emit "addl x@ntpoff,%eax" (local exec)
18063 // or "addl x@indntpoff,%eax" (initial exec)
18064 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
18065 SDValue TGA =
18066 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18067 GA->getOffset(), OperandFlags);
18068 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18069
18070 if (model == TLSModel::InitialExec) {
18071 if (isPIC && !is64Bit) {
18072 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
18073 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18074 Offset);
18075 }
18076
18077 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
18078 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18079 }
18080
18081 // The address of the thread local variable is the add of the thread
18082 // pointer with the offset of the variable.
18083 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
18084}
18085
18086SDValue
18087X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
18088
18089 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
18090
18091 if (DAG.getTarget().useEmulatedTLS())
18092 return LowerToTLSEmulatedModel(GA, DAG);
18093
18094 const GlobalValue *GV = GA->getGlobal();
18095 auto PtrVT = getPointerTy(DAG.getDataLayout());
18096 bool PositionIndependent = isPositionIndependent();
18097
18098 if (Subtarget.isTargetELF()) {
18099 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
18100 switch (model) {
18101 case TLSModel::GeneralDynamic:
18102 if (Subtarget.is64Bit())
18103 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
18104 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
18105 case TLSModel::LocalDynamic:
18106 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
18107 Subtarget.is64Bit());
18108 case TLSModel::InitialExec:
18109 case TLSModel::LocalExec:
18110 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
18111 PositionIndependent);
18112 }
18113 llvm_unreachable("Unknown TLS model.");
18114 }
18115
18116 if (Subtarget.isTargetDarwin()) {
18117 // Darwin only has one model of TLS. Lower to that.
18118 unsigned char OpFlag = 0;
18119 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
18120 X86ISD::WrapperRIP : X86ISD::Wrapper;
18121
18122 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18123 // global base reg.
18124 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
18125 if (PIC32)
18126 OpFlag = X86II::MO_TLVP_PIC_BASE;
18127 else
18128 OpFlag = X86II::MO_TLVP;
18129 SDLoc DL(Op);
18130 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
18131 GA->getValueType(0),
18132 GA->getOffset(), OpFlag);
18133 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
18134
18135 // With PIC32, the address is actually $g + Offset.
18136 if (PIC32)
18137 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
18138 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18139 Offset);
18140
18141 // Lowering the machine isd will make sure everything is in the right
18142 // location.
18143 SDValue Chain = DAG.getEntryNode();
18144 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18145 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
18146 SDValue Args[] = { Chain, Offset };
18147 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
18148 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
18149 DAG.getIntPtrConstant(0, DL, true),
18150 Chain.getValue(1), DL);
18151
18152 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
18153 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18154 MFI.setAdjustsStack(true);
18155
18156 // And our return value (tls address) is in the standard call return value
18157 // location.
18158 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
18159 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
18160 }
18161
18162 if (Subtarget.isOSWindows()) {
18163 // Just use the implicit TLS architecture
18164 // Need to generate something similar to:
18165 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
18166 // ; from TEB
18167 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
18168 // mov rcx, qword [rdx+rcx*8]
18169 // mov eax, .tls$:tlsvar
18170 // [rax+rcx] contains the address
18171 // Windows 64bit: gs:0x58
18172 // Windows 32bit: fs:__tls_array
18173
18174 SDLoc dl(GA);
18175 SDValue Chain = DAG.getEntryNode();
18176
18177 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
18178 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
18179 // use its literal value of 0x2C.
18180 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
18181 ? Type::getInt8PtrTy(*DAG.getContext(),
18182 256)
18183 : Type::getInt32PtrTy(*DAG.getContext(),
18184 257));
18185
18186 SDValue TlsArray = Subtarget.is64Bit()
18187 ? DAG.getIntPtrConstant(0x58, dl)
18188 : (Subtarget.isTargetWindowsGNU()
18189 ? DAG.getIntPtrConstant(0x2C, dl)
18190 : DAG.getExternalSymbol("_tls_array", PtrVT));
18191
18192 SDValue ThreadPointer =
18193 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
18194
18195 SDValue res;
18196 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
18197 res = ThreadPointer;
18198 } else {
18199 // Load the _tls_index variable
18200 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
18201 if (Subtarget.is64Bit())
18202 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
18203 MachinePointerInfo(), MVT::i32);
18204 else
18205 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
18206
18207 auto &DL = DAG.getDataLayout();
18208 SDValue Scale =
18209 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
18210 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
18211
18212 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
18213 }
18214
18215 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
18216
18217 // Get the offset of start of .tls section
18218 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18219 GA->getValueType(0),
18220 GA->getOffset(), X86II::MO_SECREL);
18221 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
18222
18223 // The address of the thread local variable is the add of the thread
18224 // pointer with the offset of the variable.
18225 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
18226 }
18227
18228 llvm_unreachable("TLS not implemented for this target.");
18229}
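
For the Windows implicit-TLS block above, the emitted sequence amounts to indexing the per-thread module array and adding the variable's .tls$-relative offset. A rough pointer-level sketch, with all names as hypothetical stand-ins for the values the lowering loads:

#include <cstdint>

// 'ThreadPointer' models the value read from gs:0x58 / fs:__tls_array,
// 'TlsIndex' models _tls_index, and 'SecRelOffset' the X86II::MO_SECREL offset.
uint8_t *windowsTlsAddress(uint8_t **ThreadPointer, uint32_t TlsIndex,
                           uint64_t SecRelOffset) {
  uint8_t *ModuleTlsBase = ThreadPointer[TlsIndex]; // mov rcx, qword [rdx+rcx*8]
  return ModuleTlsBase + SecRelOffset;              // [rax+rcx] holds the address
}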
18230
18231/// Lower SRA_PARTS and friends, which return two i32 values
18232/// and take a 2 x i32 value to shift plus a shift amount.
18233/// TODO: Can this be moved to general expansion code?
18234static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
18235 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
18236 MVT VT = Op.getSimpleValueType();
18237 unsigned VTBits = VT.getSizeInBits();
18238 SDLoc dl(Op);
18239 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
18240 SDValue ShOpLo = Op.getOperand(0);
18241 SDValue ShOpHi = Op.getOperand(1);
18242 SDValue ShAmt = Op.getOperand(2);
18243 // ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and
18244 // ISD::SRA/L nodes haven't. Insert an AND to be safe, it's optimized away
18245 // during isel.
18246 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
18247 DAG.getConstant(VTBits - 1, dl, MVT::i8));
18248 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
18249 DAG.getConstant(VTBits - 1, dl, MVT::i8))
18250 : DAG.getConstant(0, dl, VT);
18251
18252 SDValue Tmp2, Tmp3;
18253 if (Op.getOpcode() == ISD::SHL_PARTS) {
18254 Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt);
18255 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
18256 } else {
18257 Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt);
18258 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
18259 }
18260
18261 // If the shift amount is larger or equal than the width of a part we can't
18262 // rely on the results of shld/shrd. Insert a test and select the appropriate
18263 // values for large shift amounts.
18264 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
18265 DAG.getConstant(VTBits, dl, MVT::i8));
18266 SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
18267 DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);
18268
18269 SDValue Hi, Lo;
18270 if (Op.getOpcode() == ISD::SHL_PARTS) {
18271 Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
18272 Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
18273 } else {
18274 Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
18275 Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
18276 }
18277
18278 return DAG.getMergeValues({ Lo, Hi }, dl);
18279}
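
The SHL_PARTS case above pairs a funnel shift with a plain shift and selects between them when the amount reaches the part width. A scalar model for 32-bit parts (a sketch of the semantics, not the DAG code):

#include <cstdint>
#include <utility>

// Returns {Lo, Hi} of the 64-bit value (Hi:Lo) shifted left by Amt (0..63).
std::pair<uint32_t, uint32_t> shlParts(uint32_t Lo, uint32_t Hi, unsigned Amt) {
  unsigned SafeAmt = Amt & 31;                    // the AND inserted before SHL
  uint32_t Fsh = (SafeAmt == 0)                   // FSHL(Hi, Lo, Amt)
                     ? Hi
                     : (Hi << SafeAmt) | (Lo >> (32 - SafeAmt));
  uint32_t Shl = Lo << SafeAmt;                   // SHL(Lo, SafeAmt)
  bool Large = (Amt & 32) != 0;                   // the SETNE test on (Amt & 32)
  return {Large ? 0u : Shl, Large ? Shl : Fsh};   // the two SELECTs
}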
18280
18281static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
18282 SelectionDAG &DAG) {
18283 MVT VT = Op.getSimpleValueType();
18284 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
18285 "Unexpected funnel shift opcode!");
18286
18287 SDLoc DL(Op);
18288 SDValue Op0 = Op.getOperand(0);
18289 SDValue Op1 = Op.getOperand(1);
18290 SDValue Amt = Op.getOperand(2);
18291
18292 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
18293
18294 if (VT.isVector()) {
18295 assert(Subtarget.hasVBMI2() && "Expected VBMI2");
18296
18297 if (IsFSHR)
18298 std::swap(Op0, Op1);
18299
18300 APInt APIntShiftAmt;
18301 if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
18302 uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
18303 return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0,
18304 Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18305 }
18306
18307 return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
18308 Op0, Op1, Amt);
18309 }
18310
18311 assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
18312 "Unexpected funnel shift type!");
18313
18314 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
18315 bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
18316 if (!OptForSize && Subtarget.isSHLDSlow())
18317 return SDValue();
18318
18319 if (IsFSHR)
18320 std::swap(Op0, Op1);
18321
18322 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
18323 if (VT == MVT::i16)
18324 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
18325 DAG.getConstant(15, DL, Amt.getValueType()));
18326
18327 unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD);
18328 return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt);
18329}
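
For reference, the scalar funnel shift being mapped to SHLD above has these semantics (a sketch following the usual ISD::FSHL definition):

#include <cstdint>

uint32_t fshl32(uint32_t X, uint32_t Y, unsigned Amt) {
  Amt %= 32;   // i32/i64 SHLD applies this modulo implicitly; i16 needs the AND
  if (Amt == 0)
    return X;  // avoid the undefined shift of Y by 32
  return (X << Amt) | (Y >> (32 - Amt));
}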
18330
18331// Try to use a packed vector operation to handle i64 on 32-bit targets when
18332// AVX512DQ is enabled.
18333static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
18334 const X86Subtarget &Subtarget) {
18335 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
18336 Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");
18337 SDValue Src = Op.getOperand(0);
18338 MVT SrcVT = Src.getSimpleValueType();
18339 MVT VT = Op.getSimpleValueType();
18340
18341 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
18342 (VT != MVT::f32 && VT != MVT::f64))
18343 return SDValue();
18344
18345 // Pack the i64 into a vector, do the operation and extract.
18346
18347 // Using 256-bit to ensure result is 128-bits for f32 case.
18348 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
18349 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
18350 MVT VecVT = MVT::getVectorVT(VT, NumElts);
18351
18352 SDLoc dl(Op);
18353 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
18354 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
18355 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
18356 DAG.getIntPtrConstant(0, dl));
18357}
18358
18359static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
18360 const X86Subtarget &Subtarget) {
18361 switch (Opcode) {
18362 case ISD::SINT_TO_FP:
18363 // TODO: Handle wider types with AVX/AVX512.
18364 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
18365 return false;
18366 // CVTDQ2PS or (V)CVTDQ2PD
18367 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
18368
18369 case ISD::UINT_TO_FP:
18370 // TODO: Handle wider types and i64 elements.
18371 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
18372 return false;
18373 // VCVTUDQ2PS or VCVTUDQ2PD
18374 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
18375
18376 default:
18377 return false;
18378 }
18379}
18380
18381/// Given a scalar cast operation that is extracted from a vector, try to
18382/// vectorize the cast op followed by extraction. This will avoid an expensive
18383/// round-trip between XMM and GPR.
18384static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
18385 const X86Subtarget &Subtarget) {
18386 // TODO: This could be enhanced to handle smaller integer types by peeking
18387 // through an extend.
18388 SDValue Extract = Cast.getOperand(0);
18389 MVT DestVT = Cast.getSimpleValueType();
18390 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
18391 !isa<ConstantSDNode>(Extract.getOperand(1)))
18392 return SDValue();
18393
18394 // See if we have a 128-bit vector cast op for this type of cast.
18395 SDValue VecOp = Extract.getOperand(0);
18396 MVT FromVT = VecOp.getSimpleValueType();
18397 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
18398 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
18399 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
18400 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
18401 return SDValue();
18402
18403 // If we are extracting from a non-zero element, first shuffle the source
18404 // vector to allow extracting from element zero.
18405 SDLoc DL(Cast);
18406 if (!isNullConstant(Extract.getOperand(1))) {
18407 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
18408 Mask[0] = Extract.getConstantOperandVal(1);
18409 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
18410 }
18411 // If the source vector is wider than 128-bits, extract the low part. Do not
18412 // create an unnecessarily wide vector cast op.
18413 if (FromVT != Vec128VT)
18414 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
18415
18416 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
18417 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
18418 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
18419 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
18420 DAG.getIntPtrConstant(0, DL));
18421}
18422
18423SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
18424 SelectionDAG &DAG) const {
18425 SDValue Src = Op.getOperand(0);
18426 MVT SrcVT = Src.getSimpleValueType();
18427 MVT VT = Op.getSimpleValueType();
18428 SDLoc dl(Op);
18429
18430 if (VT == MVT::f128)
18431 return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
18432
18433 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
18434 return Extract;
18435
18436 if (SrcVT.isVector()) {
18437 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
18438 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
18439 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
18440 DAG.getUNDEF(SrcVT)));
18441 }
18442 return SDValue();
18443 }
18444
18445 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
18446 "Unknown SINT_TO_FP to lower!");
18447
18448 // These are really Legal; return the operand so the caller accepts it as
18449 // Legal.
18450 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT))
18451 return Op;
18452 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit())
18453 return Op;
18454
18455 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
18456 return V;
18457
18458 SDValue ValueToStore = Op.getOperand(0);
18459 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) &&
18460 !Subtarget.is64Bit())
18461 // Bitcasting to f64 here allows us to do a single 64-bit store from
18462 // an SSE register, avoiding the store forwarding penalty that would come
18463 // with two 32-bit stores.
18464 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
18465
18466 unsigned Size = SrcVT.getSizeInBits()/8;
18467 MachineFunction &MF = DAG.getMachineFunction();
18468 auto PtrVT = getPointerTy(MF.getDataLayout());
18469 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
18470 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
18471 SDValue Chain = DAG.getStore(
18472 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
18473 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
18474 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
18475}
18476
18477SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
18478 SDValue StackSlot,
18479 SelectionDAG &DAG) const {
18480 // Build the FILD
18481 SDLoc DL(Op);
18482 SDVTList Tys;
18483 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
18484 if (useSSE)
18485 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
18486 else
18487 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
18488
18489 unsigned ByteSize = SrcVT.getSizeInBits() / 8;
18490
18491 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
18492 MachineMemOperand *LoadMMO;
18493 if (FI) {
18494 int SSFI = FI->getIndex();
18495 LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
18496 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
18497 MachineMemOperand::MOLoad, ByteSize, ByteSize);
18498 } else {
18499 LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
18500 StackSlot = StackSlot.getOperand(1);
18501 }
18502 SDValue FILDOps[] = {Chain, StackSlot};
18503 SDValue Result =
18504 DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL,
18505 Tys, FILDOps, SrcVT, LoadMMO);
18506
18507 if (useSSE) {
18508 Chain = Result.getValue(1);
18509 SDValue InFlag = Result.getValue(2);
18510
18511 // FIXME: Currently the FST is glued to the FILD_FLAG. This
18512 // shouldn't be necessary except that RFP cannot be live across
18513 // multiple blocks. When stackifier is fixed, they can be uncoupled.
18514 MachineFunction &MF = DAG.getMachineFunction();
18515 unsigned SSFISize = Op.getValueSizeInBits() / 8;
18516 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
18517 auto PtrVT = getPointerTy(MF.getDataLayout());
18518 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
18519 Tys = DAG.getVTList(MVT::Other);
18520 SDValue FSTOps[] = {Chain, Result, StackSlot, InFlag};
18521 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
18522 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
18523 MachineMemOperand::MOStore, SSFISize, SSFISize);
18524
18525 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps,
18526 Op.getValueType(), StoreMMO);
18527 Result = DAG.getLoad(
18528 Op.getValueType(), DL, Chain, StackSlot,
18529 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
18530 }
18531
18532 return Result;
18533}
18534
18535/// Horizontal vector math instructions may be slower than normal math with
18536/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
18537/// implementation, and likely shuffle complexity of the alternate sequence.
18538static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
18539 const X86Subtarget &Subtarget) {
18540 bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
18541 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
18542 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
18543}
18544
18545/// 64-bit unsigned integer to double expansion.
18546static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
18547 const X86Subtarget &Subtarget) {
18548 // This algorithm is not obvious. Here it is what we're trying to output:
18549 /*
18550 movq %rax, %xmm0
18551 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
18552 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
18553 #ifdef __SSE3__
18554 haddpd %xmm0, %xmm0
18555 #else
18556 pshufd $0x4e, %xmm0, %xmm1
18557 addpd %xmm1, %xmm0
18558 #endif
18559 */
18560
18561 SDLoc dl(Op);
18562 LLVMContext *Context = DAG.getContext();
18563
18564 // Build some magic constants.
18565 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
18566 Constant *C0 = ConstantDataVector::get(*Context, CV0);
18567 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
18568 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
18569
18570 SmallVector<Constant*,2> CV1;
18571 CV1.push_back(
18572 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
18573 APInt(64, 0x4330000000000000ULL))));
18574 CV1.push_back(
18575 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
18576 APInt(64, 0x4530000000000000ULL))));
18577 Constant *C1 = ConstantVector::get(CV1);
18578 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
18579
18580 // Load the 64-bit value into an XMM register.
18581 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
18582 Op.getOperand(0));
18583 SDValue CLod0 =
18584 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
18585 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
18586 /* Alignment = */ 16);
18587 SDValue Unpck1 =
18588 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
18589
18590 SDValue CLod1 =
18591 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
18592 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
18593 /* Alignment = */ 16);
18594 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
18595 // TODO: Are there any fast-math-flags to propagate here?
18596 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
18597 SDValue Result;
18598
18599 if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) {
18600 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
18601 } else {
18602 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
18603 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
18604 }
18605
18606 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
18607 DAG.getIntPtrConstant(0, dl));
18608}
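// Illustrative sketch (editorial addition, not part of the original file): a
// minimal scalar model of the magic-constant trick emitted by
// LowerUINT_TO_FP_i64 above, assuming IEEE-754 doubles and a little-endian
// layout; the helper names are hypothetical. Guarded out so it does not
// change this translation unit.
#if 0
#include <cstdint>
#include <cstring>

static double BitsToDoubleModel(uint64_t Bits) {
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D;
}

static double UInt64ToDoubleModel(uint64_t X) {
  // punpckldq with c0: splice the 32-bit halves of X into the mantissas of
  // 2^52 (0x43300000...) and 2^84 (0x45300000...).
  double Lo = BitsToDoubleModel(0x4330000000000000ULL | (X & 0xffffffffULL));
  double Hi = BitsToDoubleModel(0x4530000000000000ULL | (X >> 32));
  // subpd with c1 = { 2^52, 2^84 }, then haddpd the two lanes.
  return (Lo - BitsToDoubleModel(0x4330000000000000ULL)) +
         (Hi - BitsToDoubleModel(0x4530000000000000ULL));
}
#endif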
18609
18610/// 32-bit unsigned integer to float expansion.
18611static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
18612 const X86Subtarget &Subtarget) {
18613 SDLoc dl(Op);
18614 // FP constant to bias correct the final result.
18615 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
18616 MVT::f64);
18617
18618 // Load the 32-bit value into an XMM register.
18619 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
18620 Op.getOperand(0));
18621
18622 // Zero out the upper parts of the register.
18623 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
18624
18625 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
18626 DAG.getBitcast(MVT::v2f64, Load),
18627 DAG.getIntPtrConstant(0, dl));
18628
18629 // Or the load with the bias.
18630 SDValue Or = DAG.getNode(
18631 ISD::OR, dl, MVT::v2i64,
18632 DAG.getBitcast(MVT::v2i64,
18633 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
18634 DAG.getBitcast(MVT::v2i64,
18635 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
18636 Or =
18637 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
18638 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
18639
18640 // Subtract the bias.
18641 // TODO: Are there any fast-math-flags to propagate here?
18642 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
18643
18644 // Handle final rounding.
18645 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
18646}
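// Illustrative sketch (editorial addition): a scalar model of the bias trick
// used by LowerUINT_TO_FP_i32 above -- OR the 32-bit value into the mantissa
// of 2^52 and subtract the bias. Assumes IEEE-754 doubles and a little-endian
// layout; helper names are hypothetical. Guarded out of the build.
#if 0
#include <cstdint>
#include <cstring>

static double UInt32ToDoubleBiasModel(uint32_t X) {
  const uint64_t BiasBits = 0x4330000000000000ULL; // 2^52
  uint64_t Bits = BiasBits | X; // exact: X occupies the low mantissa bits
  double Biased, Bias;
  std::memcpy(&Biased, &Bits, sizeof(Biased));
  std::memcpy(&Bias, &BiasBits, sizeof(Bias));
  return Biased - Bias; // exactly X as a double
}
#endif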
18647
18648static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
18649 const X86Subtarget &Subtarget,
18650 const SDLoc &DL) {
18651 if (Op.getSimpleValueType() != MVT::v2f64)
18652 return SDValue();
18653
18654 SDValue N0 = Op.getOperand(0);
18655 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
18656
18657 // Legalize to v4i32 type.
18658 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
18659 DAG.getUNDEF(MVT::v2i32));
18660
18661 if (Subtarget.hasAVX512())
18662 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
18663
18664 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
18665 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
18666 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
18667 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
18668
18669 // Two to the power of half-word-size.
18670 SDValue TWOHW = DAG.getConstantFP((double)(1 << 16), DL, MVT::v2f64);
18671
18672 // Clear upper part of LO, lower HI.
18673 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
18674 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
18675
18676 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
18677 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
18678 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
18679
18680 // Add the two halves.
18681 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
18682}
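// Illustrative sketch (editorial addition): what lowerUINT_TO_FP_v2i32 above
// computes per lane when AVX-512 is unavailable -- split the u32 into 16-bit
// halves, convert each half with a *signed* conversion (CVTSI2P), scale the
// high half by 2^16 and recombine. Names are hypothetical; guarded out.
#if 0
#include <cstdint>

static double UInt32ToDoubleHalvesModel(uint32_t V) {
  double FHi = static_cast<double>(static_cast<int32_t>(V >> 16)) * 65536.0; // TWOHW
  double FLo = static_cast<double>(static_cast<int32_t>(V & 0xffff));
  return FHi + FLo; // exact: the sum reconstructs V with no rounding
}
#endif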
18683
18684static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
18685 const X86Subtarget &Subtarget) {
18686 // The algorithm is the following:
18687 // #ifdef __SSE4_1__
18688 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
18689 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
18690 // (uint4) 0x53000000, 0xaa);
18691 // #else
18692 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
18693 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
18694 // #endif
18695 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
18696 // return (float4) lo + fhi;
18697
18698 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
18699 // reassociate the two FADDs, and if we do that, the algorithm fails
18700 // spectacularly (PR24512).
18701 // FIXME: If we ever have some kind of Machine FMF, this should be marked
18702 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
18703 // there's also the MachineCombiner reassociations happening on Machine IR.
18704 if (DAG.getTarget().Options.UnsafeFPMath)
18705 return SDValue();
18706
18707 SDLoc DL(Op);
18708 SDValue V = Op->getOperand(0);
18709 MVT VecIntVT = V.getSimpleValueType();
18710 bool Is128 = VecIntVT == MVT::v4i32;
18711 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
18712 // If we convert to something other than the supported type, e.g., to v4f64,
18713 // abort early.
18714 if (VecFloatVT != Op->getSimpleValueType(0))
18715 return SDValue();
18716
18717 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
18718 "Unsupported custom type");
18719
18720 // In the #ifdef/#else code, we have in common:
18721 // - The vector of constants:
18722 // -- 0x4b000000
18723 // -- 0x53000000
18724 // - A shift:
18725 // -- v >> 16
18726
18727 // Create the splat vector for 0x4b000000.
18728 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
18729 // Create the splat vector for 0x53000000.
18730 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
18731
18732 // Create the right shift.
18733 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
18734 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
18735
18736 SDValue Low, High;
18737 if (Subtarget.hasSSE41()) {
18738 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
18739 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
18740 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
18741 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
18742 // Low will be bitcasted right away, so do not bother bitcasting back to its
18743 // original type.
18744 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
18745 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
18746 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
18747 // (uint4) 0x53000000, 0xaa);
18748 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
18749 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
18750 // High will be bitcasted right away, so do not bother bitcasting back to
18751 // its original type.
18752 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
18753 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
18754 } else {
18755 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
18756 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
18757 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
18758 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
18759
18760 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
18761 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
18762 }
18763
18764 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
18765 SDValue VecCstFAdd = DAG.getConstantFP(
18766 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
18767
18768 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
18769 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
18770 // TODO: Are there any fast-math-flags to propagate here?
18771 SDValue FHigh =
18772 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
18773 // return (float4) lo + fhi;
18774 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
18775 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
18776}
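// Illustrative sketch (editorial addition): a scalar model of the u32 -> f32
// algorithm in lowerUINT_TO_FP_vXi32 above, assuming IEEE-754 single precision
// and a little-endian layout; helper names are hypothetical. Guarded out.
#if 0
#include <cstdint>
#include <cstring>

static float BitsToFloatModel(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

static float UInt32ToFloatModel(uint32_t V) {
  float Lo  = BitsToFloatModel((V & 0xffffu) | 0x4b000000u); // 2^23 + (V & 0xffff)
  float Hi  = BitsToFloatModel((V >> 16)     | 0x53000000u); // 2^39 + (V >> 16) * 2^16
  float FHi = Hi + BitsToFloatModel(0xD3000080u);            // subtract (2^39 + 2^23), exact
  return Lo + FHi;                                           // single rounding at the end
}
#endif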
18777
18778static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
18779 const X86Subtarget &Subtarget) {
18780 SDValue N0 = Op.getOperand(0);
18781 MVT SrcVT = N0.getSimpleValueType();
18782 SDLoc dl(Op);
18783
18784 switch (SrcVT.SimpleTy) {
18785 default:
18786 llvm_unreachable("Custom UINT_TO_FP is not supported!");
18787 case MVT::v2i32:
18788 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
18789 case MVT::v4i32:
18790 case MVT::v8i32:
18791 assert(!Subtarget.hasAVX512());
18792 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
18793 }
18794}
18795
18796SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
18797 SelectionDAG &DAG) const {
18798 SDValue N0 = Op.getOperand(0);
18799 SDLoc dl(Op);
18800 auto PtrVT = getPointerTy(DAG.getDataLayout());
18801 MVT SrcVT = N0.getSimpleValueType();
18802 MVT DstVT = Op.getSimpleValueType();
18803
18804 if (DstVT == MVT::f128)
18805 return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT));
18806
18807 if (DstVT.isVector())
18808 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
18809
18810 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
18811 return Extract;
18812
18813 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
18814 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
18815 // Conversions from unsigned i32 to f32/f64 are legal,
18816 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
18817 return Op;
18818 }
18819
18820 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
18821 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
18822 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, N0);
18823 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, N0);
18824 }
18825
18826 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
18827 return V;
18828
18829 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
18830 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
18831 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
18832 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
18833 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
18834 return SDValue();
18835
18836 // Make a 64-bit buffer, and use it to build an FILD.
18837 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
18838 if (SrcVT == MVT::i32) {
18839 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
18840 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
18841 StackSlot, MachinePointerInfo());
18842 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
18843 OffsetSlot, MachinePointerInfo());
18844 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
18845 return Fild;
18846 }
18847
18848 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
18849 SDValue ValueToStore = Op.getOperand(0);
18850 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
18851 // Bitcasting to f64 here allows us to do a single 64-bit store from
18852 // an SSE register, avoiding the store forwarding penalty that would come
18853 // with two 32-bit stores.
18854 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
18855 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
18856 MachinePointerInfo());
18857 // For i64 source, we need to add the appropriate power of 2 if the input
18858 // was negative. This is the same as the optimization in
18859 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
18860 // we must be careful to do the computation in x87 extended precision, not
18861 // in SSE. (The generic code can't know it's OK to do this, or how to.)
18862 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
18863 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
18864 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
18865 MachineMemOperand::MOLoad, 8, 8);
18866
18867 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
18868 SDValue Ops[] = { Store, StackSlot };
18869 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
18870 MVT::i64, MMO);
18871
18872 APInt FF(32, 0x5F800000ULL);
18873
18874 // Check whether the sign bit is set.
18875 SDValue SignSet = DAG.getSetCC(
18876 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
18877 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
18878
18879 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
18880 SDValue FudgePtr = DAG.getConstantPool(
18881 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
18882
18883 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
18884 SDValue Zero = DAG.getIntPtrConstant(0, dl);
18885 SDValue Four = DAG.getIntPtrConstant(4, dl);
18886 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
18887 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
18888
18889 // Load the value out, extending it from f32 to f80.
18890 // FIXME: Avoid the extend by constructing the right constant pool?
18891 SDValue Fudge = DAG.getExtLoad(
18892 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
18893 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
18894 /* Alignment = */ 4);
18895 // Extend everything to 80 bits to force it to be done on x87.
18896 // TODO: Are there any fast-math-flags to propagate here?
18897 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
18898 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
18899 DAG.getIntPtrConstant(0, dl));
18900}
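// Illustrative sketch (editorial addition): the scalar idea behind the FILD +
// fudge-constant path at the end of LowerUINT_TO_FP above -- convert the i64
// as *signed* on the x87 stack, then add 2^64 (the 0x5F800000 constant) when
// the sign bit was set. Assumes an x87-style 80-bit long double; names are
// hypothetical. Guarded out of the build.
#if 0
#include <cstdint>

static long double UInt64ToFP80Model(uint64_t X) {
  long double V = static_cast<long double>(static_cast<int64_t>(X)); // FILD is signed
  if (static_cast<int64_t>(X) < 0)
    V += 18446744073709551616.0L; // 2^64 fudge, selected when the sign bit is set
  return V;                       // FP_ROUND to the destination type happens last
}
#endif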
18901
18902// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
18903// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
18904// just return an SDValue().
18905// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
18906// to i16, i32 or i64, and we lower it to a legal sequence and return the
18907// result.
18908SDValue
18909X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
18910 bool IsSigned) const {
18911 SDLoc DL(Op);
18912
18913 EVT DstTy = Op.getValueType();
18914 EVT TheVT = Op.getOperand(0).getValueType();
18915 auto PtrVT = getPointerTy(DAG.getDataLayout());
18916
18917 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
18918 // f16 must be promoted before using the lowering in this routine.
18919 // fp128 does not use this lowering.
18920 return SDValue();
18921 }
18922
18923 // If using FIST to compute an unsigned i64, we'll need some fixup
18924 // to handle values above the maximum signed i64. A FIST is always
18925 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
18926 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
18927
18928 if (!IsSigned && DstTy != MVT::i64) {
18929 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
18930 // The low 32 bits of the fist result will have the correct uint32 result.
18931 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
18932 DstTy = MVT::i64;
18933 }
18934
18935 assert(DstTy.getSimpleVT() <= MVT::i64 &&
18936 DstTy.getSimpleVT() >= MVT::i16 &&
18937 "Unknown FP_TO_INT to lower!");
18938
18939 // We lower FP->int64 into FISTP64 followed by a load from a temporary
18940 // stack slot.
18941 MachineFunction &MF = DAG.getMachineFunction();
18942 unsigned MemSize = DstTy.getStoreSize();
18943 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
18944 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
18945
18946 SDValue Chain = DAG.getEntryNode();
18947 SDValue Value = Op.getOperand(0);
18948 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
18949
18950 if (UnsignedFixup) {
18951 //
18952 // Conversion to unsigned i64 is implemented with a select,
18953 // depending on whether the source value fits in the range
18954 // of a signed i64. Let Thresh be the FP equivalent of
18955 // 0x8000000000000000ULL.
18956 //
18957 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
18958 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
18959 // Fist-to-mem64 FistSrc
18960 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
18961 // to XOR'ing the high 32 bits with Adjust.
18962 //
18963 // Being a power of 2, Thresh is exactly representable in all FP formats.
18964 // For X87 we'd like to use the smallest FP type for this constant, but
18965 // for DAG type consistency we have to match the FP operand type.
18966
18967 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
18968 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
18969 bool LosesInfo = false;
18970 if (TheVT == MVT::f64)
18971 // The rounding mode is irrelevant as the conversion should be exact.
18972 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
18973 &LosesInfo);
18974 else if (TheVT == MVT::f80)
18975 Status = Thresh.convert(APFloat::x87DoubleExtended(),
18976 APFloat::rmNearestTiesToEven, &LosesInfo);
18977
18978 assert(Status == APFloat::opOK && !LosesInfo &&
18979 "FP conversion should have been exact");
18980
18981 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
18982
18983 SDValue Cmp = DAG.getSetCC(DL,
18984 getSetCCResultType(DAG.getDataLayout(),
18985 *DAG.getContext(), TheVT),
18986 Value, ThreshVal, ISD::SETLT);
18987 Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
18988 DAG.getConstant(0, DL, MVT::i64),
18989 DAG.getConstant(APInt::getSignMask(64),
18990 DL, MVT::i64));
18991 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
18992 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
18993 *DAG.getContext(), TheVT),
18994 Value, ThreshVal, ISD::SETLT);
18995 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
18996 }
18997
18998 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
18999
19000 // FIXME This causes a redundant load/store if the SSE-class value is already
19001 // in memory, such as if it is on the callstack.
19002 if (isScalarFPTypeInSSEReg(TheVT)) {
19003 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
19004 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
19005 SDVTList Tys = DAG.getVTList(TheVT, MVT::Other);
19006 SDValue Ops[] = { Chain, StackSlot };
19007
19008 unsigned FLDSize = TheVT.getStoreSize();
19009 assert(FLDSize <= MemSize && "Stack slot not big enough");
19010 MachineMemOperand *MMO = MF.getMachineMemOperand(
19011 MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize);
19012 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
19013 Chain = Value.getValue(1);
19014 }
19015
19016 // Build the FP_TO_INT*_IN_MEM
19017 MachineMemOperand *MMO = MF.getMachineMemOperand(
19018 MPI, MachineMemOperand::MOStore, MemSize, MemSize);
19019 SDValue Ops[] = { Chain, Value, StackSlot };
19020 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
19021 DAG.getVTList(MVT::Other),
19022 Ops, DstTy, MMO);
19023
19024 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
19025
19026 // If we need an unsigned fixup, XOR the result with adjust.
19027 if (UnsignedFixup)
19028 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
19029
19030 return Res;
19031}
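// Illustrative sketch (editorial addition): a scalar model of the unsigned
// fixup in FP_TO_INTHelper above (Thresh = 2^63 select plus sign-mask XOR),
// for an in-range, non-NaN input. Names are hypothetical; guarded out.
#if 0
#include <cstdint>

static uint64_t FPToUInt64Model(double Value) {
  const double Thresh = 9223372036854775808.0;                    // 2^63, exact in double
  uint64_t Adjust  = (Value < Thresh) ? 0 : 0x8000000000000000ULL;
  double   FistSrc = (Value < Thresh) ? Value : (Value - Thresh); // keep FIST in signed range
  // FIST produces a signed i64; XOR-ing the sign mask back in adds 2^63.
  return static_cast<uint64_t>(static_cast<int64_t>(FistSrc)) ^ Adjust;
}
#endif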
19032
19033static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
19034 const X86Subtarget &Subtarget) {
19035 MVT VT = Op.getSimpleValueType();
19036 SDValue In = Op.getOperand(0);
19037 MVT InVT = In.getSimpleValueType();
19038 SDLoc dl(Op);
19039 unsigned Opc = Op.getOpcode();
19040
19041 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
19042 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
19043 "Unexpected extension opcode");
19044 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
19045 "Expected same number of elements");
19046 assert((VT.getVectorElementType() == MVT::i16 ||
19047 VT.getVectorElementType() == MVT::i32 ||
19048 VT.getVectorElementType() == MVT::i64) &&
19049 "Unexpected element type");
19050 assert((InVT.getVectorElementType() == MVT::i8 ||
19051 InVT.getVectorElementType() == MVT::i16 ||
19052 InVT.getVectorElementType() == MVT::i32) &&
19053 "Unexpected element type");
19054
19055 unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
19056
19057 // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
19058 if (InVT == MVT::v8i8) {
19059 if (VT != MVT::v8i64)
19060 return SDValue();
19061
19062 In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
19063 MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
19064 return DAG.getNode(ExtendInVecOpc, dl, VT, In);
19065 }
19066
19067 if (Subtarget.hasInt256())
19068 return Op;
19069
19070 // Optimize vectors in AVX mode:
19071 //
19072 // v8i16 -> v8i32
19073 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
19074 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
19075 // Concat upper and lower parts.
19076 //
19077 // v4i32 -> v4i64
19078 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
19079 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
19080 // Concat upper and lower parts.
19081 //
19082 MVT HalfVT = VT.getHalfNumVectorElementsVT();
19083 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
19084
19085 // Short-circuit if we can determine that each 128-bit half is the same value.
19086 // Otherwise, this is difficult to match and optimize.
19087 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
19088 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
19089 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
19090
19091 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
19092 SDValue Undef = DAG.getUNDEF(InVT);
19093 bool NeedZero = Opc == ISD::ZERO_EXTEND;
19094 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
19095 OpHi = DAG.getBitcast(HalfVT, OpHi);
19096
19097 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
19098}
19099
19100// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
19101static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
19102 const SDLoc &dl, SelectionDAG &DAG) {
19103 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
19104 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
19105 DAG.getIntPtrConstant(0, dl));
19106 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
19107 DAG.getIntPtrConstant(8, dl));
19108 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
19109 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
19110 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
19111 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19112}
19113
19114static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
19115 const X86Subtarget &Subtarget,
19116 SelectionDAG &DAG) {
19117 MVT VT = Op->getSimpleValueType(0);
19118 SDValue In = Op->getOperand(0);
19119 MVT InVT = In.getSimpleValueType();
19120 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
19121 SDLoc DL(Op);
19122 unsigned NumElts = VT.getVectorNumElements();
19123
19124 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
19125 // avoids a constant pool load.
19126 if (VT.getVectorElementType() != MVT::i8) {
19127 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
19128 return DAG.getNode(ISD::SRL, DL, VT, Extend,
19129 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
19130 }
19131
19132 // Extend VT if BWI is not supported.
19133 MVT ExtVT = VT;
19134 if (!Subtarget.hasBWI()) {
19135 // If v16i32 is to be avoided, we'll need to split and concatenate.
19136 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
19137 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
19138
19139 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
19140 }
19141
19142 // Widen to 512-bits if VLX is not supported.
19143 MVT WideVT = ExtVT;
19144 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
19145 NumElts *= 512 / ExtVT.getSizeInBits();
19146 InVT = MVT::getVectorVT(MVT::i1, NumElts);
19147 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
19148 In, DAG.getIntPtrConstant(0, DL));
19149 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
19150 NumElts);
19151 }
19152
19153 SDValue One = DAG.getConstant(1, DL, WideVT);
19154 SDValue Zero = DAG.getConstant(0, DL, WideVT);
19155
19156 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
19157
19158 // Truncate if we had to extend above.
19159 if (VT != ExtVT) {
19160 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
19161 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
19162 }
19163
19164 // Extract back to 128/256-bit if we widened.
19165 if (WideVT != VT)
19166 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
19167 DAG.getIntPtrConstant(0, DL));
19168
19169 return SelectedVal;
19170}
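// Illustrative sketch (editorial addition): the scalar equivalent of the
// "sign_extend then logical shift" trick used for non-vXi8 mask zero-extension
// in LowerZERO_EXTEND_Mask above. Names are hypothetical; guarded out.
#if 0
#include <cstdint>

static uint32_t MaskZeroExtendModel(bool Bit) {
  uint32_t Sext = Bit ? 0xFFFFFFFFu : 0u; // SIGN_EXTEND of the i1 lane
  return Sext >> 31;                      // SRL by (bits - 1) yields 0 or 1
}
#endif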
19171
19172static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
19173 SelectionDAG &DAG) {
19174 SDValue In = Op.getOperand(0);
19175 MVT SVT = In.getSimpleValueType();
19176
19177 if (SVT.getVectorElementType() == MVT::i1)
19178 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
19179
19180 assert(Subtarget.hasAVX() && "Expected AVX support");
19181 return LowerAVXExtend(Op, DAG, Subtarget);
19182}
19183
19184/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
19185/// It makes use of the fact that vectors with enough leading sign/zero bits
19186/// prevent the PACKSS/PACKUS from saturating the results.
19187/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
19188/// within each 128-bit lane.
19189static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
19190 const SDLoc &DL, SelectionDAG &DAG,
19191 const X86Subtarget &Subtarget) {
19192 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
19193 "Unexpected PACK opcode");
19194 assert(DstVT.isVector() && "VT not a vector?");
19195
19196 // Requires SSE2 but AVX512 has fast vector truncate.
19197 if (!Subtarget.hasSSE2())
19198 return SDValue();
19199
19200 EVT SrcVT = In.getValueType();
19201
19202 // No truncation required, we might get here due to recursive calls.
19203 if (SrcVT == DstVT)
19204 return In;
19205
19206 // We only support vector truncation to 64bits or greater from a
19207 // 128bits or greater source.
19208 unsigned DstSizeInBits = DstVT.getSizeInBits();
19209 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
19210 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
19211 return SDValue();
19212
19213 unsigned NumElems = SrcVT.getVectorNumElements();
19214 if (!isPowerOf2_32(NumElems))
19215 return SDValue();
19216
19217 LLVMContext &Ctx = *DAG.getContext();
19218 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
19219 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
19220
19221 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
19222
19223 // Pack to the largest type possible:
19224 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
19225 EVT InVT = MVT::i16, OutVT = MVT::i8;
19226 if (SrcVT.getScalarSizeInBits() > 16 &&
19227 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
19228 InVT = MVT::i32;
19229 OutVT = MVT::i16;
19230 }
19231
19232 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
19233 if (SrcVT.is128BitVector()) {
19234 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
19235 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
19236 In = DAG.getBitcast(InVT, In);
19237 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
19238 Res = extractSubVector(Res, 0, DAG, DL, 64);
19239 return DAG.getBitcast(DstVT, Res);
19240 }
19241
19242 // Extract lower/upper subvectors.
19243 unsigned NumSubElts = NumElems / 2;
19244 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
19245 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
19246
19247 unsigned SubSizeInBits = SrcSizeInBits / 2;
19248 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
19249 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
19250
19251 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
19252 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
19253 Lo = DAG.getBitcast(InVT, Lo);
19254 Hi = DAG.getBitcast(InVT, Hi);
19255 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
19256 return DAG.getBitcast(DstVT, Res);
19257 }
19258
19259 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
19260 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
19261 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
19262 Lo = DAG.getBitcast(InVT, Lo);
19263 Hi = DAG.getBitcast(InVT, Hi);
19264 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
19265
19266 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
19267 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
19268 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
19269 SmallVector<int, 64> Mask;
19270 int Scale = 64 / OutVT.getScalarSizeInBits();
19271 scaleShuffleMask<int>(Scale, ArrayRef<int>({ 0, 2, 1, 3 }), Mask);
19272 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
19273
19274 if (DstVT.is256BitVector())
19275 return DAG.getBitcast(DstVT, Res);
19276
19277 // If 512bit -> 128bit truncate another stage.
19278 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
19279 Res = DAG.getBitcast(PackedVT, Res);
19280 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
19281 }
19282
19283 // Recursively pack lower/upper subvectors, concat result and pack again.
19284 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
19285 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
19286 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
19287 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
19288
19289 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
19290 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
19291 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
19292}
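// Illustrative sketch (editorial addition): why truncateVectorWithPACK above
// is safe -- per element, PACKSS saturates, but with enough leading sign bits
// the saturation never fires and the result equals a plain truncation. Names
// are hypothetical; guarded out of the build.
#if 0
#include <cstdint>

static int16_t PackssLaneModel(int32_t X) {
  // PACKSSDW behaviour for one lane.
  if (X > INT16_MAX) return INT16_MAX;
  if (X < INT16_MIN) return INT16_MIN;
  return static_cast<int16_t>(X); // reached whenever X is a sign-extended i16
}
#endif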
19293
19294static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
19295 const X86Subtarget &Subtarget) {
19296
19297 SDLoc DL(Op);
19298 MVT VT = Op.getSimpleValueType();
19299 SDValue In = Op.getOperand(0);
19300 MVT InVT = In.getSimpleValueType();
19301
19302 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
19303
19304 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
19305 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
19306 if (InVT.getScalarSizeInBits() <= 16) {
19307 if (Subtarget.hasBWI()) {
19308 // legal, will go to VPMOVB2M, VPMOVW2M
19309 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
19310 // We need to shift to get the lsb into sign position.
19311 // Shifting packed bytes is not supported natively, so bitcast to words.
19312 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
19313 In = DAG.getNode(ISD::SHL, DL, ExtVT,
19314 DAG.getBitcast(ExtVT, In),
19315 DAG.getConstant(ShiftInx, DL, ExtVT));
19316 In = DAG.getBitcast(InVT, In);
19317 }
19318 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
19319 In, ISD::SETGT);
19320 }
19321 // Use TESTD/Q, extended vector to packed dword/qword.
19322 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
19323 "Unexpected vector type.");
19324 unsigned NumElts = InVT.getVectorNumElements();
19325 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
19326 // We need to change to a wider element type that we have support for.
19327 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
19328 // For 16 element vectors we extend to v16i32 unless we are explicitly
19329 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
19330 // we need to split into two 8 element vectors which we can extend to v8i32,
19331 // truncate and concat the results. There's an additional complication if
19332 // the original type is v16i8. In that case we can't split the v16i8 so
19333 // first we pre-extend it to v16i16 which we can split to v8i16, then extend
19334 // to v8i32, truncate that to v8i1 and concat the two halves.
19335 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
19336 if (InVT == MVT::v16i8) {
19337 // First we need to sign extend up to 256-bits so we can split that.
19338 InVT = MVT::v16i16;
19339 In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
19340 }
19341 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
19342 SDValue Hi = extract128BitVector(In, 8, DAG, DL);
19343 // We're split now, just emit two truncates and a concat. The two
19344 // truncates will trigger legalization to come back to this function.
19345 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
19346 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
19347 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
19348 }
19349 // We either have 8 elements or we're allowed to use 512-bit vectors.
19350 // If we have VLX, we want to use the narrowest vector that can get the
19351 // job done so we use vXi32.
19352 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
19353 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
19354 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
19355 InVT = ExtVT;
19356 ShiftInx = InVT.getScalarSizeInBits() - 1;
19357 }
19358
19359 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
19360 // We need to shift to get the lsb into sign position.
19361 In = DAG.getNode(ISD::SHL, DL, InVT, In,
19362 DAG.getConstant(ShiftInx, DL, InVT));
19363 }
19364 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
19365 if (Subtarget.hasDQI())
19366 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
19367 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
19368}
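// Illustrative sketch (editorial addition): the per-lane idea behind
// LowerTruncateVecI1 above -- move the LSB into the sign position and test the
// sign, which is what VPMOV*2M / the SETGT-against-zero pattern consume.
// Names are hypothetical; guarded out of the build.
#if 0
#include <cstdint>

static bool TruncToI1LaneModel(uint16_t Lane) {
  int16_t Shifted = static_cast<int16_t>(Lane << 15); // LSB now in the sign bit
  return Shifted < 0;                                 // i.e. 0 > Shifted, the DAG's SETGT form
}
#endif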
19369
19370SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
19371 SDLoc DL(Op);
19372 MVT VT = Op.getSimpleValueType();
19373 SDValue In = Op.getOperand(0);
19374 MVT InVT = In.getSimpleValueType();
19375 unsigned InNumEltBits = InVT.getScalarSizeInBits();
19376
19377 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
19378 "Invalid TRUNCATE operation");
19379
19380 // If we're called by the type legalizer, handle a few cases.
19381 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19382 if (!TLI.isTypeLegal(InVT)) {
19383 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
19384 VT.is128BitVector()) {
19385 assert(Subtarget.hasVLX() && "Unexpected subtarget!");
19386 // The default behavior is to truncate one step, concatenate, and then
19387 // truncate the remainder. We'd rather produce two 64-bit results and
19388 // concatenate those.
19389 SDValue Lo, Hi;
19390 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
19391
19392 EVT LoVT, HiVT;
19393 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
19394
19395 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
19396 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
19397 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
19398 }
19399
19400 // Otherwise let default legalization handle it.
19401 return SDValue();
19402 }
19403
19404 if (VT.getVectorElementType() == MVT::i1)
19405 return LowerTruncateVecI1(Op, DAG, Subtarget);
19406
19407 // vpmovqb/w/d, vpmovdb/w, vpmovwb
19408 if (Subtarget.hasAVX512()) {
19409 // Word to byte truncation is only supported with BWI. Otherwise we have to promote to v16i32
19410 // and then truncate that. But we should only do that if we haven't been
19411 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
19412 // handled by isel patterns.
19413 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
19414 Subtarget.canExtendTo512DQ())
19415 return Op;
19416 }
19417
19418 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
19419 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
19420
19421 // Truncate with PACKUS if we are truncating a vector with leading zero bits
19422 // that extend all the way to the packed/truncated value.
19423 // Pre-SSE41 we can only use PACKUSWB.
19424 KnownBits Known = DAG.computeKnownBits(In);
19425 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
19426 if (SDValue V =
19427 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
19428 return V;
19429
19430 // Truncate with PACKSS if we are truncating a vector with sign-bits that
19431 // extend all the way to the packed/truncated value.
19432 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
19433 if (SDValue V =
19434 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
19435 return V;
19436
19437 // Handle truncation of V256 to V128 using shuffles.
19438 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
19439
19440 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
19441 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
19442 if (Subtarget.hasInt256()) {
19443 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
19444 In = DAG.getBitcast(MVT::v8i32, In);
19445 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
19446 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
19447 DAG.getIntPtrConstant(0, DL));
19448 }
19449
19450 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
19451 DAG.getIntPtrConstant(0, DL));
19452 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
19453 DAG.getIntPtrConstant(2, DL));
19454 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
19455 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
19456 static const int ShufMask[] = {0, 2, 4, 6};
19457 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
19458 }
19459
19460 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
19461 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
19462 if (Subtarget.hasInt256()) {
19463 In = DAG.getBitcast(MVT::v32i8, In);
19464
19465 // The PSHUFB mask:
19466 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
19467 -1, -1, -1, -1, -1, -1, -1, -1,
19468 16, 17, 20, 21, 24, 25, 28, 29,
19469 -1, -1, -1, -1, -1, -1, -1, -1 };
19470 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
19471 In = DAG.getBitcast(MVT::v4i64, In);
19472
19473 static const int ShufMask2[] = {0, 2, -1, -1};
19474 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
19475 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
19476 DAG.getIntPtrConstant(0, DL));
19477 return DAG.getBitcast(VT, In);
19478 }
19479
19480 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
19481 DAG.getIntPtrConstant(0, DL));
19482
19483 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
19484 DAG.getIntPtrConstant(4, DL));
19485
19486 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
19487 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
19488
19489 // The PSHUFB mask:
19490 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
19491 -1, -1, -1, -1, -1, -1, -1, -1};
19492
19493 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
19494 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
19495
19496 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
19497 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
19498
19499 // The MOVLHPS Mask:
19500 static const int ShufMask2[] = {0, 1, 4, 5};
19501 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
19502 return DAG.getBitcast(MVT::v8i16, res);
19503 }
19504
19505 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
19506 // Use an AND to zero the upper bits for PACKUS.
19507 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
19508
19509 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
19510 DAG.getIntPtrConstant(0, DL));
19511 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
19512 DAG.getIntPtrConstant(8, DL));
19513 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
19514 }
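// Editorial sketch (not part of the original file): the v16i16 -> v16i8 case
// above works because PACKUSWB saturates to 0..255, so masking each 16-bit
// lane with 255 first turns the pack into a plain truncation. Assuming SSE2
// (helper name is hypothetical):
#include <emmintrin.h>

static __m128i trunc_v16i16_to_v16i8(__m128i Lo, __m128i Hi) {
  const __m128i ByteMask = _mm_set1_epi16(0x00FF); // zero the upper byte of each lane
  return _mm_packus_epi16(_mm_and_si128(Lo, ByteMask),
                          _mm_and_si128(Hi, ByteMask));
}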
19515
19516 llvm_unreachable("All 256->128 cases should have been handled above!")::llvm::llvm_unreachable_internal("All 256->128 cases should have been handled above!"
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 19516)
;
19517}
19518
19519SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
19520 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
19521 MVT VT = Op.getSimpleValueType();
19522 SDValue Src = Op.getOperand(0);
19523 MVT SrcVT = Src.getSimpleValueType();
19524 SDLoc dl(Op);
19525
19526 if (SrcVT == MVT::f128) {
19527 RTLIB::Libcall LC;
19528 if (Op.getOpcode() == ISD::FP_TO_SINT)
19529 LC = RTLIB::getFPTOSINT(SrcVT, VT);
19530 else
19531 LC = RTLIB::getFPTOUINT(SrcVT, VT);
19532
19533 MakeLibCallOptions CallOptions;
19534 return makeLibCall(DAG, LC, VT, Src, CallOptions, SDLoc(Op)).first;
19535 }
19536
19537 if (VT.isVector()) {
19538 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
19539 MVT ResVT = MVT::v4i32;
19540 MVT TruncVT = MVT::v4i1;
19541 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
19542 if (!IsSigned && !Subtarget.hasVLX()) {
19543 // Widen to 512-bits.
19544 ResVT = MVT::v8i32;
19545 TruncVT = MVT::v8i1;
19546 Opc = ISD::FP_TO_UINT;
19547 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
19548 DAG.getUNDEF(MVT::v8f64),
19549 Src, DAG.getIntPtrConstant(0, dl));
19550 }
19551 SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
19552 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
19553 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
19554 DAG.getIntPtrConstant(0, dl));
19555 }
19556
19557 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
19558 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
19559 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
19560 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
19561 DAG.getUNDEF(MVT::v2f32)));
19562 }
19563
19564 return SDValue();
19565 }
19566
19567 assert(!VT.isVector());
19568
19569 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
19570
19571 if (!IsSigned && UseSSEReg) {
19572 // Conversions from f32/f64 with AVX512 should be legal.
19573 if (Subtarget.hasAVX512())
19574 return Op;
19575
19576 // Use default expansion for i64.
19577 if (VT == MVT::i64)
19578 return SDValue();
19579
19580 assert(VT == MVT::i32 && "Unexpected VT!");
19581
19582 // Promote i32 to i64 and use a signed operation on 64-bit targets.
19583 if (Subtarget.is64Bit()) {
19584 SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
19585 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19586 }
19587
19588 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
19589 // use fisttp which will be handled later.
19590 if (!Subtarget.hasSSE3())
19591 return SDValue();
19592 }
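// Editorial sketch (not part of the original file): the 64-bit promotion above
// is valid because every u32 value lies in the non-negative range of i64, so a
// signed f64 -> i64 convert followed by a truncate produces the u32 result.
// Scalar illustration for in-range inputs (out-of-range FP-to-int is UB in C++):
static unsigned FPToUI32ViaSI64(double D) {
  return static_cast<unsigned>(static_cast<long long>(D));
}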
19593
19594 // Promote i16 to i32 if we can use a SSE operation.
19595 if (VT == MVT::i16 && UseSSEReg) {
19596 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
19597 SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
19598 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19599 }
19600
19601 // If this is a FP_TO_SINT using SSEReg we're done.
19602 if (UseSSEReg && IsSigned)
19603 return Op;
19604
19605 // Fall back to X87.
19606 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned))
19607 return V;
19608
19609 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.")::llvm::llvm_unreachable_internal("Expected FP_TO_INTHelper to handle all remaining cases."
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 19609)
;
19610}
19611
19612SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
19613 SDLoc DL(Op);
19614 MVT VT = Op.getSimpleValueType();
19615 SDValue In = Op.getOperand(0);
19616 MVT SVT = In.getSimpleValueType();
19617
19618 if (VT == MVT::f128) {
19619 RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
19620 return LowerF128Call(Op, DAG, LC);
19621 }
19622
19623 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
19624
19625 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
19626 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
19627 In, DAG.getUNDEF(SVT)));
19628}
19629
19630SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
19631 MVT VT = Op.getSimpleValueType();
19632 SDValue In = Op.getOperand(0);
19633 MVT SVT = In.getSimpleValueType();
19634
19635 // It's legal except when f128 is involved
19636 if (SVT != MVT::f128)
19637 return Op;
19638
19639 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, VT);
19640
19641 // FP_ROUND node has a second operand indicating whether it is known to be
19642 // precise. That doesn't take part in the LibCall so we can't directly use
19643 // LowerF128Call.
19644 MakeLibCallOptions CallOptions;
19645 return makeLibCall(DAG, LC, VT, In, CallOptions, SDLoc(Op)).first;
19646}
19647
19648// FIXME: This is a hack to allow FP_ROUND to be marked Custom without breaking
19649// the default expansion of STRICT_FP_ROUND.
19650static SDValue LowerSTRICT_FP_ROUND(SDValue Op, SelectionDAG &DAG) {
19651 // FIXME: Need to form a libcall with an input chain for f128.
19652 assert(Op.getOperand(0).getValueType() != MVT::f128 &&
19653        "Don't know how to handle f128 yet!");
19654 return Op;
19655}
19656
19657/// Depending on uarch and/or optimizing for size, we might prefer to use a
19658/// vector operation in place of the typical scalar operation.
19659static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
19660 const X86Subtarget &Subtarget) {
19661 // If both operands have other uses, this is probably not profitable.
19662 SDValue LHS = Op.getOperand(0);
19663 SDValue RHS = Op.getOperand(1);
19664 if (!LHS.hasOneUse() && !RHS.hasOneUse())
19665 return Op;
19666
19667 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
19668 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
19669 if (IsFP && !Subtarget.hasSSE3())
19670 return Op;
19671 if (!IsFP && !Subtarget.hasSSSE3())
19672 return Op;
19673
19674 // Extract from a common vector.
19675 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19676 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19677 LHS.getOperand(0) != RHS.getOperand(0) ||
19678 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
19679 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
19680 !shouldUseHorizontalOp(true, DAG, Subtarget))
19681 return Op;
19682
19683 // Allow commuted 'hadd' ops.
19684 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
19685 unsigned HOpcode;
19686 switch (Op.getOpcode()) {
19687 case ISD::ADD: HOpcode = X86ISD::HADD; break;
19688 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
19689 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
19690 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
19691 default:
19692 llvm_unreachable("Trying to lower unsupported opcode to horizontal op")::llvm::llvm_unreachable_internal("Trying to lower unsupported opcode to horizontal op"
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 19692)
;
19693 }
19694 unsigned LExtIndex = LHS.getConstantOperandVal(1);
19695 unsigned RExtIndex = RHS.getConstantOperandVal(1);
19696 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
19697 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
19698 std::swap(LExtIndex, RExtIndex);
19699
19700 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
19701 return Op;
19702
19703 SDValue X = LHS.getOperand(0);
19704 EVT VecVT = X.getValueType();
19705 unsigned BitWidth = VecVT.getSizeInBits();
19706 unsigned NumLanes = BitWidth / 128;
19707 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
19708 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
19709        "Not expecting illegal vector widths here");
19710
19711 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
19712 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
19713 SDLoc DL(Op);
19714 if (BitWidth == 256 || BitWidth == 512) {
19715 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
19716 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
19717 LExtIndex %= NumEltsPerLane;
19718 }
19719
19720 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
19721 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
19722 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
19723 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
19724 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
19725 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
19726 DAG.getIntPtrConstant(LExtIndex / 2, DL));
19727}
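// Editorial sketch (not part of the original file): the rewrite above replaces
// "extract lane 0 + extract lane 1" with one horizontal add and a single
// extract. With SSE3 intrinsics that pattern is roughly (helper name is
// hypothetical):
#include <pmmintrin.h>

static float SumFirstTwoLanes(__m128 X) {
  __m128 H = _mm_hadd_ps(X, X); // lane 0 of H is X[0] + X[1]
  return _mm_cvtss_f32(H);      // extractelt (fhadd X, X), 0
}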
19728
19729/// Depending on uarch and/or optimizing for size, we might prefer to use a
19730/// vector operation in place of the typical scalar operation.
19731SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
19732 if (Op.getValueType() == MVT::f128) {
19733 RTLIB::Libcall LC = Op.getOpcode() == ISD::FADD ? RTLIB::ADD_F128
19734 : RTLIB::SUB_F128;
19735 return LowerF128Call(Op, DAG, LC);
19736 }
19737
19738 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
19739        "Only expecting float/double");
19740 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
19741}
19742
19743/// The only differences between FABS and FNEG are the mask and the logic op.
19744/// FNEG also has a folding opportunity for FNEG(FABS(x)).
19745static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
19746 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
19747        "Wrong opcode for lowering FABS or FNEG.");
19748
19749 bool IsFABS = (Op.getOpcode() == ISD::FABS);
19750
19751 // If this is a FABS and it has an FNEG user, bail out to fold the combination
19752 // into an FNABS. We'll lower the FABS after that if it is still in use.
19753 if (IsFABS)
19754 for (SDNode *User : Op->uses())
19755 if (User->getOpcode() == ISD::FNEG)
19756 return Op;
19757
19758 SDLoc dl(Op);
19759 MVT VT = Op.getSimpleValueType();
19760
19761 bool IsF128 = (VT == MVT::f128);
19762 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
19763         VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
19764         VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
19765        "Unexpected type in LowerFABSorFNEG");
19766
19767 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
19768 // decide if we should generate a 16-byte constant mask when we only need 4 or
19769 // 8 bytes for the scalar case.
19770
19771 // There are no scalar bitwise logical SSE/AVX instructions, so we
19772 // generate a 16-byte vector constant and logic op even for the scalar case.
19773 // Using a 16-byte mask allows folding the load of the mask with
19774 // the logic op, so it can save (~4 bytes) on code size.
19775 bool IsFakeVector = !VT.isVector() && !IsF128;
19776 MVT LogicVT = VT;
19777 if (IsFakeVector)
19778 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
19779
19780 unsigned EltBits = VT.getScalarSizeInBits();
19781 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
19782 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
19783 APInt::getSignMask(EltBits);
19784 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
19785 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
19786
19787 SDValue Op0 = Op.getOperand(0);
19788 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
19789 unsigned LogicOp = IsFABS ? X86ISD::FAND :
19790 IsFNABS ? X86ISD::FOR :
19791 X86ISD::FXOR;
19792 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
19793
19794 if (VT.isVector() || IsF128)
19795 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
19796
19797 // For the scalar case extend to a 128-bit vector, perform the logic op,
19798 // and extract the scalar result back out.
19799 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
19800 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
19801 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
19802 DAG.getIntPtrConstant(0, dl));
19803}
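// Editorial sketch (not part of the original file): the lowering above is the
// usual bit trick -- FABS clears the sign bit with an AND against 0x7FF...F and
// FNEG flips it with an XOR against 0x800...0 -- performed on a full XMM
// register because SSE has no scalar FP logic instructions. Scalar equivalent
// of the FABS mask (helper name is hypothetical):
#include <cstdint>
#include <cstring>

static float FAbsViaMask(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits &= 0x7FFFFFFFu; // keep everything but the sign bit
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}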
19804
19805static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
19806 SDValue Mag = Op.getOperand(0);
19807 SDValue Sign = Op.getOperand(1);
19808 SDLoc dl(Op);
19809
19810 // If the sign operand is smaller, extend it first.
19811 MVT VT = Op.getSimpleValueType();
19812 if (Sign.getSimpleValueType().bitsLT(VT))
19813 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
19814
19815 // And if it is bigger, shrink it first.
19816 if (Sign.getSimpleValueType().bitsGT(VT))
19817 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
19818
19819 // At this point the operands and the result should have the same
19820 // type, and that won't be f80 since that is not custom lowered.
19821 bool IsF128 = (VT == MVT::f128);
19822 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
19823         VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
19824         VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
19825        "Unexpected type in LowerFCOPYSIGN");
19826
19827 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
19828
19829 // Perform all scalar logic operations as 16-byte vectors because there are no
19830 // scalar FP logic instructions in SSE.
19831 // TODO: This isn't necessary. If we used scalar types, we might avoid some
19832 // unnecessary splats, but we might miss load folding opportunities. Should
19833 // this decision be based on OptimizeForSize?
19834 bool IsFakeVector = !VT.isVector() && !IsF128;
19835 MVT LogicVT = VT;
19836 if (IsFakeVector)
19837 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
19838
19839 // The mask constants are automatically splatted for vector types.
19840 unsigned EltSizeInBits = VT.getScalarSizeInBits();
19841 SDValue SignMask = DAG.getConstantFP(
19842 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
19843 SDValue MagMask = DAG.getConstantFP(
19844 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
19845
19846 // First, clear all bits but the sign bit from the second operand (sign).
19847 if (IsFakeVector)
19848 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
19849 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
19850
19851 // Next, clear the sign bit from the first operand (magnitude).
19852 // TODO: If we had general constant folding for FP logic ops, this check
19853 // wouldn't be necessary.
19854 SDValue MagBits;
19855 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
19856 APFloat APF = Op0CN->getValueAPF();
19857 APF.clearSign();
19858 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
19859 } else {
19860 // If the magnitude operand wasn't a constant, we need to AND out the sign.
19861 if (IsFakeVector)
19862 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
19863 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
19864 }
19865
19866 // OR the magnitude value with the sign bit.
19867 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
19868 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
19869 DAG.getIntPtrConstant(0, dl));
19870}
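// Editorial sketch (not part of the original file): LowerFCOPYSIGN above
// implements the bitwise identity copysign(mag, sign) =
// (mag & ~SIGNMASK) | (sign & SIGNMASK), with both masks materialized as
// splatted FP constants. Scalar equivalent (helper name is hypothetical):
#include <cstdint>
#include <cstring>

static double CopySignViaMasks(double Mag, double Sign) {
  uint64_t M, S;
  std::memcpy(&M, &Mag, sizeof(M));
  std::memcpy(&S, &Sign, sizeof(S));
  const uint64_t SignMask = 1ULL << 63;
  uint64_t R = (M & ~SignMask) | (S & SignMask);
  std::memcpy(&Mag, &R, sizeof(R));
  return Mag;
}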
19871
19872static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
19873 SDValue N0 = Op.getOperand(0);
19874 SDLoc dl(Op);
19875 MVT VT = Op.getSimpleValueType();
19876
19877 MVT OpVT = N0.getSimpleValueType();
19878 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
19879        "Unexpected type for FGETSIGN");
19880
19881 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
19882 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
19883 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
19884 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
19885 Res = DAG.getZExtOrTrunc(Res, dl, VT);
19886 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
19887 return Res;
19888}
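// Editorial sketch (not part of the original file): FGETSIGN above becomes a
// MOVMSK, which packs the sign bit of every lane into a GPR, masked down to
// bit 0. Assuming SSE2 (helper name is hypothetical):
#include <emmintrin.h>

static int FGetSign(double X) {
  return _mm_movemask_pd(_mm_set_sd(X)) & 1; // (AND (MOVMSK ...) 1)
}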
19889
19890/// Helper for creating a X86ISD::SETCC node.
19891static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
19892 SelectionDAG &DAG) {
19893 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
19894 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
19895}
19896
19897/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
19898/// style scalarized (associative) reduction patterns.
19899static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
19900 SmallVectorImpl<SDValue> &SrcOps) {
19901 SmallVector<SDValue, 8> Opnds;
19902 DenseMap<SDValue, APInt> SrcOpMap;
19903 EVT VT = MVT::Other;
19904
19905 // Recognize a special case where a vector is cast into a wide integer to
19906 // test for all 0s.
19907 assert(Op.getOpcode() == unsigned(BinOp) &&
19908        "Unexpected bit reduction opcode");
19909 Opnds.push_back(Op.getOperand(0));
19910 Opnds.push_back(Op.getOperand(1));
19911
19912 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
19913 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
19914 // BFS traverse all BinOp operands.
19915 if (I->getOpcode() == unsigned(BinOp)) {
19916 Opnds.push_back(I->getOperand(0));
19917 Opnds.push_back(I->getOperand(1));
19918 // Re-evaluate the number of nodes to be traversed.
19919 e += 2; // 2 more nodes (LHS and RHS) are pushed.
19920 continue;
19921 }
19922
19923 // Quit if this is not an EXTRACT_VECTOR_ELT.
19924 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
19925 return false;
19926
19927 // Quit if the index is not a constant.
19928 SDValue Idx = I->getOperand(1);
19929 if (!isa<ConstantSDNode>(Idx))
19930 return false;
19931
19932 SDValue Src = I->getOperand(0);
19933 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
19934 if (M == SrcOpMap.end()) {
19935 VT = Src.getValueType();
19936 // Quit if not the same type.
19937 if (SrcOpMap.begin() != SrcOpMap.end() &&
19938 VT != SrcOpMap.begin()->first.getValueType())
19939 return false;
19940 unsigned NumElts = VT.getVectorNumElements();
19941 APInt EltCount = APInt::getNullValue(NumElts);
19942 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
19943 SrcOps.push_back(Src);
19944 }
19945 // Quit if element already used.
19946 unsigned CIdx = cast<ConstantSDNode>(Idx)->getZExtValue();
19947 if (M->second[CIdx])
19948 return false;
19949 M->second.setBit(CIdx);
19950 }
19951
19952 // Quit if not all elements are used.
19953 for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
19954 E = SrcOpMap.end();
19955 I != E; ++I) {
19956 if (!I->second.isAllOnesValue())
19957 return false;
19958 }
19959
19960 return true;
19961}
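// Editorial sketch (not part of the original file): the pattern matched above
// is a fully scalarized OR reduction, i.e. the DAG equivalent of something
// like this scalar code, where every element of the source vector must be
// extracted exactly once:
static unsigned OrReduce4(const unsigned (&V)[4]) {
  return V[0] | V[1] | V[2] | V[3]; // OR(EXTRACT(X,0), OR(EXTRACT(X,1), ...))
}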
19962
19963// Check whether an OR'd tree is PTEST-able.
19964static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
19965 const X86Subtarget &Subtarget,
19966 SelectionDAG &DAG, SDValue &X86CC) {
19967 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
19968
19969 if (!Subtarget.hasSSE41() || !Op->hasOneUse())
19970 return SDValue();
19971
19972 SmallVector<SDValue, 8> VecIns;
19973 if (!matchScalarReduction(Op, ISD::OR, VecIns))
19974 return SDValue();
19975
19976 // Quit if not 128/256-bit vector.
19977 EVT VT = VecIns[0].getValueType();
19978 if (!VT.is128BitVector() && !VT.is256BitVector())
19979 return SDValue();
19980
19981 SDLoc DL(Op);
19982 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
19983
19984 // Cast all vectors into TestVT for PTEST.
19985 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
19986 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
19987
19988 // If more than one full vector is evaluated, OR them first before PTEST.
19989 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
19990 // Each iteration will OR 2 nodes and append the result until there is only
19991 // 1 node left, i.e. the final OR'd value of all vectors.
19992 SDValue LHS = VecIns[Slot];
19993 SDValue RHS = VecIns[Slot + 1];
19994 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
19995 }
19996
19997 X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE,
19998 DL, MVT::i8);
19999 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
20000}
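// Editorial sketch (not part of the original file): the PTEST emitted above
// sets ZF exactly when the AND of its operands is all zero, so comparing an
// OR-reduction against zero collapses to one instruction. Assuming SSE4.1
// (helper name is hypothetical):
#include <smmintrin.h>

static bool AllBitsZero(__m128i A, __m128i B) {
  __m128i Ored = _mm_or_si128(A, B);       // OR the full vectors first
  return _mm_testz_si128(Ored, Ored) != 0; // PTEST; ZF set => every bit is 0
}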
20001
20002/// Return true if \c Op has a use that doesn't just read flags.
20003static bool hasNonFlagsUse(SDValue Op) {
20004 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
20005 ++UI) {
20006 SDNode *User = *UI;
20007 unsigned UOpNo = UI.getOperandNo();
20008 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
20009 // Look past the truncate.
20010 UOpNo = User->use_begin().getOperandNo();
20011 User = *User->use_begin();
20012 }
20013
20014 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
20015 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
20016 return true;
20017 }
20018 return false;
20019}
20020
20021/// Emit nodes that will be selected as "test Op0,Op0", or something
20022/// equivalent.
20023static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
20024 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
20025 // CF and OF aren't always set the way we want. Determine which
20026 // of these we need.
20027 bool NeedCF = false;
20028 bool NeedOF = false;
20029 switch (X86CC) {
20030 default: break;
20031 case X86::COND_A: case X86::COND_AE:
20032 case X86::COND_B: case X86::COND_BE:
20033 NeedCF = true;
20034 break;
20035 case X86::COND_G: case X86::COND_GE:
20036 case X86::COND_L: case X86::COND_LE:
20037 case X86::COND_O: case X86::COND_NO: {
20038 // Check if we really need to set the
20039 // Overflow flag. If NoSignedWrap is present
20040 // that is not actually needed.
20041 switch (Op->getOpcode()) {
20042 case ISD::ADD:
20043 case ISD::SUB:
20044 case ISD::MUL:
20045 case ISD::SHL:
20046 if (Op.getNode()->getFlags().hasNoSignedWrap())
20047 break;
20048 LLVM_FALLTHROUGH;
20049 default:
20050 NeedOF = true;
20051 break;
20052 }
20053 break;
20054 }
20055 }
20056 // See if we can use the EFLAGS value from the operand instead of
20057 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
20058 // we prove that the arithmetic won't overflow, we can't use OF or CF.
20059 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
20060 // Emit a CMP with 0, which is the TEST pattern.
20061 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
20062 DAG.getConstant(0, dl, Op.getValueType()));
20063 }
20064 unsigned Opcode = 0;
20065 unsigned NumOperands = 0;
20066
20067 SDValue ArithOp = Op;
20068
20069 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
20070 // which may be the result of a CAST. We use the variable 'Op', which is the
20071 // non-casted variable when we check for possible users.
20072 switch (ArithOp.getOpcode()) {
20073 case ISD::AND:
20074 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
20075 // because a TEST instruction will be better.
20076 if (!hasNonFlagsUse(Op))
20077 break;
20078
20079 LLVM_FALLTHROUGH;
20080 case ISD::ADD:
20081 case ISD::SUB:
20082 case ISD::OR:
20083 case ISD::XOR:
20084 // Transform to an x86-specific ALU node with flags if there is a chance of
20085 // using an RMW op or only the flags are used. Otherwise, leave
20086 // the node alone and emit a 'test' instruction.
20087 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
20088 UE = Op.getNode()->use_end(); UI != UE; ++UI)
20089 if (UI->getOpcode() != ISD::CopyToReg &&
20090 UI->getOpcode() != ISD::SETCC &&
20091 UI->getOpcode() != ISD::STORE)
20092 goto default_case;
20093
20094 // Otherwise use a regular EFLAGS-setting instruction.
20095 switch (ArithOp.getOpcode()) {
20096 default: llvm_unreachable("unexpected operator!");
20097 case ISD::ADD: Opcode = X86ISD::ADD; break;
20098 case ISD::SUB: Opcode = X86ISD::SUB; break;
20099 case ISD::XOR: Opcode = X86ISD::XOR; break;
20100 case ISD::AND: Opcode = X86ISD::AND; break;
20101 case ISD::OR: Opcode = X86ISD::OR; break;
20102 }
20103
20104 NumOperands = 2;
20105 break;
20106 case X86ISD::ADD:
20107 case X86ISD::SUB:
20108 case X86ISD::OR:
20109 case X86ISD::XOR:
20110 case X86ISD::AND:
20111 return SDValue(Op.getNode(), 1);
20112 case ISD::SSUBO:
20113 case ISD::USUBO: {
20114 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
20115 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20116 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
20117 Op->getOperand(1)).getValue(1);
20118 }
20119 default:
20120 default_case:
20121 break;
20122 }
20123
20124 if (Opcode == 0) {
20125 // Emit a CMP with 0, which is the TEST pattern.
20126 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
20127 DAG.getConstant(0, dl, Op.getValueType()));
20128 }
20129 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20130 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
20131
20132 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
20133 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
20134 return SDValue(New.getNode(), 1);
20135}
20136
20137/// Emit nodes that will be selected as "cmp Op0,Op1", or something
20138/// equivalent.
20139SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
20140 const SDLoc &dl, SelectionDAG &DAG) const {
20141 if (isNullConstant(Op1))
20142 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
20143
20144 EVT CmpVT = Op0.getValueType();
20145
20146 if (CmpVT.isFloatingPoint())
20147 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
20148
20149 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
20150         CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
20151
20152 // Only promote the compare up to i32 if it is a 16-bit operation
20153 // with an immediate. 16-bit immediates are to be avoided.
20154 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
20155 !DAG.getMachineFunction().getFunction().hasMinSize()) {
20156 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
20157 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
20158 // Don't do this if the immediate can fit in 8-bits.
20159 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
20160 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
20161 unsigned ExtendOp =
20162 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
20163 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
20164 // For equality comparisons try to use SIGN_EXTEND if the input was
20165 // truncate from something with enough sign bits.
20166 if (Op0.getOpcode() == ISD::TRUNCATE) {
20167 SDValue In = Op0.getOperand(0);
20168 unsigned EffBits =
20169 In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
20170 if (EffBits <= 16)
20171 ExtendOp = ISD::SIGN_EXTEND;
20172 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
20173 SDValue In = Op1.getOperand(0);
20174 unsigned EffBits =
20175 In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
20176 if (EffBits <= 16)
20177 ExtendOp = ISD::SIGN_EXTEND;
20178 }
20179 }
20180
20181 CmpVT = MVT::i32;
20182 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
20183 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
20184 }
20185 }
20186
20187 // Try to shrink i64 compares if the input has enough zero bits.
20188 // FIXME: Do this for non-constant compares for constant on LHS?
20189 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
20190 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
20191 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
20192 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
20193 CmpVT = MVT::i32;
20194 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
20195 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
20196 }
20197
20198 // Use SUB instead of CMP to enable CSE between SUB and CMP.
20199 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
20200 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
20201 return Sub.getValue(1);
20202}
20203
20204/// Convert a comparison if required by the subtarget.
20205SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
20206 SelectionDAG &DAG) const {
20207 // If the subtarget does not support the FUCOMI instruction, floating-point
20208 // comparisons have to be converted.
20209 if (Subtarget.hasCMov() ||
20210 Cmp.getOpcode() != X86ISD::CMP ||
20211 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
20212 !Cmp.getOperand(1).getValueType().isFloatingPoint())
20213 return Cmp;
20214
20215 // The instruction selector will select an FUCOM instruction instead of
20216 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
20217 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
20218 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
20219 SDLoc dl(Cmp);
20220 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
20221 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
20222 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
20223 DAG.getConstant(8, dl, MVT::i8));
20224 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
20225
20226 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
20227 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
20228 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
20229}
20230
20231/// Check if replacement of SQRT with RSQRT should be disabled.
20232bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
20233 EVT VT = Op.getValueType();
20234
20235 // We never want to use both SQRT and RSQRT instructions for the same input.
20236 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
20237 return false;
20238
20239 if (VT.isVector())
20240 return Subtarget.hasFastVectorFSQRT();
20241 return Subtarget.hasFastScalarFSQRT();
20242}
20243
20244/// The minimum architected relative accuracy is 2^-12. We need one
20245/// Newton-Raphson step to have a good float result (24 bits of precision).
20246SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
20247 SelectionDAG &DAG, int Enabled,
20248 int &RefinementSteps,
20249 bool &UseOneConstNR,
20250 bool Reciprocal) const {
20251 EVT VT = Op.getValueType();
20252
20253 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
20254 // It is likely not profitable to do this for f64 because a double-precision
20255 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
20256 // instructions: convert to single, rsqrtss, convert back to double, refine
20257 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
20258 // along with FMA, this could be a throughput win.
20259 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
20260 // after legalize types.
20261 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
20262 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
20263 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
20264 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
20265 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
20266 if (RefinementSteps == ReciprocalEstimate::Unspecified)
20267 RefinementSteps = 1;
20268
20269 UseOneConstNR = false;
20270 // There is no FRSQRT for 512-bit vectors, but there is RSQRT14.
20271 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
20272 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
20273 }
20274 return SDValue();
20275}
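// Editorial sketch (not part of the original file): the estimate returned
// above is refined elsewhere with one Newton-Raphson step; for y ~= 1/sqrt(a)
// that step is y' = y * (1.5 - 0.5 * a * y * y), which takes the ~12-bit
// hardware estimate to roughly the 24 bits a float needs:
static float RefineRsqrt(float A, float Y) {
  return Y * (1.5f - 0.5f * A * Y * Y);
}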
20276
20277/// The minimum architected relative accuracy is 2^-12. We need one
20278/// Newton-Raphson step to have a good float result (24 bits of precision).
20279SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
20280 int Enabled,
20281 int &RefinementSteps) const {
20282 EVT VT = Op.getValueType();
20283
20284 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
20285 // It is likely not profitable to do this for f64 because a double-precision
20286 // reciprocal estimate with refinement on x86 prior to FMA requires
20287 // 15 instructions: convert to single, rcpss, convert back to double, refine
20288 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
20289 // along with FMA, this could be a throughput win.
20290
20291 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
20292 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
20293 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
20294 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
20295 // Enable estimate codegen with 1 refinement step for vector division.
20296 // Scalar division estimates are disabled because they break too much
20297 // real-world code. These defaults are intended to match GCC behavior.
20298 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
20299 return SDValue();
20300
20301 if (RefinementSteps == ReciprocalEstimate::Unspecified)
20302 RefinementSteps = 1;
20303
20304 // There is no FRCP for 512-bit vectors, but there is RCP14.
20305 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
20306 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
20307 }
20308 return SDValue();
20309}
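// Editorial sketch (not part of the original file): the matching refinement for
// a reciprocal estimate y ~= 1/a is y' = y * (2 - a * y), again roughly
// doubling the precision of the 2^-12 hardware estimate per step:
static float RefineRcp(float A, float Y) {
  return Y * (2.0f - A * Y);
}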
20310
20311/// If we have at least two divisions that use the same divisor, convert to
20312/// multiplication by a reciprocal. This may need to be adjusted for a given
20313/// CPU if a division's cost is not at least twice the cost of a multiplication.
20314/// This is because we still need one division to calculate the reciprocal and
20315/// then we need two multiplies by that reciprocal as replacements for the
20316/// original divisions.
20317unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
20318 return 2;
20319}
20320
20321SDValue
20322X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
20323 SelectionDAG &DAG,
20324 SmallVectorImpl<SDNode *> &Created) const {
20325 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
20326 if (isIntDivCheap(N->getValueType(0), Attr))
20327 return SDValue(N,0); // Lower SDIV as SDIV
20328
20329 assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
20330        "Unexpected divisor!");
20331
20332 // Only perform this transform if CMOV is supported otherwise the select
20333 // below will become a branch.
20334 if (!Subtarget.hasCMov())
20335 return SDValue();
20336
20337 // fold (sdiv X, pow2)
20338 EVT VT = N->getValueType(0);
20339 // FIXME: Support i8.
20340 if (VT != MVT::i16 && VT != MVT::i32 &&
20341 !(Subtarget.is64Bit() && VT == MVT::i64))
20342 return SDValue();
20343
20344 unsigned Lg2 = Divisor.countTrailingZeros();
20345
20346 // If the divisor is 2 or -2, the default expansion is better.
20347 if (Lg2 == 1)
20348 return SDValue();
20349
20350 SDLoc DL(N);
20351 SDValue N0 = N->getOperand(0);
20352 SDValue Zero = DAG.getConstant(0, DL, VT);
20353 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
20354 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
20355
20356 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
20357 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
20358 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
20359 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
20360
20361 Created.push_back(Cmp.getNode());
20362 Created.push_back(Add.getNode());
20363 Created.push_back(CMov.getNode());
20364
20365 // Divide by pow2.
20366 SDValue SRA =
20367 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i64));
20368
20369 // If we're dividing by a positive value, we're done. Otherwise, we must
20370 // negate the result.
20371 if (Divisor.isNonNegative())
20372 return SRA;
20373
20374 Created.push_back(SRA.getNode());
20375 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
20376}
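// Editorial sketch (not part of the original file): BuildSDIVPow2 above is the
// standard round-toward-zero fixup for signed division by 2^k -- add (2^k - 1)
// only when the dividend is negative (the select/CMOV), arithmetic-shift right
// by k, and negate if the divisor itself was negative. Scalar rendering for a
// hypothetical k (assumes the usual arithmetic >> on negative ints):
static int SDivPow2(int X, unsigned K) {
  int Biased = X < 0 ? X + ((1 << K) - 1) : X; // the select/CMOV
  return Biased >> K;                          // the SRA
}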
20377
20378/// Result of 'and' is compared against zero. Change to a BT node if possible.
20379/// Returns the BT node and the condition code needed to use it.
20380static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
20381 const SDLoc &dl, SelectionDAG &DAG,
20382 SDValue &X86CC) {
20383 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
20384 SDValue Op0 = And.getOperand(0);
20385 SDValue Op1 = And.getOperand(1);
20386 if (Op0.getOpcode() == ISD::TRUNCATE)
20387 Op0 = Op0.getOperand(0);
20388 if (Op1.getOpcode() == ISD::TRUNCATE)
20389 Op1 = Op1.getOperand(0);
20390
20391 SDValue Src, BitNo;
20392 if (Op1.getOpcode() == ISD::SHL)
20393 std::swap(Op0, Op1);
20394 if (Op0.getOpcode() == ISD::SHL) {
20395 if (isOneConstant(Op0.getOperand(0))) {
20396 // If we looked past a truncate, check that it's only truncating away
20397 // known zeros.
20398 unsigned BitWidth = Op0.getValueSizeInBits();
20399 unsigned AndBitWidth = And.getValueSizeInBits();
20400 if (BitWidth > AndBitWidth) {
20401 KnownBits Known = DAG.computeKnownBits(Op0);
20402 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
20403 return SDValue();
20404 }
20405 Src = Op1;
20406 BitNo = Op0.getOperand(1);
20407 }
20408 } else if (Op1.getOpcode() == ISD::Constant) {
20409 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
20410 uint64_t AndRHSVal = AndRHS->getZExtValue();
20411 SDValue AndLHS = Op0;
20412
20413 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
20414 Src = AndLHS.getOperand(0);
20415 BitNo = AndLHS.getOperand(1);
20416 } else {
20417 // Use BT if the immediate can't be encoded in a TEST instruction or we
20418 // are optimizing for size and the immediate won't fit in a byte.
20419 bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
20420 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
20421 isPowerOf2_64(AndRHSVal)) {
20422 Src = AndLHS;
20423 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
20424 Src.getValueType());
20425 }
20426 }
20427 }
20428
20429 // No patterns found, give up.
20430 if (!Src.getNode())
20431 return SDValue();
20432
20433 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
20434 // instruction. Since the shift amount is in-range-or-undefined, we know
20435 // that doing a bittest on the i32 value is ok. We extend to i32 because
20436 // the encoding for the i16 version is larger than the i32 version.
20437 // Also promote i16 to i32 for performance / code size reason.
20438 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
20439 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
20440
20441 // See if we can use the 32-bit instruction instead of the 64-bit one for a
20442 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
20443 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
20444 // known to be zero.
20445 if (Src.getValueType() == MVT::i64 &&
20446 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
20447 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
20448
20449 // If the operand types disagree, extend the shift amount to match. Since
20450 // BT ignores high bits (like shifts) we can use anyextend.
20451 if (Src.getValueType() != BitNo.getValueType())
20452 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
20453
20454 X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
20455 dl, MVT::i8);
20456 return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
20457}
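// Editorial sketch (not part of the original file): the BT node built above
// tests one bit and leaves the result in CF, so "(x & (1 << n)) == 0" lowers
// to BT + SETAE (CF clear) and "!= 0" to BT + SETB (CF set). What BT computes,
// for n below the operand width:
static bool BitIsSet(unsigned long long X, unsigned N) {
  return ((X >> N) & 1ULL) != 0;
}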
20458
20459/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
20460/// CMPs.
20461static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
20462 SDValue &Op1) {
20463 unsigned SSECC;
20464 bool Swap = false;
20465
20466 // SSE Condition code mapping:
20467 // 0 - EQ
20468 // 1 - LT
20469 // 2 - LE
20470 // 3 - UNORD
20471 // 4 - NEQ
20472 // 5 - NLT
20473 // 6 - NLE
20474 // 7 - ORD
20475 switch (SetCCOpcode) {
20476 default: llvm_unreachable("Unexpected SETCC condition");
20477 case ISD::SETOEQ:
20478 case ISD::SETEQ: SSECC = 0; break;
20479 case ISD::SETOGT:
20480 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
20481 case ISD::SETLT:
20482 case ISD::SETOLT: SSECC = 1; break;
20483 case ISD::SETOGE:
20484 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
20485 case ISD::SETLE:
20486 case ISD::SETOLE: SSECC = 2; break;
20487 case ISD::SETUO: SSECC = 3; break;
20488 case ISD::SETUNE:
20489 case ISD::SETNE: SSECC = 4; break;
20490 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
20491 case ISD::SETUGE: SSECC = 5; break;
20492 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
20493 case ISD::SETUGT: SSECC = 6; break;
20494 case ISD::SETO: SSECC = 7; break;
20495 case ISD::SETUEQ: SSECC = 8; break;
20496 case ISD::SETONE: SSECC = 12; break;
20497 }
20498 if (Swap)
20499 std::swap(Op0, Op1);
20500
20501 return SSECC;
20502}
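// Editorial sketch (not part of the original file): the SSECC value computed
// above is the CMPPS/CMPSS immediate; e.g. SETOLT maps to predicate 1, which
// is what this SSE intrinsic emits (predicates 8 and 12 need the AVX VCMPPS
// encoding):
#include <xmmintrin.h>

static __m128 CmpOLT(__m128 A, __m128 B) {
  return _mm_cmplt_ps(A, B); // CMPPS with immediate 1
}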
20503
20504/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
20505/// concatenate the result back.
20506static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
20507 MVT VT = Op.getSimpleValueType();
20508
20509 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
20510        "Unsupported value type for operation");
20511
20512 unsigned NumElems = VT.getVectorNumElements();
20513 SDLoc dl(Op);
20514 SDValue CC = Op.getOperand(2);
20515
20516 // Extract the LHS vectors
20517 SDValue LHS = Op.getOperand(0);
20518 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
20519 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
20520
20521 // Extract the RHS vectors
20522 SDValue RHS = Op.getOperand(1);
20523 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
20524 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
20525
20526 // Issue the operation on the smaller types and concatenate the result back
20527 MVT EltVT = VT.getVectorElementType();
20528 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
20529 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
20530 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
20531 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
20532}
20533
20534static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
20535
20536 SDValue Op0 = Op.getOperand(0);
20537 SDValue Op1 = Op.getOperand(1);
20538 SDValue CC = Op.getOperand(2);
20539 MVT VT = Op.getSimpleValueType();
20540 SDLoc dl(Op);
20541
20542 assert(VT.getVectorElementType() == MVT::i1 &&
20543 "Cannot set masked compare for this operation");
20544
20545 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
20546
20547 // Prefer SETGT over SETLT.
20548 if (SetCCOpcode == ISD::SETLT) {
20549 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
20550 std::swap(Op0, Op1);
20551 }
20552
20553 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
20554}
20555
20556/// Given a buildvector constant, return a new vector constant with each element
20557/// incremented or decremented. If incrementing or decrementing would result in
20558/// unsigned overflow or underflow or this is not a simple vector constant,
20559/// return an empty value.
20560static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
20561 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
20562 if (!BV)
20563 return SDValue();
20564
20565 MVT VT = V.getSimpleValueType();
20566 MVT EltVT = VT.getVectorElementType();
20567 unsigned NumElts = VT.getVectorNumElements();
20568 SmallVector<SDValue, 8> NewVecC;
20569 SDLoc DL(V);
20570 for (unsigned i = 0; i < NumElts; ++i) {
20571 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
20572 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
20573 return SDValue();
20574
20575 // Avoid overflow/underflow.
20576 const APInt &EltC = Elt->getAPIntValue();
20577 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
20578 return SDValue();
20579
20580 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
20581 }
20582
20583 return DAG.getBuildVector(VT, DL, NewVecC);
20584}
20585
20586/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
20587/// Op0 u<= Op1:
20588/// t = psubus Op0, Op1
20589/// pcmpeq t, <0..0>
20590static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
20591 ISD::CondCode Cond, const SDLoc &dl,
20592 const X86Subtarget &Subtarget,
20593 SelectionDAG &DAG) {
20594 if (!Subtarget.hasSSE2())
20595 return SDValue();
20596
20597 MVT VET = VT.getVectorElementType();
20598 if (VET != MVT::i8 && VET != MVT::i16)
20599 return SDValue();
20600
20601 switch (Cond) {
20602 default:
20603 return SDValue();
20604 case ISD::SETULT: {
20605 // If the comparison is against a constant, we can turn this into a
20606 // setule. With psubus, setule does not require a swap. This is
20607 // beneficial because the constant in the register is no longer
20608 // clobbered as the destination, so it can be hoisted out of a loop.
20609 // Only do this pre-AVX, since the AVX forms are not destructive.
20610 if (Subtarget.hasAVX())
20611 return SDValue();
20612 SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
20613 if (!ULEOp1)
20614 return SDValue();
20615 Op1 = ULEOp1;
20616 break;
20617 }
20618 case ISD::SETUGT: {
20619 // If the comparison is against a constant, we can turn this into a setuge.
20620 // This is beneficial because materializing a constant 0 for the PCMPEQ is
20621 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
20622 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
20623 SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
20624 if (!UGEOp1)
20625 return SDValue();
20626 Op1 = Op0;
20627 Op0 = UGEOp1;
20628 break;
20629 }
20630 // Psubus is better than flip-sign because it requires no inversion.
20631 case ISD::SETUGE:
20632 std::swap(Op0, Op1);
20633 break;
20634 case ISD::SETULE:
20635 break;
20636 }
20637
20638 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
20639 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
20640 DAG.getConstant(0, dl, VT));
20641}
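The PSUBUS/USUBSAT trick above rests on a simple identity: for unsigned values, a u<= b exactly when the saturating difference usubsat(a, b) is zero. A small self-contained sketch on uint8_t lanes (hypothetical names, not part of this file):

#include <cassert>
#include <cstdint>

// One 8-bit lane of PSUBUSB: unsigned subtraction that saturates at 0.
static uint8_t usubsat8(uint8_t A, uint8_t B) {
  return A > B ? uint8_t(A - B) : uint8_t(0);
}

// a u<= b  <=>  usubsat(a, b) == 0, which is what USUBSAT followed by a
// PCMPEQ against zero checks per lane.
static bool uleViaSubus(uint8_t A, uint8_t B) { return usubsat8(A, B) == 0; }

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B)
      assert(uleViaSubus(uint8_t(A), uint8_t(B)) == (A <= B));
  return 0;
}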
20642
20643static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
20644 SelectionDAG &DAG) {
20645 SDValue Op0 = Op.getOperand(0);
20646 SDValue Op1 = Op.getOperand(1);
20647 SDValue CC = Op.getOperand(2);
20648 MVT VT = Op.getSimpleValueType();
20649 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
20650 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
20651 SDLoc dl(Op);
20652
20653 if (isFP) {
20654#ifndef NDEBUG
20655 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
20656 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
20657#endif
20658
20659 unsigned Opc;
20660 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
20661 assert(VT.getVectorNumElements() <= 16);
20662 Opc = X86ISD::CMPM;
20663 } else {
20664 Opc = X86ISD::CMPP;
20665 // The SSE/AVX packed FP comparison nodes are defined with a
20666 // floating-point vector result that matches the operand type. This allows
20667 // them to work with an SSE1 target (integer vector types are not legal).
20668 VT = Op0.getSimpleValueType();
20669 }
20670
20671 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
20672 // emit two comparisons and a logic op to tie them together.
20673 SDValue Cmp;
20674 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
20675 if (SSECC >= 8 && !Subtarget.hasAVX()) {
20676 // LLVM predicate is SETUEQ or SETONE.
20677 unsigned CC0, CC1;
20678 unsigned CombineOpc;
20679 if (Cond == ISD::SETUEQ) {
20680 CC0 = 3; // UNORD
20681 CC1 = 0; // EQ
20682 CombineOpc = X86ISD::FOR;
20683 } else {
20684 assert(Cond == ISD::SETONE);
20685 CC0 = 7; // ORD
20686 CC1 = 4; // NEQ
20687 CombineOpc = X86ISD::FAND;
20688 }
20689
20690 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
20691 DAG.getTargetConstant(CC0, dl, MVT::i8));
20692 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
20693 DAG.getTargetConstant(CC1, dl, MVT::i8));
20694 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
20695 } else {
20696 // Handle all other FP comparisons here.
20697 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
20698 DAG.getTargetConstant(SSECC, dl, MVT::i8));
20699 }
20700
20701 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
20702 // result type of SETCC. The bitcast is expected to be optimized away
20703 // during combining/isel.
20704 if (Opc == X86ISD::CMPP)
20705 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
20706
20707 return Cmp;
20708 }
20709
20710 MVT VTOp0 = Op0.getSimpleValueType();
20711 (void)VTOp0;
20712 assert(VTOp0 == Op1.getSimpleValueType() &&
20713 "Expected operands with same type!");
20714 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
20715 "Invalid number of packed elements for source and destination!");
20716
20717 // The non-AVX512 code below works under the assumption that source and
20718 // destination types are the same.
20719 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
20720 "Value types for source and destination must be the same!");
20721
20722 // The result is boolean, but operands are int/float
20723 if (VT.getVectorElementType() == MVT::i1) {
20724 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
20725 // but there is no compare instruction for i8 and i16 elements in KNL.
20726 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
20727 "Unexpected operand type");
20728 return LowerIntVSETCC_AVX512(Op, DAG);
20729 }
20730
20731 // Lower using XOP integer comparisons.
20732 if (VT.is128BitVector() && Subtarget.hasXOP()) {
20733 // Translate compare code to XOP PCOM compare mode.
20734 unsigned CmpMode = 0;
20735 switch (Cond) {
20736 default: llvm_unreachable("Unexpected SETCC condition");
20737 case ISD::SETULT:
20738 case ISD::SETLT: CmpMode = 0x00; break;
20739 case ISD::SETULE:
20740 case ISD::SETLE: CmpMode = 0x01; break;
20741 case ISD::SETUGT:
20742 case ISD::SETGT: CmpMode = 0x02; break;
20743 case ISD::SETUGE:
20744 case ISD::SETGE: CmpMode = 0x03; break;
20745 case ISD::SETEQ: CmpMode = 0x04; break;
20746 case ISD::SETNE: CmpMode = 0x05; break;
20747 }
20748
20749 // Are we comparing unsigned or signed integers?
20750 unsigned Opc =
20751 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
20752
20753 return DAG.getNode(Opc, dl, VT, Op0, Op1,
20754 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
20755 }
20756
20757 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
20758 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
20759 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
20760 SDValue BC0 = peekThroughBitcasts(Op0);
20761 if (BC0.getOpcode() == ISD::AND) {
20762 APInt UndefElts;
20763 SmallVector<APInt, 64> EltBits;
20764 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
20765 VT.getScalarSizeInBits(), UndefElts,
20766 EltBits, false, false)) {
20767 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
20768 Cond = ISD::SETEQ;
20769 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
20770 }
20771 }
20772 }
20773 }
20774
20775 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
20776 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
20777 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
20778 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
20779 if (C1 && C1->getAPIntValue().isPowerOf2()) {
20780 unsigned BitWidth = VT.getScalarSizeInBits();
20781 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
20782
20783 SDValue Result = Op0.getOperand(0);
20784 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
20785 DAG.getConstant(ShiftAmt, dl, VT));
20786 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
20787 DAG.getConstant(BitWidth - 1, dl, VT));
20788 return Result;
20789 }
20790 }
20791
20792 // Break 256-bit integer vector compare into smaller ones.
20793 if (VT.is256BitVector() && !Subtarget.hasInt256())
20794 return Lower256IntVSETCC(Op, DAG);
20795
20796 // If this is a SETNE against the signed minimum value, change it to SETGT.
20797 // If this is a SETNE against the signed maximum value, change it to SETLT,
20798 // which will be swapped to SETGT.
20799 // Otherwise we use PCMPEQ+invert.
20800 APInt ConstValue;
20801 if (Cond == ISD::SETNE &&
20802 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
20803 if (ConstValue.isMinSignedValue())
20804 Cond = ISD::SETGT;
20805 else if (ConstValue.isMaxSignedValue())
20806 Cond = ISD::SETLT;
20807 }
20808
20809 // If both operands are known non-negative, then an unsigned compare is the
20810 // same as a signed compare and there's no need to flip signbits.
20811 // TODO: We could check for more general simplifications here since we're
20812 // computing known bits.
20813 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
20814 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
20815
20816 // Special case: Use min/max operations for unsigned compares.
20817 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20818 if (ISD::isUnsignedIntSetCC(Cond) &&
20819 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
20820 TLI.isOperationLegal(ISD::UMIN, VT)) {
20821 // If we have a constant operand, increment/decrement it and change the
20822 // condition to avoid an invert.
20823 if (Cond == ISD::SETUGT) {
20824 // X > C --> X >= (C+1) --> X == umax(X, C+1)
20825 if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
20826 Op1 = UGTOp1;
20827 Cond = ISD::SETUGE;
20828 }
20829 }
20830 if (Cond == ISD::SETULT) {
20831 // X < C --> X <= (C-1) --> X == umin(X, C-1)
20832 if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
20833 Op1 = ULTOp1;
20834 Cond = ISD::SETULE;
20835 }
20836 }
20837 bool Invert = false;
20838 unsigned Opc;
20839 switch (Cond) {
20840 default: llvm_unreachable("Unexpected condition code");
20841 case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
20842 case ISD::SETULE: Opc = ISD::UMIN; break;
20843 case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
20844 case ISD::SETUGE: Opc = ISD::UMAX; break;
20845 }
20846
20847 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
20848 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
20849
20850 // If the logical-not of the result is required, perform that now.
20851 if (Invert)
20852 Result = DAG.getNOT(dl, Result, VT);
20853
20854 return Result;
20855 }
20856
20857 // Try to use SUBUS and PCMPEQ.
20858 if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
20859 return V;
20860
20861 // We are handling one of the integer comparisons here. Since SSE only has
20862 // GT and EQ comparisons for integer, swapping operands and multiple
20863 // operations may be required for some comparisons.
20864 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
20865 : X86ISD::PCMPGT;
20866 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
20867 Cond == ISD::SETGE || Cond == ISD::SETUGE;
20868 bool Invert = Cond == ISD::SETNE ||
20869 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
20870
20871 if (Swap)
20872 std::swap(Op0, Op1);
20873
20874 // Check that the operation in question is available (most are plain SSE2,
20875 // but PCMPGTQ and PCMPEQQ have different requirements).
20876 if (VT == MVT::v2i64) {
20877 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
20878 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
20879
20880 // Since SSE has no unsigned integer comparisons, we need to flip the sign
20881 // bits of the inputs before performing those operations. The lower
20882 // compare is always unsigned.
20883 SDValue SB;
20884 if (FlipSigns) {
20885 SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
20886 } else {
20887 SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
20888 }
20889 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
20890 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
20891
20892 // Cast everything to the right type.
20893 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
20894 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
20895
20896 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
20897 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
20898 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
20899
20900 // Create masks for only the low parts/high parts of the 64 bit integers.
20901 static const int MaskHi[] = { 1, 1, 3, 3 };
20902 static const int MaskLo[] = { 0, 0, 2, 2 };
20903 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
20904 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
20905 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
20906
20907 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
20908 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
20909
20910 if (Invert)
20911 Result = DAG.getNOT(dl, Result, MVT::v4i32);
20912
20913 return DAG.getBitcast(VT, Result);
20914 }
20915
20916 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
20917 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
20918 // pcmpeqd + pshufd + pand.
20919 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
20920
20921 // First cast everything to the right type.
20922 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
20923 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
20924
20925 // Do the compare.
20926 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
20927
20928 // Make sure the lower and upper halves are both all-ones.
20929 static const int Mask[] = { 1, 0, 3, 2 };
20930 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
20931 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
20932
20933 if (Invert)
20934 Result = DAG.getNOT(dl, Result, MVT::v4i32);
20935
20936 return DAG.getBitcast(VT, Result);
20937 }
20938 }
20939
20940 // Since SSE has no unsigned integer comparisons, we need to flip the sign
20941 // bits of the inputs before performing those operations.
20942 if (FlipSigns) {
20943 MVT EltVT = VT.getVectorElementType();
20944 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
20945 VT);
20946 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
20947 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
20948 }
20949
20950 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
20951
20952 // If the logical-not of the result is required, perform that now.
20953 if (Invert)
20954 Result = DAG.getNOT(dl, Result, VT);
20955
20956 return Result;
20957}
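Two of the folds in LowerVSETCC are easiest to see on scalars: flipping the sign bit turns an unsigned compare into a signed one, and a 64-bit signed greater-than can be built from 32-bit compares of the halves (the SSE2 PCMPGTQ emulation). A standalone sketch under those assumptions, using plain C++ integers, hypothetical names, and the usual two's-complement narrowing:

#include <cassert>
#include <cstdint>

// Unsigned compare via sign-bit flip: x u> y  <=>  (x ^ MSB) s> (y ^ MSB).
static bool ugt32ViaFlip(uint32_t A, uint32_t B) {
  return int32_t(A ^ 0x80000000u) > int32_t(B ^ 0x80000000u);
}

// PCMPGTQ emulation: compare the high halves as signed and the low halves as
// unsigned, i.e. (hi1 > hi2) | ((hi1 == hi2) & (lo1 u> lo2)).
static bool sgt64ViaHalves(int64_t A, int64_t B) {
  int32_t AHi = int32_t(uint64_t(A) >> 32), BHi = int32_t(uint64_t(B) >> 32);
  uint32_t ALo = uint32_t(uint64_t(A)), BLo = uint32_t(uint64_t(B));
  return (AHi > BHi) || ((AHi == BHi) && (ALo > BLo));
}

int main() {
  assert(ugt32ViaFlip(0xFFFFFFFFu, 1u));
  assert(!ugt32ViaFlip(1u, 0xFFFFFFFFu));
  assert(sgt64ViaHalves(-1, -2) && !sgt64ViaHalves(-2, -1));
  assert(sgt64ViaHalves(1, -1) && !sgt64ViaHalves(-1, 1));
  return 0;
}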
20958
20959// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
20960static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
20961 const SDLoc &dl, SelectionDAG &DAG,
20962 const X86Subtarget &Subtarget,
20963 SDValue &X86CC) {
20964 // Only support equality comparisons.
20965 if (CC != ISD::SETEQ && CC != ISD::SETNE)
20966 return SDValue();
20967
20968 // Must be a bitcast from vXi1.
20969 if (Op0.getOpcode() != ISD::BITCAST)
20970 return SDValue();
20971
20972 Op0 = Op0.getOperand(0);
20973 MVT VT = Op0.getSimpleValueType();
20974 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
20975 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
20976 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
20977 return SDValue();
20978
20979 X86::CondCode X86Cond;
20980 if (isNullConstant(Op1)) {
20981 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
20982 } else if (isAllOnesConstant(Op1)) {
20983 // C flag is set for all ones.
20984 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
20985 } else
20986 return SDValue();
20987
20988 // If the input is an AND, we can combine its operands into the KTEST.
20989 bool KTestable = false;
20990 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
20991 KTestable = true;
20992 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
20993 KTestable = true;
20994 if (!isNullConstant(Op1))
20995 KTestable = false;
20996 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
20997 SDValue LHS = Op0.getOperand(0);
20998 SDValue RHS = Op0.getOperand(1);
20999 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
21000 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
21001 }
21002
21003 // If the input is an OR, we can combine its operands into the KORTEST.
21004 SDValue LHS = Op0;
21005 SDValue RHS = Op0;
21006 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
21007 LHS = Op0.getOperand(0);
21008 RHS = Op0.getOperand(1);
21009 }
21010
21011 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
21012 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
21013}
21014
21015/// Emit flags for the given setcc condition and operands. Also returns the
21016/// corresponding X86 condition code constant in X86CC.
21017SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
21018 ISD::CondCode CC, const SDLoc &dl,
21019 SelectionDAG &DAG,
21020 SDValue &X86CC) const {
21021 // Optimize to BT if possible.
21022 // Lower (X & (1 << N)) == 0 to BT(X, N).
21023 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
21024 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
21025 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
21026 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
21027 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
21028 return BT;
21029 }
21030
21031 // Try to use PTEST for a tree of ORs equality-compared with 0.
21032 // TODO: We could do AND tree with all 1s as well by using the C flag.
21033 if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
21034 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
21035 if (SDValue PTEST = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG, X86CC))
21036 return PTEST;
21037 }
21038
21039 // Try to lower using KORTEST or KTEST.
21040 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
21041 return Test;
21042
21043 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
21044 // these.
21045 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
21046 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
21047 // If the input is a setcc, then reuse the input setcc or use a new one with
21048 // the inverted condition.
21049 if (Op0.getOpcode() == X86ISD::SETCC) {
21050 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
21051
21052 X86CC = Op0.getOperand(0);
21053 if (Invert) {
21054 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
21055 CCode = X86::GetOppositeBranchCondition(CCode);
21056 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
21057 }
21058
21059 return Op0.getOperand(1);
21060 }
21061 }
21062
21063 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
21064 X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
21065 if (CondCode == X86::COND_INVALID)
21066 return SDValue();
21067
21068 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG);
21069 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
21070 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
21071 return EFLAGS;
21072}
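The BT fold described in the comments above ("(X & (1 << N)) == 0 to BT(X, N)") is just a bit test; the only hazard is that the shift amount must stay strictly below the bit width, since shifting by the full width is undefined behaviour. A minimal scalar sketch with a hypothetical helper name:

#include <cassert>
#include <cstdint>

// (X & (1ULL << N)) != 0 and ((X >> N) & 1) != 0 test the same bit, as long
// as N < 64; shifting a 64-bit value by 64 or more is undefined behaviour.
static bool bitTest(uint64_t X, unsigned N) {
  assert(N < 64 && "shift amount must be smaller than the bit width");
  return ((X >> N) & 1) != 0;
}

int main() {
  assert(bitTest(0x10, 4) && !bitTest(0x10, 3));
  assert(((0x10ull & (1ull << 4)) != 0) == bitTest(0x10, 4));
  return 0;
}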
21073
21074SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
21075
21076 MVT VT = Op.getSimpleValueType();
21077
21078 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
21079
21080 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
21081 SDValue Op0 = Op.getOperand(0);
21082 SDValue Op1 = Op.getOperand(1);
21083 SDLoc dl(Op);
21084 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
21085
21086 // Handle f128 first, since one possible outcome is a normal integer
21087 // comparison which gets handled by emitFlagsForSetcc.
21088 if (Op0.getValueType() == MVT::f128) {
21089 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1);
21090
21091 // If softenSetCCOperands returned a scalar, use it.
21092 if (!Op1.getNode()) {
21093 assert(Op0.getValueType() == Op.getValueType() &&
21094 "Unexpected setcc expansion!");
21095 return Op0;
21096 }
21097 }
21098
21099 SDValue X86CC;
21100 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
21101 if (!EFLAGS)
21102 return SDValue();
21103
21104 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
21105}
21106
21107SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
21108 SDValue LHS = Op.getOperand(0);
21109 SDValue RHS = Op.getOperand(1);
21110 SDValue Carry = Op.getOperand(2);
21111 SDValue Cond = Op.getOperand(3);
21112 SDLoc DL(Op);
21113
21114 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
21115 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
21116
21117 // Recreate the carry if needed.
21118 EVT CarryVT = Carry.getValueType();
21119 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
21120 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
21121 Carry, DAG.getConstant(NegOne, DL, CarryVT));
21122
21123 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
21124 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
21125 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
21126}
21127
21128// This function returns three things: the arithmetic computation itself
21129// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
21130// flag and the condition code define the case in which the arithmetic
21131// computation overflows.
21132static std::pair<SDValue, SDValue>
21133getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
21134 assert(Op.getResNo() == 0 && "Unexpected result number!");
21135 SDValue Value, Overflow;
21136 SDValue LHS = Op.getOperand(0);
21137 SDValue RHS = Op.getOperand(1);
21138 unsigned BaseOp = 0;
21139 SDLoc DL(Op);
21140 switch (Op.getOpcode()) {
21141 default: llvm_unreachable("Unknown ovf instruction!");
21142 case ISD::SADDO:
21143 BaseOp = X86ISD::ADD;
21144 Cond = X86::COND_O;
21145 break;
21146 case ISD::UADDO:
21147 BaseOp = X86ISD::ADD;
21148 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
21149 break;
21150 case ISD::SSUBO:
21151 BaseOp = X86ISD::SUB;
21152 Cond = X86::COND_O;
21153 break;
21154 case ISD::USUBO:
21155 BaseOp = X86ISD::SUB;
21156 Cond = X86::COND_B;
21157 break;
21158 case ISD::SMULO:
21159 BaseOp = X86ISD::SMUL;
21160 Cond = X86::COND_O;
21161 break;
21162 case ISD::UMULO:
21163 BaseOp = X86ISD::UMUL;
21164 Cond = X86::COND_O;
21165 break;
21166 }
21167
21168 if (BaseOp) {
21169 // Also sets EFLAGS.
21170 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
21171 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
21172 Overflow = Value.getValue(1);
21173 }
21174
21175 return std::make_pair(Value, Overflow);
21176}
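For the unsigned cases above, the overflow result is simply the carry or borrow out of the arithmetic, which is why UADDO and USUBO map to COND_B in the common case. A scalar sketch of the check, assuming uint32_t operands and hypothetical names:

#include <cassert>
#include <cstdint>

// Unsigned add overflow: the sum wraps exactly when it compares below an
// operand (the carry flag that COND_B reads).
static bool uaddOverflows(uint32_t A, uint32_t B, uint32_t &Sum) {
  Sum = A + B;      // unsigned wraparound is well defined
  return Sum < A;
}

// Unsigned sub overflow: a borrow occurs exactly when A < B.
static bool usubOverflows(uint32_t A, uint32_t B, uint32_t &Diff) {
  Diff = A - B;
  return A < B;
}

int main() {
  uint32_t R;
  assert(uaddOverflows(0xFFFFFFFFu, 1u, R) && R == 0u);
  assert(!uaddOverflows(2u, 3u, R) && R == 5u);
  assert(usubOverflows(1u, 2u, R) && R == 0xFFFFFFFFu);
  return 0;
}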
21177
21178static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
21179 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
21180 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
21181 // looks for this combo and may remove the "setcc" instruction if the "setcc"
21182 // has only one use.
21183 SDLoc DL(Op);
21184 X86::CondCode Cond;
21185 SDValue Value, Overflow;
21186 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
21187
21188 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
21189 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
21190 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
21191}
21192
21193/// Return true if opcode is a X86 logical comparison.
21194static bool isX86LogicalCmp(SDValue Op) {
21195 unsigned Opc = Op.getOpcode();
21196 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
21197 Opc == X86ISD::SAHF)
21198 return true;
21199 if (Op.getResNo() == 1 &&
21200 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
21201 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
21202 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
21203 return true;
21204
21205 return false;
21206}
21207
21208static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
21209 if (V.getOpcode() != ISD::TRUNCATE)
21210 return false;
21211
21212 SDValue VOp0 = V.getOperand(0);
21213 unsigned InBits = VOp0.getValueSizeInBits();
21214 unsigned Bits = V.getValueSizeInBits();
21215 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
21216}
21217
21218SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
21219 bool AddTest = true;
21220 SDValue Cond = Op.getOperand(0);
21221 SDValue Op1 = Op.getOperand(1);
21222 SDValue Op2 = Op.getOperand(2);
21223 SDLoc DL(Op);
21224 MVT VT = Op1.getSimpleValueType();
21225 SDValue CC;
21226
21227 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
21228 // are available or VBLENDV if AVX is available.
21229 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
21230 if (Cond.getOpcode() == ISD::SETCC &&
21231 ((Subtarget.hasSSE2() && VT == MVT::f64) ||
21232 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
21233 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
21234 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
21235 unsigned SSECC = translateX86FSETCC(
21236 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
21237
21238 if (Subtarget.hasAVX512()) {
21239 SDValue Cmp =
21240 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
21241 DAG.getTargetConstant(SSECC, DL, MVT::i8));
21242 assert(!VT.isVector() && "Not a scalar type?");
21243 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
21244 }
21245
21246 if (SSECC < 8 || Subtarget.hasAVX()) {
21247 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
21248 DAG.getTargetConstant(SSECC, DL, MVT::i8));
21249
21250 // If we have AVX, we can use a variable vector select (VBLENDV) instead
21251 // of 3 logic instructions for size savings and potentially speed.
21252 // Unfortunately, there is no scalar form of VBLENDV.
21253
21254 // If either operand is a +0.0 constant, don't try this. We can expect to
21255 // optimize away at least one of the logic instructions later in that
21256 // case, so that sequence would be faster than a variable blend.
21257
21258 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
21259 // uses XMM0 as the selection register. That may need just as many
21260 // instructions as the AND/ANDN/OR sequence due to register moves, so
21261 // don't bother.
21262 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
21263 !isNullFPConstant(Op2)) {
21264 // Convert to vectors, do a VSELECT, and convert back to scalar.
21265 // All of the conversions should be optimized away.
21266 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
21267 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
21268 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
21269 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
21270
21271 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
21272 VCmp = DAG.getBitcast(VCmpVT, VCmp);
21273
21274 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
21275
21276 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
21277 VSel, DAG.getIntPtrConstant(0, DL));
21278 }
21279 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
21280 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
21281 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
21282 }
21283 }
21284
21285 // AVX512 fallback is to lower selects of scalar floats to masked moves.
21286 if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
21287 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
21288 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
21289 }
21290
21291 // For v64i1 without 64-bit support we need to split and rejoin.
21292 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
21293 assert(Subtarget.hasBWI() && "Expected BWI to be legal");
21294 SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
21295 SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
21296 SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
21297 SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
21298 SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
21299 SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
21300 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21301 }
21302
21303 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
21304 SDValue Op1Scalar;
21305 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
21306 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
21307 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
21308 Op1Scalar = Op1.getOperand(0);
21309 SDValue Op2Scalar;
21310 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
21311 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
21312 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
21313 Op2Scalar = Op2.getOperand(0);
21314 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
21315 SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
21316 Op1Scalar, Op2Scalar);
21317 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
21318 return DAG.getBitcast(VT, newSelect);
21319 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
21320 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
21321 DAG.getIntPtrConstant(0, DL));
21322 }
21323 }
21324
21325 if (Cond.getOpcode() == ISD::SETCC) {
21326 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
21327 Cond = NewCond;
21328 // If the condition was updated, it's possible that the operands of the
21329 // select were also updated (for example, EmitTest has a RAUW). Refresh
21330 // the local references to the select operands in case they got stale.
21331 Op1 = Op.getOperand(1);
21332 Op2 = Op.getOperand(2);
21333 }
21334 }
21335
21336 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
21337 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
21338 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
21339 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
21340 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
21341 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
21342 if (Cond.getOpcode() == X86ISD::SETCC &&
21343 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
21344 isNullConstant(Cond.getOperand(1).getOperand(1))) {
21345 SDValue Cmp = Cond.getOperand(1);
21346 unsigned CondCode = Cond.getConstantOperandVal(0);
21347
21348 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
21349 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
21350 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
21351 SDValue CmpOp0 = Cmp.getOperand(0);
21352
21353 // Apply further optimizations for special cases
21354 // (select (x != 0), -1, 0) -> neg & sbb
21355 // (select (x == 0), 0, -1) -> neg & sbb
21356 if (isNullConstant(Y) &&
21357 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
21358 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
21359 SDValue CmpZero = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0);
21360 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
21361 Zero = DAG.getConstant(0, DL, Op.getValueType());
21362 return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, CmpZero);
21363 }
21364
21365 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
21366 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
21367 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
21368
21369 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
21370 SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
21371 SDValue Res = // Res = 0 or -1.
21372 DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp);
21373
21374 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
21375 Res = DAG.getNOT(DL, Res, Res.getValueType());
21376
21377 if (!isNullConstant(Op2))
21378 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
21379 return Res;
21380 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
21381 Cmp.getOperand(0).getOpcode() == ISD::AND &&
21382 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
21383 SDValue CmpOp0 = Cmp.getOperand(0);
21384 SDValue Src1, Src2;
21385 // True if Op2 is an XOR or OR operator and one of its operands
21386 // is equal to Op1:
21387 // (a, a op b) || (b, a op b)
21388 auto isOrXorPattern = [&]() {
21389 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
21390 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
21391 Src1 =
21392 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
21393 Src2 = Op1;
21394 return true;
21395 }
21396 return false;
21397 };
21398
21399 if (isOrXorPattern()) {
21400 SDValue Neg;
21401 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
21402 // We need a mask of all zeros or all ones with the same size as the
21403 // other operands.
21404 if (CmpSz > VT.getSizeInBits())
21405 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
21406 else if (CmpSz < VT.getSizeInBits())
21407 Neg = DAG.getNode(ISD::AND, DL, VT,
21408 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
21409 DAG.getConstant(1, DL, VT));
21410 else
21411 Neg = CmpOp0;
21412 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
21413 Neg); // -(and (x, 0x1))
21414 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
21415 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
21416 }
21417 }
21418 }
21419
21420 // Look past (and (setcc_carry (cmp ...)), 1).
21421 if (Cond.getOpcode() == ISD::AND &&
21422 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
21423 isOneConstant(Cond.getOperand(1)))
21424 Cond = Cond.getOperand(0);
21425
21426 // If condition flag is set by a X86ISD::CMP, then use it as the condition
21427 // setting operand in place of the X86ISD::SETCC.
21428 unsigned CondOpcode = Cond.getOpcode();
21429 if (CondOpcode == X86ISD::SETCC ||
21430 CondOpcode == X86ISD::SETCC_CARRY) {
21431 CC = Cond.getOperand(0);
21432
21433 SDValue Cmp = Cond.getOperand(1);
21434 bool IllegalFPCMov = false;
21435 if (VT.isFloatingPoint() && !VT.isVector() &&
21436 !isScalarFPTypeInSSEReg(VT)) // FPStack?
21437 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
21438
21439 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
21440 Cmp.getOpcode() == X86ISD::BT) { // FIXME
21441 Cond = Cmp;
21442 AddTest = false;
21443 }
21444 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
21445 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
21446 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
21447 SDValue Value;
21448 X86::CondCode X86Cond;
21449 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
21450
21451 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
21452 AddTest = false;
21453 }
21454
21455 if (AddTest) {
21456 // Look past the truncate if the high bits are known zero.
21457 if (isTruncWithZeroHighBitsInput(Cond, DAG))
21458 Cond = Cond.getOperand(0);
21459
21460 // We know the result of AND is compared against zero. Try to match
21461 // it to BT.
21462 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
21463 SDValue BTCC;
21464 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
21465 CC = BTCC;
21466 Cond = BT;
21467 AddTest = false;
21468 }
21469 }
21470 }
21471
21472 if (AddTest) {
21473 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
21474 Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()),
21475 X86::COND_NE, DL, DAG);
21476 }
21477
21478 // a < b ? -1 : 0 -> RES = ~setcc_carry
21479 // a < b ? 0 : -1 -> RES = setcc_carry
21480 // a >= b ? -1 : 0 -> RES = setcc_carry
21481 // a >= b ? 0 : -1 -> RES = ~setcc_carry
21482 if (Cond.getOpcode() == X86ISD::SUB) {
21483 Cond = ConvertCmpIfNecessary(Cond, DAG);
21484 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
21485
21486 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
21487 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
21488 (isNullConstant(Op1) || isNullConstant(Op2))) {
21489 SDValue Res =
21490 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
21491 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
21492 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
21493 return DAG.getNOT(DL, Res, Res.getValueType());
21494 return Res;
21495 }
21496 }
21497
21498 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
21499 // widen the cmov and push the truncate through. This avoids introducing a new
21500 // branch during isel and doesn't add any extensions.
21501 if (Op.getValueType() == MVT::i8 &&
21502 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
21503 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
21504 if (T1.getValueType() == T2.getValueType() &&
21505 // Blacklist CopyFromReg to avoid partial register stalls.
21506 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
21507 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
21508 CC, Cond);
21509 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
21510 }
21511 }
21512
21513 // Or finally, promote i8 cmovs if we have CMOV,
21514 // or i16 cmovs if it won't prevent folding a load.
21515 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
21516 // legal, but EmitLoweredSelect() can not deal with these extensions
21517 // being inserted between two CMOV's. (in i16 case too TBN)
21518 // https://bugs.llvm.org/show_bug.cgi?id=40974
21519 if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
21520 (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
21521 !MayFoldLoad(Op2))) {
21522 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
21523 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
21524 SDValue Ops[] = { Op2, Op1, CC, Cond };
21525 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
21526 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
21527 }
21528
21529 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
21530 // condition is true.
21531 SDValue Ops[] = { Op2, Op1, CC, Cond };
21532 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
21533}
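The "(select (x == 0), -1, y) -> (sign_bit (x - 1)) | y" family of folds handled in LowerSELECT comes down to turning the x == 0 test into an all-ones/all-zeros mask via the borrow of x - 1 (the CMP/SBB idiom) and OR-ing it with y. A scalar sketch of that equivalence, with hypothetical names:

#include <cassert>
#include <cstdint>

// CMP x, 1 borrows exactly when x == 0; SBB then broadcasts that borrow to
// an all-ones or all-zeros mask.
static uint32_t maskFromEqZero(uint32_t X) { return X == 0 ? ~0u : 0u; }

// (select (x == 0), -1, y) is the same as maskFromEqZero(x) | y.
static uint32_t selectEqZeroAllOnes(uint32_t X, uint32_t Y) {
  return maskFromEqZero(X) | Y;
}

int main() {
  assert(selectEqZeroAllOnes(0u, 123u) == ~0u);
  assert(selectEqZeroAllOnes(7u, 123u) == 123u);
  return 0;
}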
21534
21535static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
21536 const X86Subtarget &Subtarget,
21537 SelectionDAG &DAG) {
21538 MVT VT = Op->getSimpleValueType(0);
21539 SDValue In = Op->getOperand(0);
21540 MVT InVT = In.getSimpleValueType();
21541 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
21542 MVT VTElt = VT.getVectorElementType();
21543 SDLoc dl(Op);
21544
21545 unsigned NumElts = VT.getVectorNumElements();
21546
21547 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
21548 MVT ExtVT = VT;
21549 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
21550 // If v16i32 is to be avoided, we'll need to split and concatenate.
21551 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
21552 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
21553
21554 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
21555 }
21556
21557 // Widen to 512-bits if VLX is not supported.
21558 MVT WideVT = ExtVT;
21559 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
21560 NumElts *= 512 / ExtVT.getSizeInBits();
21561 InVT = MVT::getVectorVT(MVT::i1, NumElts);
21562 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
21563 In, DAG.getIntPtrConstant(0, dl));
21564 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
21565 }
21566
21567 SDValue V;
21568 MVT WideEltVT = WideVT.getVectorElementType();
21569 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
21570 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
21571 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
21572 } else {
21573 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
21574 SDValue Zero = DAG.getConstant(0, dl, WideVT);
21575 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
21576 }
21577
21578 // Truncate if we had to extend i16/i8 above.
21579 if (VT != ExtVT) {
21580 WideVT = MVT::getVectorVT(VTElt, NumElts);
21581 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
21582 }
21583
21584 // Extract back to 128/256-bit if we widened.
21585 if (WideVT != VT)
21586 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
21587 DAG.getIntPtrConstant(0, dl));
21588
21589 return V;
21590}
21591
21592static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21593 SelectionDAG &DAG) {
21594 SDValue In = Op->getOperand(0);
21595 MVT InVT = In.getSimpleValueType();
21596
21597 if (InVT.getVectorElementType() == MVT::i1)
21598 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
21599
21600 assert(Subtarget.hasAVX() && "Expected AVX support");
21601 return LowerAVXExtend(Op, DAG, Subtarget);
21602}
21603
21604// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
21605// For sign extend this needs to handle all vector sizes and SSE4.1 and
21606// non-SSE4.1 targets. For zero extend this should only handle inputs of
21607// MVT::v64i8 when BWI is not supported, but AVX512 is.
21608static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
21609 const X86Subtarget &Subtarget,
21610 SelectionDAG &DAG) {
21611 SDValue In = Op->getOperand(0);
21612 MVT VT = Op->getSimpleValueType(0);
21613 MVT InVT = In.getSimpleValueType();
21614
21615 MVT SVT = VT.getVectorElementType();
21616 MVT InSVT = InVT.getVectorElementType();
21617 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
21618
21619 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
21620 return SDValue();
21621 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
21622 return SDValue();
21623 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
21624 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
21625 !(VT.is512BitVector() && Subtarget.hasAVX512()))
21626 return SDValue();
21627
21628 SDLoc dl(Op);
21629 unsigned Opc = Op.getOpcode();
21630 unsigned NumElts = VT.getVectorNumElements();
21631
21632 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
21633 // For 512-bit vectors, we need 128-bits or 256-bits.
21634 if (InVT.getSizeInBits() > 128) {
21635 // Input needs to be at least the same number of elements as output, and
21636 // at least 128-bits.
21637 int InSize = InSVT.getSizeInBits() * NumElts;
21638 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
21639 InVT = In.getSimpleValueType();
21640 }
21641
21642 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
21643 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
21644 // need to be handled here for 256/512-bit results.
21645 if (Subtarget.hasInt256()) {
21646 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
21647
21648 if (InVT.getVectorNumElements() != NumElts)
21649 return DAG.getNode(Op.getOpcode(), dl, VT, In);
21650
21651 // FIXME: Apparently we create inreg operations that could be regular
21652 // extends.
21653 unsigned ExtOpc =
21654 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
21655 : ISD::ZERO_EXTEND;
21656 return DAG.getNode(ExtOpc, dl, VT, In);
21657 }
21658
21659 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
21660 if (Subtarget.hasAVX()) {
21661 assert(VT.is256BitVector() && "256-bit vector expected");
21662 MVT HalfVT = VT.getHalfNumVectorElementsVT();
21663 int HalfNumElts = HalfVT.getVectorNumElements();
21664
21665 unsigned NumSrcElts = InVT.getVectorNumElements();
21666 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
21667 for (int i = 0; i != HalfNumElts; ++i)
21668 HiMask[i] = HalfNumElts + i;
21669
21670 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
21671 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
21672 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
21673 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
21674 }
21675
21676 // We should only get here for sign extend.
21677 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
21678 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
21679
21680 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
21681 SDValue Curr = In;
21682 SDValue SignExt = Curr;
21683
21684 // As SRAI is only available on i16/i32 types, we expand only up to i32
21685 // and handle i64 separately.
21686 if (InVT != MVT::v4i32) {
21687 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
21688
21689 unsigned DestWidth = DestVT.getScalarSizeInBits();
21690 unsigned Scale = DestWidth / InSVT.getSizeInBits();
21691
21692 unsigned InNumElts = InVT.getVectorNumElements();
21693 unsigned DestElts = DestVT.getVectorNumElements();
21694
21695 // Build a shuffle mask that takes each input element and places it in the
21696 // MSBs of the new element size.
21697 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
21698 for (unsigned i = 0; i != DestElts; ++i)
21699 Mask[i * Scale + (Scale - 1)] = i;
21700
21701 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
21702 Curr = DAG.getBitcast(DestVT, Curr);
21703
21704 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
21705 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
21706 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
21707 }
21708
21709 if (VT == MVT::v2i64) {
21710 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
21711 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
21712 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
21713 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
21714 SignExt = DAG.getBitcast(VT, SignExt);
21715 }
21716
21717 return SignExt;
21718}
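// Editor's note: a minimal standalone sketch (not part of X86ISelLowering.cpp)
// of the pre-SSE41 technique used above: shuffle each narrow element into the
// most significant bits of the wider lane, then arithmetic-shift it back down.
// The scalar analogue, assuming an arithmetic right shift on signed types
// (guaranteed since C++20, and what x86 compilers do in practice):
#include <cstdint>

static int32_t signExtend8To32ViaShift(uint8_t v) {
  uint32_t widened = uint32_t(v) << 24; // element placed in the MSBs (the "shuffle")
  return int32_t(widened) >> 24;        // VSRAI: arithmetic shift restores the sign
}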
21719
21720static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21721 SelectionDAG &DAG) {
21722 MVT VT = Op->getSimpleValueType(0);
21723 SDValue In = Op->getOperand(0);
21724 MVT InVT = In.getSimpleValueType();
21725 SDLoc dl(Op);
21726
21727 if (InVT.getVectorElementType() == MVT::i1)
21728 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
21729
21730 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
21731 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21732        "Expected same number of elements");
21733 assert((VT.getVectorElementType() == MVT::i16 ||
21734         VT.getVectorElementType() == MVT::i32 ||
21735         VT.getVectorElementType() == MVT::i64) &&
21736        "Unexpected element type");
21737 assert((InVT.getVectorElementType() == MVT::i8 ||
21738         InVT.getVectorElementType() == MVT::i16 ||
21739         InVT.getVectorElementType() == MVT::i32) &&
21740        "Unexpected element type");
21741
21742 // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
21743 if (InVT == MVT::v8i8) {
21744 if (VT != MVT::v8i64)
21745 return SDValue();
21746
21747 In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
21748 MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
21749 return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In);
21750 }
21751
21752 if (Subtarget.hasInt256())
21753 return Op;
21754
21755 // Optimize vectors in AVX mode:
21756 // sign extend v8i16 to v8i32 and
21757 // v4i32 to v4i64.
21758 //
21759 // Divide the input vector into two parts;
21760 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
21761 // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
21762 // then concat the vectors back to the original VT.
21763 MVT HalfVT = VT.getHalfNumVectorElementsVT();
21764 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
21765
21766 unsigned NumElems = InVT.getVectorNumElements();
21767 SmallVector<int,8> ShufMask(NumElems, -1);
21768 for (unsigned i = 0; i != NumElems/2; ++i)
21769 ShufMask[i] = i + NumElems/2;
21770
21771 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
21772 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
21773
21774 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
21775}
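// Editor's note: a minimal sketch (not part of X86ISelLowering.cpp) of what the
// pre-AVX2 split above computes for a v8i16 -> v8i32 sign extend: extend the low
// four lanes directly, shuffle lanes 4..7 down and extend those, then concatenate.
#include <array>
#include <cstdint>

static std::array<int32_t, 8> sext_v8i16_to_v8i32(const std::array<int16_t, 8> &In) {
  std::array<int32_t, 8> Out{};
  for (int i = 0; i != 4; ++i)
    Out[i] = In[i];         // OpLo: sign_extend_vector_inreg of the low half
  for (int i = 0; i != 4; ++i)
    Out[4 + i] = In[4 + i]; // OpHi: shuffle {4,5,6,7,...} then extend
  return Out;               // concat_vectors of the two halves
}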
21776
21777/// Change a vector store into a pair of half-size vector stores.
21778static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
21779 SDValue StoredVal = Store->getValue();
21780 assert((StoredVal.getValueType().is256BitVector() ||
21781         StoredVal.getValueType().is512BitVector()) &&
21782        "Expecting 256/512-bit op");
21783
21784 // Splitting volatile memory ops is not allowed unless the operation was not
21785 // legal to begin with. Assume the input store is legal (this transform is
21786 // only used for targets with AVX). Note: It is possible that we have an
21787 // illegal type like v2i128, and so we could allow splitting a volatile store
21788 // in that case if that is important.
21789 if (!Store->isSimple())
21790 return SDValue();
21791
21792 EVT StoreVT = StoredVal.getValueType();
21793 unsigned NumElems = StoreVT.getVectorNumElements();
21794 unsigned HalfSize = StoredVal.getValueSizeInBits() / 2;
21795 unsigned HalfAlign = (128 == HalfSize ? 16 : 32);
21796
21797 SDLoc DL(Store);
21798 SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize);
21799 SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize);
21800 SDValue Ptr0 = Store->getBasePtr();
21801 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfAlign, DL);
21802 unsigned Alignment = Store->getAlignment();
21803 SDValue Ch0 =
21804 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
21805 Alignment, Store->getMemOperand()->getFlags());
21806 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
21807 Store->getPointerInfo().getWithOffset(HalfAlign),
21808 MinAlign(Alignment, HalfAlign),
21809 Store->getMemOperand()->getFlags());
21810 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
21811}
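// Editor's note: a minimal sketch (not part of X86ISelLowering.cpp) of the memory
// effect of splitVectorStore for a 256-bit value: two 16-byte stores, the second
// at base + 16, with its alignment reduced via MinAlign(Alignment, 16).
#include <cstdint>
#include <cstring>

static void storeSplit256(const uint8_t (&Value)[32], uint8_t *Base) {
  std::memcpy(Base, Value, 16);           // low 128 bits at offset 0
  std::memcpy(Base + 16, Value + 16, 16); // high 128 bits at offset HalfAlign (16)
}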
21812
21813/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
21814/// type.
21815static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
21816 SelectionDAG &DAG) {
21817 SDValue StoredVal = Store->getValue();
21818 assert(StoreVT.is128BitVector() &&
21819        StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
21820 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
21821
21822 // Splitting volatile memory ops is not allowed unless the operation was not
21823 // legal to begin with. We are assuming the input op is legal (this transform
21824 // is only used for targets with AVX).
21825 if (!Store->isSimple())
21826 return SDValue();
21827
21828 MVT StoreSVT = StoreVT.getScalarType();
21829 unsigned NumElems = StoreVT.getVectorNumElements();
21830 unsigned ScalarSize = StoreSVT.getStoreSize();
21831 unsigned Alignment = Store->getAlignment();
21832
21833 SDLoc DL(Store);
21834 SmallVector<SDValue, 4> Stores;
21835 for (unsigned i = 0; i != NumElems; ++i) {
21836 unsigned Offset = i * ScalarSize;
21837 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL);
21838 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
21839 DAG.getIntPtrConstant(i, DL));
21840 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
21841 Store->getPointerInfo().getWithOffset(Offset),
21842 MinAlign(Alignment, Offset),
21843 Store->getMemOperand()->getFlags());
21844 Stores.push_back(Ch);
21845 }
21846 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
21847}
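// Editor's note: a minimal sketch (not part of X86ISelLowering.cpp) of what
// scalarizeVectorStore emits: one scalar store per lane at byte offset
// i * ScalarSize, each with alignment MinAlign(Alignment, Offset).
#include <cstddef>
#include <cstring>

template <typename Elt, std::size_t N>
static void storeScalarized(const Elt (&Value)[N], unsigned char *Base) {
  for (std::size_t i = 0; i != N; ++i)
    std::memcpy(Base + i * sizeof(Elt), &Value[i], sizeof(Elt)); // one store per lane
}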
21848
21849static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
21850 SelectionDAG &DAG) {
21851 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
21852 SDLoc dl(St);
21853 SDValue StoredVal = St->getValue();
21854
21855 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
21856 if (StoredVal.getValueType().isVector() &&
21857 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
21858 assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
21859        "Unexpected VT");
21860 assert(!St->isTruncatingStore() && "Expected non-truncating store");
21861 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
21862        "Expected AVX512F without AVX512DQI");
21863
21864 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
21865 DAG.getUNDEF(MVT::v16i1), StoredVal,
21866 DAG.getIntPtrConstant(0, dl));
21867 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
21868 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
21869
21870 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
21871 St->getPointerInfo(), St->getAlignment(),
21872 St->getMemOperand()->getFlags());
21873 }
21874
21875 if (St->isTruncatingStore())
21876 return SDValue();
21877
21878 // If this is a 256-bit store of concatenated ops, we are better off splitting
21879 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
21880 // and each half can execute independently. Some cores would split the op into
21881 // halves anyway, so the concat (vinsertf128) is purely an extra op.
21882 MVT StoreVT = StoredVal.getSimpleValueType();
21883 if (StoreVT.is256BitVector()) {
21884 SmallVector<SDValue, 4> CatOps;
21885 if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
21886 return splitVectorStore(St, DAG);
21887 return SDValue();
21888 }
21889
21890 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21891 assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
21892        "Unexpected VT");
21893 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
21894        TargetLowering::TypeWidenVector && "Unexpected type action!");
21895
21896 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
21897 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
21898 DAG.getUNDEF(StoreVT));
21899
21900 if (Subtarget.hasSSE2()) {
21901 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
21902 // and store it.
21903 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
21904 MVT CastVT = MVT::getVectorVT(StVT, 2);
21905 StoredVal = DAG.getBitcast(CastVT, StoredVal);
21906 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
21907 DAG.getIntPtrConstant(0, dl));
21908
21909 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
21910 St->getPointerInfo(), St->getAlignment(),
21911 St->getMemOperand()->getFlags());
21912 }
21913 assert(Subtarget.hasSSE1() && "Expected SSE");
21914 SDVTList Tys = DAG.getVTList(MVT::Other);
21915 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
21916 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
21917 St->getMemOperand());
21918}
21919
21920// Lower vector extended loads using a shuffle. If SSSE3 is not available we
21921// may emit an illegal shuffle but the expansion is still better than scalar
21922 // code. We generate sext/sext_invec for SEXTLOADs if it's available; otherwise
21923 // we'll emit a shuffle and an arithmetic shift.
21924// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
21925// TODO: It is possible to support ZExt by zeroing the undef values during
21926// the shuffle phase or after the shuffle.
21927static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
21928 SelectionDAG &DAG) {
21929 MVT RegVT = Op.getSimpleValueType();
21930 assert(RegVT.isVector() && "We only custom lower vector loads.");
21931 assert(RegVT.isInteger() &&
21932        "We only custom lower integer vector loads.");
21933
21934 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
21935 SDLoc dl(Ld);
21936
21937 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
21938 if (RegVT.getVectorElementType() == MVT::i1) {
21939 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
21940 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
21941 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
21942        "Expected AVX512F without AVX512DQI");
21943
21944 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
21945 Ld->getPointerInfo(), Ld->getAlignment(),
21946 Ld->getMemOperand()->getFlags());
21947
21948 // Replace chain users with the new chain.
21949 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
21950
21951 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
21952 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
21953 DAG.getBitcast(MVT::v16i1, Val),
21954 DAG.getIntPtrConstant(0, dl));
21955 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
21956 }
21957
21958 return SDValue();
21959}
21960
21961 /// Return true if the node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes,
21962 /// each of which has no other use apart from the AND / OR.
21963static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
21964 Opc = Op.getOpcode();
21965 if (Opc != ISD::OR && Opc != ISD::AND)
21966 return false;
21967 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
21968 Op.getOperand(0).hasOneUse() &&
21969 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
21970 Op.getOperand(1).hasOneUse());
21971}
21972
21973 /// Return true if the node is an ISD::XOR of an X86ISD::SETCC and 1, and the
21974 /// SETCC node has a single use.
21975static bool isXor1OfSetCC(SDValue Op) {
21976 if (Op.getOpcode() != ISD::XOR)
21977 return false;
21978 if (isOneConstant(Op.getOperand(1)))
21979 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
21980 Op.getOperand(0).hasOneUse();
21981 return false;
21982}
21983
21984SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
21985 bool addTest = true;
21986 SDValue Chain = Op.getOperand(0);
21987 SDValue Cond = Op.getOperand(1);
21988 SDValue Dest = Op.getOperand(2);
21989 SDLoc dl(Op);
21990 SDValue CC;
21991 bool Inverted = false;
21992
21993 if (Cond.getOpcode() == ISD::SETCC) {
21994 // Check for setcc([su]{add,sub,mul}o == 0).
21995 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
21996 isNullConstant(Cond.getOperand(1)) &&
21997 Cond.getOperand(0).getResNo() == 1 &&
21998 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
21999 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
22000 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
22001 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
22002 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
22003 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
22004 Inverted = true;
22005 Cond = Cond.getOperand(0);
22006 } else {
22007 if (SDValue NewCond = LowerSETCC(Cond, DAG))
22008 Cond = NewCond;
22009 }
22010 }
22011#if 0
22012 // FIXME: LowerXALUO doesn't handle these!!
22013 else if (Cond.getOpcode() == X86ISD::ADD ||
22014 Cond.getOpcode() == X86ISD::SUB ||
22015 Cond.getOpcode() == X86ISD::SMUL ||
22016 Cond.getOpcode() == X86ISD::UMUL)
22017 Cond = LowerXALUO(Cond, DAG);
22018#endif
22019
22020 // Look past (and (setcc_carry (cmp ...)), 1).
22021 if (Cond.getOpcode() == ISD::AND &&
22022 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
22023 isOneConstant(Cond.getOperand(1)))
22024 Cond = Cond.getOperand(0);
22025
22026 // If condition flag is set by a X86ISD::CMP, then use it as the condition
22027 // setting operand in place of the X86ISD::SETCC.
22028 unsigned CondOpcode = Cond.getOpcode();
22029 if (CondOpcode == X86ISD::SETCC ||
22030 CondOpcode == X86ISD::SETCC_CARRY) {
22031 CC = Cond.getOperand(0);
22032
22033 SDValue Cmp = Cond.getOperand(1);
22034 unsigned Opc = Cmp.getOpcode();
22035 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
22036 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
22037 Cond = Cmp;
22038 addTest = false;
22039 } else {
22040 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
22041 default: break;
22042 case X86::COND_O:
22043 case X86::COND_B:
22044 // These can only come from an arithmetic instruction with overflow,
22045 // e.g. SADDO, UADDO.
22046 Cond = Cond.getOperand(1);
22047 addTest = false;
22048 break;
22049 }
22050 }
22051 }
22052 CondOpcode = Cond.getOpcode();
22053 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
22054 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
22055 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
22056 SDValue Value;
22057 X86::CondCode X86Cond;
22058 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
22059
22060 if (Inverted)
22061 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
22062
22063 CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
22064 addTest = false;
22065 } else {
22066 unsigned CondOpc;
22067 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
22068 SDValue Cmp = Cond.getOperand(0).getOperand(1);
22069 if (CondOpc == ISD::OR) {
22070 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
22071 // two branches instead of an explicit OR instruction with a
22072 // separate test.
22073 if (Cmp == Cond.getOperand(1).getOperand(1) &&
22074 isX86LogicalCmp(Cmp)) {
22075 CC = Cond.getOperand(0).getOperand(0);
22076 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
22077 Chain, Dest, CC, Cmp);
22078 CC = Cond.getOperand(1).getOperand(0);
22079 Cond = Cmp;
22080 addTest = false;
22081 }
22082 } else { // ISD::AND
22083 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
22084 // two branches instead of an explicit AND instruction with a
22085 // separate test. However, we only do this if this block doesn't
22086 // have a fall-through edge, because this requires an explicit
22087 // jmp when the condition is false.
22088 if (Cmp == Cond.getOperand(1).getOperand(1) &&
22089 isX86LogicalCmp(Cmp) &&
22090 Op.getNode()->hasOneUse()) {
22091 X86::CondCode CCode0 =
22092 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
22093 CCode0 = X86::GetOppositeBranchCondition(CCode0);
22094 CC = DAG.getTargetConstant(CCode0, dl, MVT::i8);
22095 SDNode *User = *Op.getNode()->use_begin();
22096 // Look for an unconditional branch following this conditional branch.
22097 // We need this because we need to reverse the successors in order
22098 // to implement FCMP_OEQ.
22099 if (User->getOpcode() == ISD::BR) {
22100 SDValue FalseBB = User->getOperand(1);
22101 SDNode *NewBR =
22102 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
22103 assert(NewBR == User);
22104 (void)NewBR;
22105 Dest = FalseBB;
22106
22107 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain,
22108 Dest, CC, Cmp);
22109 X86::CondCode CCode1 =
22110 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
22111 CCode1 = X86::GetOppositeBranchCondition(CCode1);
22112 CC = DAG.getTargetConstant(CCode1, dl, MVT::i8);
22113 Cond = Cmp;
22114 addTest = false;
22115 }
22116 }
22117 }
22118 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
22119 // Recognize the "xorb (setcc), 1" pattern. The xor inverts the condition.
22120 // It should be transformed by the DAG combiner except when the condition
22121 // is set by an arithmetic-with-overflow node.
22122 X86::CondCode CCode =
22123 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
22124 CCode = X86::GetOppositeBranchCondition(CCode);
22125 CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
22126 Cond = Cond.getOperand(0).getOperand(1);
22127 addTest = false;
22128 } else if (Cond.getOpcode() == ISD::SETCC &&
22129 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
22130 // For FCMP_OEQ, we can emit
22131 // two branches instead of an explicit AND instruction with a
22132 // separate test. However, we only do this if this block doesn't
22133 // have a fall-through edge, because this requires an explicit
22134 // jmp when the condition is false.
22135 if (Op.getNode()->hasOneUse()) {
22136 SDNode *User = *Op.getNode()->use_begin();
22137 // Look for an unconditional branch following this conditional branch.
22138 // We need this because we need to reverse the successors in order
22139 // to implement FCMP_OEQ.
22140 if (User->getOpcode() == ISD::BR) {
22141 SDValue FalseBB = User->getOperand(1);
22142 SDNode *NewBR =
22143 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
22144 assert(NewBR == User);
22145 (void)NewBR;
22146 Dest = FalseBB;
22147
22148 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
22149 Cond.getOperand(0), Cond.getOperand(1));
22150 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
22151 CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
22152 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
22153 Chain, Dest, CC, Cmp);
22154 CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
22155 Cond = Cmp;
22156 addTest = false;
22157 }
22158 }
22159 } else if (Cond.getOpcode() == ISD::SETCC &&
22160 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
22161 // For FCMP_UNE, we can emit
22162 // two branches instead of an explicit OR instruction with a
22163 // separate test.
22164 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
22165 Cond.getOperand(0), Cond.getOperand(1));
22166 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
22167 CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
22168 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
22169 Chain, Dest, CC, Cmp);
22170 CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
22171 Cond = Cmp;
22172 addTest = false;
22173 }
22174 }
22175
22176 if (addTest) {
22177 // Look past the truncate if the high bits are known zero.
22178 if (isTruncWithZeroHighBitsInput(Cond, DAG))
22179 Cond = Cond.getOperand(0);
22180
22181 // We know the result of AND is compared against zero. Try to match
22182 // it to BT.
22183 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
22184 SDValue BTCC;
22185 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, dl, DAG, BTCC)) {
22186 CC = BTCC;
22187 Cond = BT;
22188 addTest = false;
22189 }
22190 }
22191 }
22192
22193 if (addTest) {
22194 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
22195 CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
22196 Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()),
22197 X86Cond, dl, DAG);
22198 }
22199 Cond = ConvertCmpIfNecessary(Cond, DAG);
22200 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
22201 Chain, Dest, CC, Cond);
22202}
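// Editor's note: a rough sketch (not part of X86ISelLowering.cpp) of why the
// FCMP_UNE/FCMP_OEQ cases above emit two branches. UCOMISS/UCOMISD report
// "unordered" (a NaN operand) through PF, so "br (a une b), dest" becomes
// approximately:
//
//   ucomiss %xmm1, %xmm0
//   jne     dest            ; X86::COND_NE - the values differ
//   jp      dest            ; X86::COND_P  - unordered, i.e. a NaN operand
//
// FCMP_OEQ uses the same two conditions but branches to the false block and
// falls through to the true block, which is why the successors are swapped.
// In C++ terms the predicate being tested is simply:
static bool fcmpUNE(float A, float B) {
  return A != B; // operator!= on floats is exactly "unordered or not equal"
}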
22203
22204 // Lower dynamic stack allocation to an _alloca call for Cygwin/MinGW targets.
22205 // Calls to _alloca are needed to probe the stack when allocating more than 4k
22206 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
22207 // that the guard pages used by the OS virtual memory manager are allocated in
22208 // the correct sequence.
22209SDValue
22210X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
22211 SelectionDAG &DAG) const {
22212 MachineFunction &MF = DAG.getMachineFunction();
22213 bool SplitStack = MF.shouldSplitStack();
22214 bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
22215 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
22216 SplitStack || EmitStackProbe;
22217 SDLoc dl(Op);
22218
22219 // Get the inputs.
22220 SDNode *Node = Op.getNode();
22221 SDValue Chain = Op.getOperand(0);
22222 SDValue Size = Op.getOperand(1);
22223 MaybeAlign Alignment(Op.getConstantOperandVal(2));
22224 EVT VT = Node->getValueType(0);
22225
22226 // Chain the dynamic stack allocation so that it doesn't modify the stack
22227 // pointer when other instructions are using the stack.
22228 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
22229
22230 bool Is64Bit = Subtarget.is64Bit();
22231 MVT SPTy = getPointerTy(DAG.getDataLayout());
22232
22233 SDValue Result;
22234 if (!Lower) {
22235 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22236 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
22237 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
22238        " not tell us which reg is the stack pointer!");
22239
22240 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
22241 Chain = SP.getValue(1);
22242 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
22243 const Align StackAlign(TFI.getStackAlignment());
22244 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
22245 if (Alignment && Alignment > StackAlign)
22246 Result =
22247 DAG.getNode(ISD::AND, dl, VT, Result,
22248 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
22249 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
22250 } else if (SplitStack) {
22251 MachineRegisterInfo &MRI = MF.getRegInfo();
22252
22253 if (Is64Bit) {
22254 // The 64-bit implementation of segmented stacks needs to clobber both r10
22255 // and r11. This makes it impossible to use it along with nested parameters.
22256 const Function &F = MF.getFunction();
22257 for (const auto &A : F.args()) {
22258 if (A.hasNestAttr())
22259 report_fatal_error("Cannot use segmented stacks with functions that "
22260 "have nested arguments.");
22261 }
22262 }
22263
22264 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
22265 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
22266 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
22267 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
22268 DAG.getRegister(Vreg, SPTy));
22269 } else {
22270 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
22271 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
22272 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
22273
22274 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
22275 Register SPReg = RegInfo->getStackRegister();
22276 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
22277 Chain = SP.getValue(1);
22278
22279 if (Alignment) {
22280 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
22281 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
22282 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
22283 }
22284
22285 Result = SP;
22286 }
22287
22288 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
22289 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
22290
22291 SDValue Ops[2] = {Result, Chain};
22292 return DAG.getMergeValues(Ops, dl);
22293}
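// Editor's note: a minimal sketch (not part of X86ISelLowering.cpp) of the
// pointer arithmetic in the generic (!Lower) path above, assuming Alignment is
// a power of two. In that path the AND is only applied when the requested
// alignment exceeds the target's natural stack alignment.
#include <cstdint>

static uint64_t allocaNewSP(uint64_t SP, uint64_t Size, uint64_t Alignment) {
  uint64_t Result = SP - Size;  // ISD::SUB: the stack grows downward
  if (Alignment)
    Result &= ~(Alignment - 1); // ISD::AND with ~(Alignment - 1)
  return Result;                // copied back into the stack pointer register
}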
22294
22295SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
22296 MachineFunction &MF = DAG.getMachineFunction();
22297 auto PtrVT = getPointerTy(MF.getDataLayout());
22298 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
22299
22300 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
22301 SDLoc DL(Op);
22302
22303 if (!Subtarget.is64Bit() ||
22304 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
22305 // vastart just stores the address of the VarArgsFrameIndex slot into the
22306 // memory location argument.
22307 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
22308 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
22309 MachinePointerInfo(SV));
22310 }
22311
22312 // __va_list_tag:
22313 // gp_offset (0 - 6 * 8)
22314 // fp_offset (48 - 48 + 8 * 16)
22315 // overflow_arg_area (points to parameters passed in memory).
22316 // reg_save_area
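  // Editor's note (not part of X86ISelLowering.cpp): in C the SysV AMD64
  // va_list element looks like
  //
  //   typedef struct {
  //     unsigned int gp_offset;         // offset 0: next GP register slot (0..48)
  //     unsigned int fp_offset;         // offset 4: next XMM register slot (48..176)
  //     void        *overflow_arg_area; // offset 8: arguments passed on the stack
  //     void        *reg_save_area;     // offset 16: block of spilled registers
  //   } __va_list_tag;
  //
  // The four stores below fill these fields at offsets 0, 4, 8 and 16
  // (12 for reg_save_area under the ILP32 x32 ABI, cf. isTarget64BitLP64()).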
22317 SmallVector<SDValue, 8> MemOps;
22318 SDValue FIN = Op.getOperand(1);
22319 // Store gp_offset
22320 SDValue Store = DAG.getStore(
22321 Op.getOperand(0), DL,
22322 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
22323 MachinePointerInfo(SV));
22324 MemOps.push_back(Store);
22325
22326 // Store fp_offset
22327 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
22328 Store = DAG.getStore(
22329 Op.getOperand(0), DL,
22330 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
22331 MachinePointerInfo(SV, 4));
22332 MemOps.push_back(Store);
22333
22334 // Store ptr to overflow_arg_area
22335 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
22336 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
22337 Store =
22338 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
22339 MemOps.push_back(Store);
22340
22341 // Store ptr to reg_save_area.
22342 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
22343 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
22344 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
22345 Store = DAG.getStore(
22346 Op.getOperand(0), DL, RSFIN, FIN,
22347 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
22348 MemOps.push_back(Store);
22349 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
22350}
22351
22352SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
22353 assert(Subtarget.is64Bit() &&
22354        "LowerVAARG only handles 64-bit va_arg!");
22355 assert(Op.getNumOperands() == 4);
22356
22357 MachineFunction &MF = DAG.getMachineFunction();
22358 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
22359 // The Win64 ABI uses char* instead of a structure.
22360 return DAG.expandVAArg(Op.getNode());
22361
22362 SDValue Chain = Op.getOperand(0);
22363 SDValue SrcPtr = Op.getOperand(1);
22364 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
22365 unsigned Align = Op.getConstantOperandVal(3);
22366 SDLoc dl(Op);
22367
22368 EVT ArgVT = Op.getNode()->getValueType(0);
22369 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
22370 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
22371 uint8_t ArgMode;
22372
22373 // Decide which area this value should be read from.
22374 // TODO: Implement the AMD64 ABI in its entirety. This simple
22375 // selection mechanism works only for the basic types.
22376 if (ArgVT == MVT::f80) {
22377 llvm_unreachable("va_arg for f80 not yet implemented")::llvm::llvm_unreachable_internal("va_arg for f80 not yet implemented"
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22377)
;
22378 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
22379 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
22380 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
22381 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
22382 } else {
22383 llvm_unreachable("Unhandled argument type in LowerVAARG")::llvm::llvm_unreachable_internal("Unhandled argument type in LowerVAARG"
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22383)
;
22384 }
22385
22386 if (ArgMode == 2) {
22387 // Sanity Check: Make sure using fp_offset makes sense.
22388 assert(!Subtarget.useSoftFloat() &&
22389        !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
22390        Subtarget.hasSSE1());
22391 }
22392
22393 // Insert a VAARG_64 node into the DAG.
22394 // VAARG_64 returns two values: the variable argument address and the chain.
22395 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
22396 DAG.getConstant(ArgMode, dl, MVT::i8),
22397 DAG.getConstant(Align, dl, MVT::i32)};
22398 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
22399 SDValue VAARG = DAG.getMemIntrinsicNode(
22400 X86ISD::VAARG_64, dl,
22401 VTs, InstOps, MVT::i64,
22402 MachinePointerInfo(SV),
22403 /*Align=*/0,
22404 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
22405 Chain = VAARG.getValue(1);
22406
22407 // Load the next argument and return it
22408 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
22409}
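// Editor's note: a rough sketch (not part of X86ISelLowering.cpp) of the SysV
// AMD64 va_arg selection that the X86ISD::VAARG_64 node above is later expanded
// to (which is why it is modeled as both a load and a store on the va_list):
//
//   if (ArgMode == 1 /* integer class */ && ap->gp_offset < 6 * 8) {
//     addr = ap->reg_save_area + ap->gp_offset;  ap->gp_offset += 8;
//   } else if (ArgMode == 2 /* SSE class */ && ap->fp_offset < 48 + 8 * 16) {
//     addr = ap->reg_save_area + ap->fp_offset;  ap->fp_offset += 16;
//   } else {
//     addr = ap->overflow_arg_area;              // stack-passed argument
//     ap->overflow_arg_area += /* ArgSize rounded up to 8 */;
//   }
//
// The returned address is then loaded from, matching the DAG.getLoad above.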
22410
22411static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
22412 SelectionDAG &DAG) {
22413 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
22414 // where a va_list is still an i8*.
22415 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
22416 if (Subtarget.isCallingConvWin64(
22417 DAG.getMachineFunction().getFunction().getCallingConv()))
22418 // Probably a Win64 va_copy.
22419 return DAG.expandVACopy(Op.getNode());
22420
22421 SDValue Chain = Op.getOperand(0);
22422 SDValue DstPtr = Op.getOperand(1);
22423 SDValue SrcPtr = Op.getOperand(2);
22424 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
22425 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
22426 SDLoc DL(Op);
22427
22428 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
22429 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
22430 false, false,
22431 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
22432}
22433
22434// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
22435static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
22436 switch (Opc) {
22437 case ISD::SHL:
22438 case X86ISD::VSHL:
22439 case X86ISD::VSHLI:
22440 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
22441 case ISD::SRL:
22442 case X86ISD::VSRL:
22443 case X86ISD::VSRLI:
22444 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
22445 case ISD::SRA:
22446 case X86ISD::VSRA:
22447 case X86ISD::VSRAI:
22448 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
22449 }
22450 llvm_unreachable("Unknown target vector shift node")::llvm::llvm_unreachable_internal("Unknown target vector shift node"
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22450)
;
22451}
22452
22453/// Handle vector element shifts where the shift amount is a constant.
22454/// Takes immediate version of shift as input.
22455static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
22456 SDValue SrcOp, uint64_t ShiftAmt,
22457 SelectionDAG &DAG) {
22458 MVT ElementType = VT.getVectorElementType();
22459
22460 // Bitcast the source vector to the output type; this is mainly necessary for
22461 // vXi8/vXi64 shifts.
22462 if (VT != SrcOp.getSimpleValueType())
22463 SrcOp = DAG.getBitcast(VT, SrcOp);
22464
22465 // Fold this packed shift into its first operand if ShiftAmt is 0.
22466 if (ShiftAmt == 0)
22467 return SrcOp;
22468
22469 // Check for ShiftAmt >= element width
22470 if (ShiftAmt >= ElementType.getSizeInBits()) {
22471 if (Opc == X86ISD::VSRAI)
22472 ShiftAmt = ElementType.getSizeInBits() - 1;
22473 else
22474 return DAG.getConstant(0, dl, VT);
22475 }
22476
22477 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
22478        && "Unknown target vector shift-by-constant node");
22479
22480 // Fold this packed vector shift into a build vector if SrcOp is a
22481 // vector of Constants or UNDEFs.
22482 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
22483 SmallVector<SDValue, 8> Elts;
22484 unsigned NumElts = SrcOp->getNumOperands();
22485
22486 switch (Opc) {
22487 default: llvm_unreachable("Unknown opcode!");
22488 case X86ISD::VSHLI:
22489 for (unsigned i = 0; i != NumElts; ++i) {
22490 SDValue CurrentOp = SrcOp->getOperand(i);
22491 if (CurrentOp->isUndef()) {
22492 Elts.push_back(CurrentOp);
22493 continue;
22494 }
22495 auto *ND = cast<ConstantSDNode>(CurrentOp);
22496 const APInt &C = ND->getAPIntValue();
22497 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
22498 }
22499 break;
22500 case X86ISD::VSRLI:
22501 for (unsigned i = 0; i != NumElts; ++i) {
22502 SDValue CurrentOp = SrcOp->getOperand(i);
22503 if (CurrentOp->isUndef()) {
22504 Elts.push_back(CurrentOp);
22505 continue;
22506 }
22507 auto *ND = cast<ConstantSDNode>(CurrentOp);
22508 const APInt &C = ND->getAPIntValue();
22509 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
22510 }
22511 break;
22512 case X86ISD::VSRAI:
22513 for (unsigned i = 0; i != NumElts; ++i) {
22514 SDValue CurrentOp = SrcOp->getOperand(i);
22515 if (CurrentOp->isUndef()) {
22516 Elts.push_back(CurrentOp);
22517 continue;
22518 }
22519 auto *ND = cast<ConstantSDNode>(CurrentOp);
22520 const APInt &C = ND->getAPIntValue();
22521 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
22522 }
22523 break;
22524 }
22525
22526 return DAG.getBuildVector(VT, dl, Elts);
22527 }
22528
22529 return DAG.getNode(Opc, dl, VT, SrcOp,
22530 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
22531}
22532
22533/// Handle vector element shifts where the shift amount may or may not be a
22534/// constant. Takes immediate version of shift as input.
22535static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
22536 SDValue SrcOp, SDValue ShAmt,
22537 const X86Subtarget &Subtarget,
22538 SelectionDAG &DAG) {
22539 MVT SVT = ShAmt.getSimpleValueType();
22540 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
22541
22542 // Catch shift-by-constant.
22543 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
22544 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
22545 CShAmt->getZExtValue(), DAG);
22546
22547 // Change opcode to non-immediate version.
22548 Opc = getTargetVShiftUniformOpcode(Opc, true);
22549
22550 // Need to build a vector containing the shift amount.
22551 // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
22552 // +====================+============+=======================================+
22553 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
22554 // +====================+============+=======================================+
22555 // | i64 | Yes, No | Use ShAmt as lowest elt |
22556 // | i32 | Yes | zero-extend in-reg |
22557 // | (i32 zext(i16/i8)) | Yes | zero-extend in-reg |
22558 // | (i32 zext(i16/i8)) | No | byte-shift-in-reg |
22559 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
22560 // +====================+============+=======================================+
22561
22562 if (SVT == MVT::i64)
22563 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
22564 else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
22565 ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22566 (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
22567 ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
22568 ShAmt = ShAmt.getOperand(0);
22569 MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
22570 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
22571 if (Subtarget.hasSSE41())
22572 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
22573 MVT::v2i64, ShAmt);
22574 else {
22575 SDValue ByteShift = DAG.getTargetConstant(
22576 (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
22577 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
22578 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
22579 ByteShift);
22580 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
22581 ByteShift);
22582 }
22583 } else if (Subtarget.hasSSE41() &&
22584 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22585 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
22586 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
22587 MVT::v2i64, ShAmt);
22588 } else {
22589 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
22590 DAG.getUNDEF(SVT)};
22591 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
22592 }
22593
22594 // The return type has to be a 128-bit type with the same element
22595 // type as the input type.
22596 MVT EltVT = VT.getVectorElementType();
22597 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
22598
22599 ShAmt = DAG.getBitcast(ShVT, ShAmt);
22600 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
22601}
22602
22603/// Return \p Mask with the necessary casting or extending applied,
22604/// according to \p MaskVT, when lowering masking intrinsics.
22605static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
22606 const X86Subtarget &Subtarget, SelectionDAG &DAG,
22607 const SDLoc &dl) {
22608
22609 if (isAllOnesConstant(Mask))
22610 return DAG.getConstant(1, dl, MaskVT);
22611 if (X86::isZeroNode(Mask))
22612 return DAG.getConstant(0, dl, MaskVT);
22613
22614 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
22615
22616 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
22617 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
22618 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
22619 // In 32-bit mode, a bitcast of i64 is illegal, so extend/split it.
22620 SDValue Lo, Hi;
22621 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
22622 DAG.getConstant(0, dl, MVT::i32));
22623 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
22624 DAG.getConstant(1, dl, MVT::i32));
22625
22626 Lo = DAG.getBitcast(MVT::v32i1, Lo);
22627 Hi = DAG.getBitcast(MVT::v32i1, Hi);
22628
22629 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
22630 } else {
22631 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
22632 Mask.getSimpleValueType().getSizeInBits());
22633 // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements are
22634 // extracted by EXTRACT_SUBVECTOR.
22635 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
22636 DAG.getBitcast(BitcastVT, Mask),
22637 DAG.getIntPtrConstant(0, dl));
22638 }
22639}
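// Illustrative examples (not part of the original source) of what getMaskNode
// produces:
//   i8 mask,  MaskVT = v2i1 : bitcast to v8i1, then EXTRACT_SUBVECTOR of the
//   low 2 elements.
//   i64 mask, MaskVT = v64i1 on a 32-bit AVX512BW target: split into two i32
//   halves, bitcast each to v32i1, then CONCAT_VECTORS into v64i1.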
22640
22641/// Return (and \p Op, \p Mask) for compare instructions, or
22642/// (vselect \p Mask, \p Op, \p PreservedSrc) for others, along with the
22643/// necessary casting or extending of \p Mask when lowering masking intrinsics.
22644static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
22645 SDValue PreservedSrc,
22646 const X86Subtarget &Subtarget,
22647 SelectionDAG &DAG) {
22648 MVT VT = Op.getSimpleValueType();
22649 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
22650 unsigned OpcodeSelect = ISD::VSELECT;
22651 SDLoc dl(Op);
22652
22653 if (isAllOnesConstant(Mask))
22654 return Op;
22655
22656 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
22657
22658 if (PreservedSrc.isUndef())
22659 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
22660 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
22661}
22662
22663/// Creates an SDNode for a predicated scalar operation.
22664/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
22665/// The mask comes in as MVT::i8 and is transformed
22666/// to MVT::v1i1 while lowering masking intrinsics.
22667/// The main difference between ScalarMaskingNode and VectorMaskingNode is that
22668/// the former uses "X86select" instead of "vselect", since a "vselect" node
22669/// cannot be created for a scalar instruction.
22670static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
22671 SDValue PreservedSrc,
22672 const X86Subtarget &Subtarget,
22673 SelectionDAG &DAG) {
22674
22675 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
22676 if (MaskConst->getZExtValue() & 0x1)
22677 return Op;
22678
22679 MVT VT = Op.getSimpleValueType();
22680 SDLoc dl(Op);
22681
22682 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
22683 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
22684 DAG.getBitcast(MVT::v8i1, Mask),
22685 DAG.getIntPtrConstant(0, dl));
22686 if (Op.getOpcode() == X86ISD::FSETCCM ||
22687 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
22688 Op.getOpcode() == X86ISD::VFPCLASSS)
22689 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
22690
22691 if (PreservedSrc.isUndef())
22692 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
22693 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
22694}
22695
22696static int getSEHRegistrationNodeSize(const Function *Fn) {
22697 if (!Fn->hasPersonalityFn())
22698 report_fatal_error(
22699 "querying registration node size for function without personality");
22700 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
22701 // WinEHStatePass for the full struct definition.
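  // (That is 6 * 4 = 24 bytes for SEH and 4 * 4 = 16 bytes for C++ EH, matching
  // the values returned below.)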
22702 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
22703 case EHPersonality::MSVC_X86SEH: return 24;
22704 case EHPersonality::MSVC_CXX: return 16;
22705 default: break;
22706 }
22707 report_fatal_error(
22708 "can only recover FP for 32-bit MSVC EH personality functions");
22709}
22710
22711/// When the MSVC runtime transfers control to us, either to an outlined
22712/// function or when returning to a parent frame after catching an exception, we
22713/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
22714/// Here's the math:
22715/// RegNodeBase = EntryEBP - RegNodeSize
22716/// ParentFP = RegNodeBase - ParentFrameOffset
22717/// Subtracting RegNodeSize takes us to the offset of the registration node, and
22718/// subtracting the offset (negative on x86) takes us back to the parent FP.
22719static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
22720 SDValue EntryEBP) {
22721 MachineFunction &MF = DAG.getMachineFunction();
22722 SDLoc dl;
22723
22724 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22725 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
22726
22727 // It's possible that the parent function no longer has a personality function
22728 // if the exceptional code was optimized away, in which case we just return
22729 // the incoming EBP.
22730 if (!Fn->hasPersonalityFn())
22731 return EntryEBP;
22732
22733 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
22734 // registration, or the .set_setframe offset.
22735 MCSymbol *OffsetSym =
22736 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
22737 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
22738 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
22739 SDValue ParentFrameOffset =
22740 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
22741
22742 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
22743 // prologue to RBP in the parent function.
22744 const X86Subtarget &Subtarget =
22745 static_cast<const X86Subtarget &>(DAG.getSubtarget());
22746 if (Subtarget.is64Bit())
22747 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
22748
22749 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
22750 // RegNodeBase = EntryEBP - RegNodeSize
22751 // ParentFP = RegNodeBase - ParentFrameOffset
22752 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
22753 DAG.getConstant(RegNodeSize, dl, PtrVT));
22754 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
22755}
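// Worked example for the arithmetic above (hypothetical values, not part of
// the original source): with RegNodeSize = 16 (MSVC C++ EH) and a recorded
// ParentFrameOffset of -32 (offsets are negative on x86):
//   RegNodeBase = EntryEBP - 16
//   ParentFP    = RegNodeBase - (-32) = EntryEBP + 16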
22756
22757SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
22758 SelectionDAG &DAG) const {
22759 // Helper to detect whether the operand is the CUR_DIRECTION rounding mode.
22760 auto isRoundModeCurDirection = [](SDValue Rnd) {
22761 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
22762 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
22763
22764 return false;
22765 };
22766 auto isRoundModeSAE = [](SDValue Rnd) {
22767 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
22768 unsigned RC = C->getZExtValue();
22769 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
22770 // Clear the NO_EXC bit and check remaining bits.
22771 RC ^= X86::STATIC_ROUNDING::NO_EXC;
22772 // As a convenience, we allow either no other bits set or explicitly
22773 // the current-direction rounding mode.
22774 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
22775 }
22776 }
22777
22778 return false;
22779 };
22780 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
22781 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
22782 RC = C->getZExtValue();
22783 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
22784 // Clear the NO_EXC bit and check remaining bits.
22785 RC ^= X86::STATIC_ROUNDING::NO_EXC;
22786 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
22787 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
22788 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
22789 RC == X86::STATIC_ROUNDING::TO_ZERO;
22790 }
22791 }
22792
22793 return false;
22794 };
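  // Illustrative values (assuming the usual X86::STATIC_ROUNDING encoding,
  // where NO_EXC = 8 and CUR_DIRECTION = 4):
  //   isRoundModeSAE(8)          -> true  (SAE with no explicit rounding)
  //   isRoundModeSAE(8 | 4)      -> true  (SAE plus current direction)
  //   isRoundModeSAEToX(8 | 1)   -> true, RC = 1 (TO_NEG_INF)
  //   isRoundModeCurDirection(4) -> true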
22795
22796 SDLoc dl(Op);
22797 unsigned IntNo = Op.getConstantOperandVal(0);
22798 MVT VT = Op.getSimpleValueType();
22799 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
22800 if (IntrData) {
22801 switch(IntrData->Type) {
22802 case INTR_TYPE_1OP: {
22803 // We specify 2 possible opcodes for intrinsics with rounding modes.
22804 // First, we check if the intrinsic may have a non-default rounding mode
22805 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
22806 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
22807 if (IntrWithRoundingModeOpcode != 0) {
22808 SDValue Rnd = Op.getOperand(2);
22809 unsigned RC = 0;
22810 if (isRoundModeSAEToX(Rnd, RC))
22811 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
22812 Op.getOperand(1),
22813 DAG.getTargetConstant(RC, dl, MVT::i32));
22814 if (!isRoundModeCurDirection(Rnd))
22815 return SDValue();
22816 }
22817 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
22818 }
22819 case INTR_TYPE_1OP_SAE: {
22820 SDValue Sae = Op.getOperand(2);
22821
22822 unsigned Opc;
22823 if (isRoundModeCurDirection(Sae))
22824 Opc = IntrData->Opc0;
22825 else if (isRoundModeSAE(Sae))
22826 Opc = IntrData->Opc1;
22827 else
22828 return SDValue();
22829
22830 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
22831 }
22832 case INTR_TYPE_2OP: {
22833 SDValue Src2 = Op.getOperand(2);
22834
22835 // We specify 2 possible opcodes for intrinsics with rounding modes.
22836 // First, we check if the intrinsic may have a non-default rounding mode
22837 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
22838 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
22839 if (IntrWithRoundingModeOpcode != 0) {
22840 SDValue Rnd = Op.getOperand(3);
22841 unsigned RC = 0;
22842 if (isRoundModeSAEToX(Rnd, RC))
22843 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
22844 Op.getOperand(1), Src2,
22845 DAG.getTargetConstant(RC, dl, MVT::i32));
22846 if (!isRoundModeCurDirection(Rnd))
22847 return SDValue();
22848 }
22849
22850 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
22851 Op.getOperand(1), Src2);
22852 }
22853 case INTR_TYPE_2OP_SAE: {
22854 SDValue Sae = Op.getOperand(3);
22855
22856 unsigned Opc;
22857 if (isRoundModeCurDirection(Sae))
22858 Opc = IntrData->Opc0;
22859 else if (isRoundModeSAE(Sae))
22860 Opc = IntrData->Opc1;
22861 else
22862 return SDValue();
22863
22864 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
22865 Op.getOperand(2));
22866 }
22867 case INTR_TYPE_3OP:
22868 case INTR_TYPE_3OP_IMM8: {
22869 SDValue Src1 = Op.getOperand(1);
22870 SDValue Src2 = Op.getOperand(2);
22871 SDValue Src3 = Op.getOperand(3);
22872
22873 // We specify 2 possible opcodes for intrinsics with rounding modes.
22874 // First, we check if the intrinsic may have a non-default rounding mode
22875 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
22876 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
22877 if (IntrWithRoundingModeOpcode != 0) {
22878 SDValue Rnd = Op.getOperand(4);
22879 unsigned RC = 0;
22880 if (isRoundModeSAEToX(Rnd, RC))
22881 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
22882 Src1, Src2, Src3,
22883 DAG.getTargetConstant(RC, dl, MVT::i32));
22884 if (!isRoundModeCurDirection(Rnd))
22885 return SDValue();
22886 }
22887
22888 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
22889 Src1, Src2, Src3);
22890 }
22891 case INTR_TYPE_4OP:
22892 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
22893 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
22894 case INTR_TYPE_1OP_MASK: {
22895 SDValue Src = Op.getOperand(1);
22896 SDValue PassThru = Op.getOperand(2);
22897 SDValue Mask = Op.getOperand(3);
22898 // We add rounding mode to the Node when
22899 // - RC Opcode is specified and
22900 // - RC is not "current direction".
22901 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
22902 if (IntrWithRoundingModeOpcode != 0) {
22903 SDValue Rnd = Op.getOperand(4);
22904 unsigned RC = 0;
22905 if (isRoundModeSAEToX(Rnd, RC))
22906 return getVectorMaskingNode(
22907 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
22908 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
22909 Mask, PassThru, Subtarget, DAG);
22910 if (!isRoundModeCurDirection(Rnd))
22911 return SDValue();
22912 }
22913 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
22914 Mask, PassThru, Subtarget, DAG);
22915 }
22916 case INTR_TYPE_1OP_MASK_SAE: {
22917 SDValue Src = Op.getOperand(1);
22918 SDValue PassThru = Op.getOperand(2);
22919 SDValue Mask = Op.getOperand(3);
22920 SDValue Rnd = Op.getOperand(4);
22921
22922 unsigned Opc;
22923 if (isRoundModeCurDirection(Rnd))
22924 Opc = IntrData->Opc0;
22925 else if (isRoundModeSAE(Rnd))
22926 Opc = IntrData->Opc1;
22927 else
22928 return SDValue();
22929
22930 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src),
22931 Mask, PassThru, Subtarget, DAG);
22932 }
22933 case INTR_TYPE_SCALAR_MASK: {
22934 SDValue Src1 = Op.getOperand(1);
22935 SDValue Src2 = Op.getOperand(2);
22936 SDValue passThru = Op.getOperand(3);
22937 SDValue Mask = Op.getOperand(4);
22938 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
22939 // There are 2 kinds of intrinsics in this group:
22940 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
22941 // (2) With rounding mode and sae - 7 operands.
22942 bool HasRounding = IntrWithRoundingModeOpcode != 0;
22943 if (Op.getNumOperands() == (5U + HasRounding)) {
22944 if (HasRounding) {
22945 SDValue Rnd = Op.getOperand(5);
22946 unsigned RC = 0;
22947 if (isRoundModeSAEToX(Rnd, RC))
22948 return getScalarMaskingNode(
22949 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
22950 DAG.getTargetConstant(RC, dl, MVT::i32)),
22951 Mask, passThru, Subtarget, DAG);
22952 if (!isRoundModeCurDirection(Rnd))
22953 return SDValue();
22954 }
22955 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
22956 Src2),
22957 Mask, passThru, Subtarget, DAG);
22958 }
22959
22960 assert(Op.getNumOperands() == (6U + HasRounding) &&
22961 "Unexpected intrinsic form");
22962 SDValue RoundingMode = Op.getOperand(5);
22963 unsigned Opc = IntrData->Opc0;
22964 if (HasRounding) {
22965 SDValue Sae = Op.getOperand(6);
22966 if (isRoundModeSAE(Sae))
22967 Opc = IntrWithRoundingModeOpcode;
22968 else if (!isRoundModeCurDirection(Sae))
22969 return SDValue();
22970 }
22971 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
22972 Src2, RoundingMode),
22973 Mask, passThru, Subtarget, DAG);
22974 }
22975 case INTR_TYPE_SCALAR_MASK_RND: {
22976 SDValue Src1 = Op.getOperand(1);
22977 SDValue Src2 = Op.getOperand(2);
22978 SDValue passThru = Op.getOperand(3);
22979 SDValue Mask = Op.getOperand(4);
22980 SDValue Rnd = Op.getOperand(5);
22981
22982 SDValue NewOp;
22983 unsigned RC = 0;
22984 if (isRoundModeCurDirection(Rnd))
22985 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
22986 else if (isRoundModeSAEToX(Rnd, RC))
22987 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
22988 DAG.getTargetConstant(RC, dl, MVT::i32));
22989 else
22990 return SDValue();
22991
22992 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
22993 }
22994 case INTR_TYPE_SCALAR_MASK_SAE: {
22995 SDValue Src1 = Op.getOperand(1);
22996 SDValue Src2 = Op.getOperand(2);
22997 SDValue passThru = Op.getOperand(3);
22998 SDValue Mask = Op.getOperand(4);
22999 SDValue Sae = Op.getOperand(5);
23000 unsigned Opc;
23001 if (isRoundModeCurDirection(Sae))
23002 Opc = IntrData->Opc0;
23003 else if (isRoundModeSAE(Sae))
23004 Opc = IntrData->Opc1;
23005 else
23006 return SDValue();
23007
23008 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
23009 Mask, passThru, Subtarget, DAG);
23010 }
23011 case INTR_TYPE_2OP_MASK: {
23012 SDValue Src1 = Op.getOperand(1);
23013 SDValue Src2 = Op.getOperand(2);
23014 SDValue PassThru = Op.getOperand(3);
23015 SDValue Mask = Op.getOperand(4);
23016 SDValue NewOp;
23017 if (IntrData->Opc1 != 0) {
23018 SDValue Rnd = Op.getOperand(5);
23019 unsigned RC = 0;
23020 if (isRoundModeSAEToX(Rnd, RC))
23021 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
23022 DAG.getTargetConstant(RC, dl, MVT::i32));
23023 else if (!isRoundModeCurDirection(Rnd))
23024 return SDValue();
23025 }
23026 if (!NewOp)
23027 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
23028 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
23029 }
23030 case INTR_TYPE_2OP_MASK_SAE: {
23031 SDValue Src1 = Op.getOperand(1);
23032 SDValue Src2 = Op.getOperand(2);
23033 SDValue PassThru = Op.getOperand(3);
23034 SDValue Mask = Op.getOperand(4);
23035
23036 unsigned Opc = IntrData->Opc0;
23037 if (IntrData->Opc1 != 0) {
23038 SDValue Sae = Op.getOperand(5);
23039 if (isRoundModeSAE(Sae))
23040 Opc = IntrData->Opc1;
23041 else if (!isRoundModeCurDirection(Sae))
23042 return SDValue();
23043 }
23044
23045 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
23046 Mask, PassThru, Subtarget, DAG);
23047 }
23048 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
23049 SDValue Src1 = Op.getOperand(1);
23050 SDValue Src2 = Op.getOperand(2);
23051 SDValue Src3 = Op.getOperand(3);
23052 SDValue PassThru = Op.getOperand(4);
23053 SDValue Mask = Op.getOperand(5);
23054 SDValue Sae = Op.getOperand(6);
23055 unsigned Opc;
23056 if (isRoundModeCurDirection(Sae))
23057 Opc = IntrData->Opc0;
23058 else if (isRoundModeSAE(Sae))
23059 Opc = IntrData->Opc1;
23060 else
23061 return SDValue();
23062
23063 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
23064 Mask, PassThru, Subtarget, DAG);
23065 }
23066 case INTR_TYPE_3OP_MASK_SAE: {
23067 SDValue Src1 = Op.getOperand(1);
23068 SDValue Src2 = Op.getOperand(2);
23069 SDValue Src3 = Op.getOperand(3);
23070 SDValue PassThru = Op.getOperand(4);
23071 SDValue Mask = Op.getOperand(5);
23072
23073 unsigned Opc = IntrData->Opc0;
23074 if (IntrData->Opc1 != 0) {
23075 SDValue Sae = Op.getOperand(6);
23076 if (isRoundModeSAE(Sae))
23077 Opc = IntrData->Opc1;
23078 else if (!isRoundModeCurDirection(Sae))
23079 return SDValue();
23080 }
23081 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
23082 Mask, PassThru, Subtarget, DAG);
23083 }
23084 case BLENDV: {
23085 SDValue Src1 = Op.getOperand(1);
23086 SDValue Src2 = Op.getOperand(2);
23087 SDValue Src3 = Op.getOperand(3);
23088
23089 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
23090 Src3 = DAG.getBitcast(MaskVT, Src3);
23091
23092 // Reverse the operands to match VSELECT order.
23093 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
23094 }
23095 case VPERM_2OP : {
23096 SDValue Src1 = Op.getOperand(1);
23097 SDValue Src2 = Op.getOperand(2);
23098
23099 // Swap Src1 and Src2 in the node creation
23100 return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
23101 }
23102 case IFMA_OP:
23103 // NOTE: We need to swizzle the operands to pass the multiply operands
23104 // first.
23105 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
23106 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
23107 case FPCLASSS: {
23108 SDValue Src1 = Op.getOperand(1);
23109 SDValue Imm = Op.getOperand(2);
23110 SDValue Mask = Op.getOperand(3);
23111 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
23112 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
23113 Subtarget, DAG);
23114 // Need to fill with zeros to ensure the bitcast will produce zeroes
23115 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
23116 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
23117 DAG.getConstant(0, dl, MVT::v8i1),
23118 FPclassMask, DAG.getIntPtrConstant(0, dl));
23119 return DAG.getBitcast(MVT::i8, Ins);
23120 }
23121
23122 case CMP_MASK_CC: {
23123 MVT MaskVT = Op.getSimpleValueType();
23124 SDValue CC = Op.getOperand(3);
23125 // We specify 2 possible opcodes for intrinsics with rounding modes.
23126 // First, we check if the intrinsic may have a non-default rounding mode
23127 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
23128 if (IntrData->Opc1 != 0) {
23129 SDValue Sae = Op.getOperand(4);
23130 if (isRoundModeSAE(Sae))
23131 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
23132 Op.getOperand(2), CC, Sae);
23133 if (!isRoundModeCurDirection(Sae))
23134 return SDValue();
23135 }
23136 // Default rounding mode.
23137 return DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
23138 Op.getOperand(2), CC);
23139 }
23140 case CMP_MASK_SCALAR_CC: {
23141 SDValue Src1 = Op.getOperand(1);
23142 SDValue Src2 = Op.getOperand(2);
23143 SDValue CC = Op.getOperand(3);
23144 SDValue Mask = Op.getOperand(4);
23145
23146 SDValue Cmp;
23147 if (IntrData->Opc1 != 0) {
23148 SDValue Sae = Op.getOperand(5);
23149 if (isRoundModeSAE(Sae))
23150 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
23151 else if (!isRoundModeCurDirection(Sae))
23152 return SDValue();
23153 }
23154 // Default rounding mode.
23155 if (!Cmp.getNode())
23156 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
23157
23158 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
23159 Subtarget, DAG);
23160 // Need to fill with zeros to ensure the bitcast will produce zeroes
23161 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
23162 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
23163 DAG.getConstant(0, dl, MVT::v8i1),
23164 CmpMask, DAG.getIntPtrConstant(0, dl));
23165 return DAG.getBitcast(MVT::i8, Ins);
23166 }
23167 case COMI: { // Comparison intrinsics
23168 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
23169 SDValue LHS = Op.getOperand(1);
23170 SDValue RHS = Op.getOperand(2);
23171 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
23172 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
23173 SDValue SetCC;
23174 switch (CC) {
23175 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
23176 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
23177 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
23178 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
23179 break;
23180 }
23181 case ISD::SETNE: { // (ZF = 1 or PF = 1)
23182 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
23183 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
23184 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
23185 break;
23186 }
23187 case ISD::SETGT: // (CF = 0 and ZF = 0)
23188 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
23189 break;
23190 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
23191 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
23192 break;
23193 }
23194 case ISD::SETGE: // CF = 0
23195 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
23196 break;
23197 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
23198 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
23199 break;
23200 default:
23201 llvm_unreachable("Unexpected illegal condition!");
23202 }
23203 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
23204 }
23205 case COMI_RM: { // Comparison intrinsics with Sae
23206 SDValue LHS = Op.getOperand(1);
23207 SDValue RHS = Op.getOperand(2);
23208 unsigned CondVal = Op.getConstantOperandVal(3);
23209 SDValue Sae = Op.getOperand(4);
23210
23211 SDValue FCmp;
23212 if (isRoundModeCurDirection(Sae))
23213 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
23214 DAG.getTargetConstant(CondVal, dl, MVT::i8));
23215 else if (isRoundModeSAE(Sae))
23216 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
23217 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
23218 else
23219 return SDValue();
23220 // Need to fill with zeros to ensure the bitcast will produce zeroes
23221 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
23222 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
23223 DAG.getConstant(0, dl, MVT::v16i1),
23224 FCmp, DAG.getIntPtrConstant(0, dl));
23225 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
23226 DAG.getBitcast(MVT::i16, Ins));
23227 }
23228 case VSHIFT:
23229 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
23230 Op.getOperand(1), Op.getOperand(2), Subtarget,
23231 DAG);
23232 case COMPRESS_EXPAND_IN_REG: {
23233 SDValue Mask = Op.getOperand(3);
23234 SDValue DataToCompress = Op.getOperand(1);
23235 SDValue PassThru = Op.getOperand(2);
23236 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
23237 return Op.getOperand(1);
23238
23239 // Avoid false dependency.
23240 if (PassThru.isUndef())
23241 PassThru = DAG.getConstant(0, dl, VT);
23242
23243 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
23244 Mask);
23245 }
23246 case FIXUPIMM:
23247 case FIXUPIMM_MASKZ: {
23248 SDValue Src1 = Op.getOperand(1);
23249 SDValue Src2 = Op.getOperand(2);
23250 SDValue Src3 = Op.getOperand(3);
23251 SDValue Imm = Op.getOperand(4);
23252 SDValue Mask = Op.getOperand(5);
23253 SDValue Passthru = (IntrData->Type == FIXUPIMM)
23254 ? Src1
23255 : getZeroVector(VT, Subtarget, DAG, dl);
23256
23257 unsigned Opc = IntrData->Opc0;
23258 if (IntrData->Opc1 != 0) {
23259 SDValue Sae = Op.getOperand(6);
23260 if (isRoundModeSAE(Sae))
23261 Opc = IntrData->Opc1;
23262 else if (!isRoundModeCurDirection(Sae))
23263 return SDValue();
23264 }
23265
23266 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
23267
23268 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
23269 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
23270
23271 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
23272 }
23273 case ROUNDP: {
23274 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
23275 // Clear the upper bits of the rounding immediate so that the legacy
23276 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
23277 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
23278 SDValue RoundingMode =
23279 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
23280 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
23281 Op.getOperand(1), RoundingMode);
23282 }
23283 case ROUNDS: {
23284 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
23285 // Clear the upper bits of the rounding immediate so that the legacy
23286 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
23287 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
23288 SDValue RoundingMode =
23289 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
23290 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
23291 Op.getOperand(1), Op.getOperand(2), RoundingMode);
23292 }
23293 case BEXTRI: {
23294 assert(IntrData->Opc0 == X86ISD::BEXTR && "Unexpected opcode");
23295
23296 // The control is a TargetConstant, but we need to convert it to a
23297 // ConstantSDNode.
23298 uint64_t Imm = Op.getConstantOperandVal(2);
23299 SDValue Control = DAG.getConstant(Imm, dl, Op.getValueType());
23300 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
23301 Op.getOperand(1), Control);
23302 }
23303 // ADC/ADCX/SBB
23304 case ADX: {
23305 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
23306 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
23307
23308 SDValue Res;
23309 // If the carry in is zero, then we should just use ADD/SUB instead of
23310 // ADC/SBB.
23311 if (isNullConstant(Op.getOperand(1))) {
23312 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
23313 Op.getOperand(3));
23314 } else {
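      // Adding -1 (0xFF) to the i8 carry-in sets CF exactly when the carry-in
      // is nonzero, so GenCF.getValue(1) reconstitutes the incoming carry flag
      // for the ADC/SBB below.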
23315 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
23316 DAG.getConstant(-1, dl, MVT::i8));
23317 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
23318 Op.getOperand(3), GenCF.getValue(1));
23319 }
23320 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
23321 SDValue Results[] = { SetCC, Res };
23322 return DAG.getMergeValues(Results, dl);
23323 }
23324 case CVTPD2PS_MASK:
23325 case CVTPD2DQ_MASK:
23326 case CVTQQ2PS_MASK:
23327 case TRUNCATE_TO_REG: {
23328 SDValue Src = Op.getOperand(1);
23329 SDValue PassThru = Op.getOperand(2);
23330 SDValue Mask = Op.getOperand(3);
23331
23332 if (isAllOnesConstant(Mask))
23333 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
23334
23335 MVT SrcVT = Src.getSimpleValueType();
23336 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
23337 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
23338 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
23339 Mask);
23340 }
23341 case CVTPS2PH_MASK: {
23342 SDValue Src = Op.getOperand(1);
23343 SDValue Rnd = Op.getOperand(2);
23344 SDValue PassThru = Op.getOperand(3);
23345 SDValue Mask = Op.getOperand(4);
23346
23347 if (isAllOnesConstant(Mask))
23348 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
23349
23350 MVT SrcVT = Src.getSimpleValueType();
23351 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
23352 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
23353 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
23354 PassThru, Mask);
23355
23356 }
23357 case CVTNEPS2BF16_MASK: {
23358 SDValue Src = Op.getOperand(1);
23359 SDValue PassThru = Op.getOperand(2);
23360 SDValue Mask = Op.getOperand(3);
23361
23362 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
23363 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
23364
23365 // Break false dependency.
23366 if (PassThru.isUndef())
23367 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
23368
23369 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
23370 Mask);
23371 }
23372 default:
23373 break;
23374 }
23375 }
23376
23377 switch (IntNo) {
23378 default: return SDValue(); // Don't custom lower most intrinsics.
23379
23380 // ptest and testp intrinsics. The intrinsics these come from are designed to
23381 // return an integer value, not just an instruction, so lower them to the ptest
23382 // or testp pattern and a setcc for the result.
23383 case Intrinsic::x86_avx512_ktestc_b:
23384 case Intrinsic::x86_avx512_ktestc_w:
23385 case Intrinsic::x86_avx512_ktestc_d:
23386 case Intrinsic::x86_avx512_ktestc_q:
23387 case Intrinsic::x86_avx512_ktestz_b:
23388 case Intrinsic::x86_avx512_ktestz_w:
23389 case Intrinsic::x86_avx512_ktestz_d:
23390 case Intrinsic::x86_avx512_ktestz_q:
23391 case Intrinsic::x86_sse41_ptestz:
23392 case Intrinsic::x86_sse41_ptestc:
23393 case Intrinsic::x86_sse41_ptestnzc:
23394 case Intrinsic::x86_avx_ptestz_256:
23395 case Intrinsic::x86_avx_ptestc_256:
23396 case Intrinsic::x86_avx_ptestnzc_256:
23397 case Intrinsic::x86_avx_vtestz_ps:
23398 case Intrinsic::x86_avx_vtestc_ps:
23399 case Intrinsic::x86_avx_vtestnzc_ps:
23400 case Intrinsic::x86_avx_vtestz_pd:
23401 case Intrinsic::x86_avx_vtestc_pd:
23402 case Intrinsic::x86_avx_vtestnzc_pd:
23403 case Intrinsic::x86_avx_vtestz_ps_256:
23404 case Intrinsic::x86_avx_vtestc_ps_256:
23405 case Intrinsic::x86_avx_vtestnzc_ps_256:
23406 case Intrinsic::x86_avx_vtestz_pd_256:
23407 case Intrinsic::x86_avx_vtestc_pd_256:
23408 case Intrinsic::x86_avx_vtestnzc_pd_256: {
23409 unsigned TestOpc = X86ISD::PTEST;
23410 X86::CondCode X86CC;
23411 switch (IntNo) {
23412 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
23413 case Intrinsic::x86_avx512_ktestc_b:
23414 case Intrinsic::x86_avx512_ktestc_w:
23415 case Intrinsic::x86_avx512_ktestc_d:
23416 case Intrinsic::x86_avx512_ktestc_q:
23417 // CF = 1
23418 TestOpc = X86ISD::KTEST;
23419 X86CC = X86::COND_B;
23420 break;
23421 case Intrinsic::x86_avx512_ktestz_b:
23422 case Intrinsic::x86_avx512_ktestz_w:
23423 case Intrinsic::x86_avx512_ktestz_d:
23424 case Intrinsic::x86_avx512_ktestz_q:
23425 TestOpc = X86ISD::KTEST;
23426 X86CC = X86::COND_E;
23427 break;
23428 case Intrinsic::x86_avx_vtestz_ps:
23429 case Intrinsic::x86_avx_vtestz_pd:
23430 case Intrinsic::x86_avx_vtestz_ps_256:
23431 case Intrinsic::x86_avx_vtestz_pd_256:
23432 TestOpc = X86ISD::TESTP;
23433 LLVM_FALLTHROUGH;
23434 case Intrinsic::x86_sse41_ptestz:
23435 case Intrinsic::x86_avx_ptestz_256:
23436 // ZF = 1
23437 X86CC = X86::COND_E;
23438 break;
23439 case Intrinsic::x86_avx_vtestc_ps:
23440 case Intrinsic::x86_avx_vtestc_pd:
23441 case Intrinsic::x86_avx_vtestc_ps_256:
23442 case Intrinsic::x86_avx_vtestc_pd_256:
23443 TestOpc = X86ISD::TESTP;
23444 LLVM_FALLTHROUGH;
23445 case Intrinsic::x86_sse41_ptestc:
23446 case Intrinsic::x86_avx_ptestc_256:
23447 // CF = 1
23448 X86CC = X86::COND_B;
23449 break;
23450 case Intrinsic::x86_avx_vtestnzc_ps:
23451 case Intrinsic::x86_avx_vtestnzc_pd:
23452 case Intrinsic::x86_avx_vtestnzc_ps_256:
23453 case Intrinsic::x86_avx_vtestnzc_pd_256:
23454 TestOpc = X86ISD::TESTP;
23455 LLVM_FALLTHROUGH;
23456 case Intrinsic::x86_sse41_ptestnzc:
23457 case Intrinsic::x86_avx_ptestnzc_256:
23458 // ZF and CF = 0
23459 X86CC = X86::COND_A;
23460 break;
23461 }
23462
23463 SDValue LHS = Op.getOperand(1);
23464 SDValue RHS = Op.getOperand(2);
23465 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
23466 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
23467 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
23468 }
23469
23470 case Intrinsic::x86_sse42_pcmpistria128:
23471 case Intrinsic::x86_sse42_pcmpestria128:
23472 case Intrinsic::x86_sse42_pcmpistric128:
23473 case Intrinsic::x86_sse42_pcmpestric128:
23474 case Intrinsic::x86_sse42_pcmpistrio128:
23475 case Intrinsic::x86_sse42_pcmpestrio128:
23476 case Intrinsic::x86_sse42_pcmpistris128:
23477 case Intrinsic::x86_sse42_pcmpestris128:
23478 case Intrinsic::x86_sse42_pcmpistriz128:
23479 case Intrinsic::x86_sse42_pcmpestriz128: {
23480 unsigned Opcode;
23481 X86::CondCode X86CC;
23482 switch (IntNo) {
23483 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
23484 case Intrinsic::x86_sse42_pcmpistria128:
23485 Opcode = X86ISD::PCMPISTR;
23486 X86CC = X86::COND_A;
23487 break;
23488 case Intrinsic::x86_sse42_pcmpestria128:
23489 Opcode = X86ISD::PCMPESTR;
23490 X86CC = X86::COND_A;
23491 break;
23492 case Intrinsic::x86_sse42_pcmpistric128:
23493 Opcode = X86ISD::PCMPISTR;
23494 X86CC = X86::COND_B;
23495 break;
23496 case Intrinsic::x86_sse42_pcmpestric128:
23497 Opcode = X86ISD::PCMPESTR;
23498 X86CC = X86::COND_B;
23499 break;
23500 case Intrinsic::x86_sse42_pcmpistrio128:
23501 Opcode = X86ISD::PCMPISTR;
23502 X86CC = X86::COND_O;
23503 break;
23504 case Intrinsic::x86_sse42_pcmpestrio128:
23505 Opcode = X86ISD::PCMPESTR;
23506 X86CC = X86::COND_O;
23507 break;
23508 case Intrinsic::x86_sse42_pcmpistris128:
23509 Opcode = X86ISD::PCMPISTR;
23510 X86CC = X86::COND_S;
23511 break;
23512 case Intrinsic::x86_sse42_pcmpestris128:
23513 Opcode = X86ISD::PCMPESTR;
23514 X86CC = X86::COND_S;
23515 break;
23516 case Intrinsic::x86_sse42_pcmpistriz128:
23517 Opcode = X86ISD::PCMPISTR;
23518 X86CC = X86::COND_E;
23519 break;
23520 case Intrinsic::x86_sse42_pcmpestriz128:
23521 Opcode = X86ISD::PCMPESTR;
23522 X86CC = X86::COND_E;
23523 break;
23524 }
23525 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
23526 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
23527 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
23528 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
23529 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
23530 }
23531
23532 case Intrinsic::x86_sse42_pcmpistri128:
23533 case Intrinsic::x86_sse42_pcmpestri128: {
23534 unsigned Opcode;
23535 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
23536 Opcode = X86ISD::PCMPISTR;
23537 else
23538 Opcode = X86ISD::PCMPESTR;
23539
23540 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
23541 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
23542 return DAG.getNode(Opcode, dl, VTs, NewOps);
23543 }
23544
23545 case Intrinsic::x86_sse42_pcmpistrm128:
23546 case Intrinsic::x86_sse42_pcmpestrm128: {
23547 unsigned Opcode;
23548 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
23549 Opcode = X86ISD::PCMPISTR;
23550 else
23551 Opcode = X86ISD::PCMPESTR;
23552
23553 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
23554 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
23555 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
23556 }
23557
23558 case Intrinsic::eh_sjlj_lsda: {
23559 MachineFunction &MF = DAG.getMachineFunction();
23560 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23561 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
23562 auto &Context = MF.getMMI().getContext();
23563 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
23564 Twine(MF.getFunctionNumber()));
23565 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
23566 DAG.getMCSymbol(S, PtrVT));
23567 }
23568
23569 case Intrinsic::x86_seh_lsda: {
23570 // Compute the symbol for the LSDA. We know it'll get emitted later.
23571 MachineFunction &MF = DAG.getMachineFunction();
23572 SDValue Op1 = Op.getOperand(1);
23573 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
23574 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
23575 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
23576
23577 // Generate a simple absolute symbol reference. This intrinsic is only
23578 // supported on 32-bit Windows, which isn't PIC.
23579 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
23580 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
23581 }
23582
23583 case Intrinsic::eh_recoverfp: {
23584 SDValue FnOp = Op.getOperand(1);
23585 SDValue IncomingFPOp = Op.getOperand(2);
23586 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
23587 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
23588 if (!Fn)
23589 report_fatal_error(
23590 "llvm.eh.recoverfp must take a function as the first argument");
23591 return recoverFramePointer(DAG, Fn, IncomingFPOp);
23592 }
23593
23594 case Intrinsic::localaddress: {
23595 // Returns one of the stack, base, or frame pointer registers, depending on
23596 // which is used to reference local variables.
23597 MachineFunction &MF = DAG.getMachineFunction();
23598 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
23599 unsigned Reg;
23600 if (RegInfo->hasBasePointer(MF))
23601 Reg = RegInfo->getBaseRegister();
23602 else { // Handles the SP or FP case.
23603 bool CantUseFP = RegInfo->needsStackRealignment(MF);
23604 if (CantUseFP)
23605 Reg = RegInfo->getPtrSizedStackRegister(MF);
23606 else
23607 Reg = RegInfo->getPtrSizedFrameRegister(MF);
23608 }
23609 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
23610 }
23611
23612 case Intrinsic::x86_avx512_vp2intersect_q_512:
23613 case Intrinsic::x86_avx512_vp2intersect_q_256:
23614 case Intrinsic::x86_avx512_vp2intersect_q_128:
23615 case Intrinsic::x86_avx512_vp2intersect_d_512:
23616 case Intrinsic::x86_avx512_vp2intersect_d_256:
23617 case Intrinsic::x86_avx512_vp2intersect_d_128: {
23618 MVT MaskVT = Op.getSimpleValueType();
23619
23620 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
23621 SDLoc DL(Op);
23622
23623 SDValue Operation =
23624 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
23625 Op->getOperand(1), Op->getOperand(2));
23626
23627 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
23628 MaskVT, Operation);
23629 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
23630 MaskVT, Operation);
23631 return DAG.getMergeValues({Result0, Result1}, DL);
23632 }
23633 case Intrinsic::x86_mmx_pslli_w:
23634 case Intrinsic::x86_mmx_pslli_d:
23635 case Intrinsic::x86_mmx_pslli_q:
23636 case Intrinsic::x86_mmx_psrli_w:
23637 case Intrinsic::x86_mmx_psrli_d:
23638 case Intrinsic::x86_mmx_psrli_q:
23639 case Intrinsic::x86_mmx_psrai_w:
23640 case Intrinsic::x86_mmx_psrai_d: {
23641 SDLoc DL(Op);
23642 SDValue ShAmt = Op.getOperand(2);
23643 // If the argument is a constant, convert it to a target constant.
23644 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
23645 // Clamp out-of-bounds shift amounts, since they would otherwise be masked
23646 // to 8 bits, which may bring them back into bounds.
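      // For example (illustrative values): a shift amount of 260 masked to
      // 8 bits becomes 4 and would silently shift in-bounds, whereas clamping
      // it to 255 keeps it out of range for every MMX element width.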
23647 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
23648 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
23649 Op.getOperand(0), Op.getOperand(1),
23650 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
23651 }
23652
23653 unsigned NewIntrinsic;
23654 switch (IntNo) {
23655 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
23656 case Intrinsic::x86_mmx_pslli_w:
23657 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
23658 break;
23659 case Intrinsic::x86_mmx_pslli_d:
23660 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
23661 break;
23662 case Intrinsic::x86_mmx_pslli_q:
23663 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
23664 break;
23665 case Intrinsic::x86_mmx_psrli_w:
23666 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
23667 break;
23668 case Intrinsic::x86_mmx_psrli_d:
23669 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
23670 break;
23671 case Intrinsic::x86_mmx_psrli_q:
23672 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
23673 break;
23674 case Intrinsic::x86_mmx_psrai_w:
23675 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
23676 break;
23677 case Intrinsic::x86_mmx_psrai_d:
23678 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
23679 break;
23680 }
23681
23682 // The vector shift intrinsics with scalar shift amounts use 32-bit values,
23683 // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
23684 // MMX register.
23685 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
23686 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
23687 DAG.getConstant(NewIntrinsic, DL, MVT::i32),
23688 Op.getOperand(1), ShAmt);
23689
23690 }
23691 }
23692}
23693
23694static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
23695 SDValue Src, SDValue Mask, SDValue Base,
23696 SDValue Index, SDValue ScaleOp, SDValue Chain,
23697 const X86Subtarget &Subtarget) {
23698 SDLoc dl(Op);
23699 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
23700 // Scale must be constant.
23701 if (!C)
23702 return SDValue();
23703 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23704 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
23705 TLI.getPointerTy(DAG.getDataLayout()));
23706 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
23707 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
23708 // If source is undef or we know it won't be used, use a zero vector
23709 // to break register dependency.
23710 // TODO: use undef instead and let BreakFalseDeps deal with it?
23711 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
23712 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
23713
23714 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
23715
23716 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
23717 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
23718 VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
23719 return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
23720}
23721
23722static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
23723 SDValue Src, SDValue Mask, SDValue Base,
23724 SDValue Index, SDValue ScaleOp, SDValue Chain,
23725 const X86Subtarget &Subtarget) {
23726 MVT VT = Op.getSimpleValueType();
23727 SDLoc dl(Op);
23728 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
23729 // Scale must be constant.
23730 if (!C)
23731 return SDValue();
23732 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23733 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
23734 TLI.getPointerTy(DAG.getDataLayout()));
23735 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
23736 VT.getVectorNumElements());
23737 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
23738
23739 // We support two versions of the gather intrinsics. One with scalar mask and
23740 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
23741 if (Mask.getValueType() != MaskVT)
23742 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
23743
23744 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
23745 // If source is undef or we know it won't be used, use a zero vector
23746 // to break register dependency.
23747 // TODO: use undef instead and let BreakFalseDeps deal with it?
23748 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
23749 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
23750
23751 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
23752
23753 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
23754 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
23755 VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
23756 return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
23757}
23758
23759static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
23760 SDValue Src, SDValue Mask, SDValue Base,
23761 SDValue Index, SDValue ScaleOp, SDValue Chain,
23762 const X86Subtarget &Subtarget) {
23763 SDLoc dl(Op);
23764 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
23765 // Scale must be constant.
23766 if (!C)
23767 return SDValue();
23768 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23769 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
23770 TLI.getPointerTy(DAG.getDataLayout()));
23771 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
23772 Src.getSimpleValueType().getVectorNumElements());
23773 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
23774
23775 // We support two versions of the scatter intrinsics. One with scalar mask and
23776 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
23777 if (Mask.getValueType() != MaskVT)
23778 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
23779
23780 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
23781
23782 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
23783 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
23784 SDValue Res = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
23785 VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
23786 return Res.getValue(1);
23787}
23788
23789static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
23790 SDValue Mask, SDValue Base, SDValue Index,
23791 SDValue ScaleOp, SDValue Chain,
23792 const X86Subtarget &Subtarget) {
23793 SDLoc dl(Op);
23794 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
23795 // Scale must be constant.
23796 if (!C)
23797 return SDValue();
23798 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23799 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
23800 TLI.getPointerTy(DAG.getDataLayout()));
23801 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
23802 SDValue Segment = DAG.getRegister(0, MVT::i32);
23803 MVT MaskVT =
23804 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
23805 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
23806 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
23807 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
23808 return SDValue(Res, 0);
23809}
23810
23811/// Handles the lowering of builtin intrinsics with chain that return their
23812/// value into registers EDX:EAX.
23813/// If operand SrcReg is a valid register identifier, then operand 2 of N is
23814/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
23815/// TargetOpcode.
23816/// Returns a Glue value which can be used to add an extra copy-from-reg if the
23817/// expanded intrinsic implicitly defines extra registers (i.e. not just
23818/// EDX:EAX).
23819static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
23820 SelectionDAG &DAG,
23821 unsigned TargetOpcode,
23822 unsigned SrcReg,
23823 const X86Subtarget &Subtarget,
23824 SmallVectorImpl<SDValue> &Results) {
23825 SDValue Chain = N->getOperand(0);
23826 SDValue Glue;
23827
23828 if (SrcReg) {
23829 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
23830 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
23831 Glue = Chain.getValue(1);
23832 }
23833
23834 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
23835 SDValue N1Ops[] = {Chain, Glue};
23836 SDNode *N1 = DAG.getMachineNode(
23837 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
23838 Chain = SDValue(N1, 0);
23839
23840 // Read the 64-bit result that the expanded intrinsic leaves in EDX:EAX.
23841 SDValue LO, HI;
23842 if (Subtarget.is64Bit()) {
23843 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
23844 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
23845 LO.getValue(2));
23846 } else {
23847 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
23848 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
23849 LO.getValue(2));
23850 }
23851 Chain = HI.getValue(1);
23852 Glue = HI.getValue(2);
23853
23854 if (Subtarget.is64Bit()) {
23855 // Merge the two 32-bit values into a 64-bit one.
23856 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
23857 DAG.getConstant(32, DL, MVT::i8));
23858 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
23859 Results.push_back(Chain);
23860 return Glue;
23861 }
23862
23863 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
23864 SDValue Ops[] = { LO, HI };
23865 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
23866 Results.push_back(Pair);
23867 Results.push_back(Chain);
23868 return Glue;
23869}
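// Illustrative example (hypothetical values, not part of the original source):
// if the expanded RDTSC leaves EDX = 0x00000001 and EAX = 0x23456789, the
// helper above returns the merged 64-bit value 0x0000000123456789, either as
// (HI << 32) | LO on 64-bit targets or as a BUILD_PAIR on 32-bit targets.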
23870
23871/// Handles the lowering of builtin intrinsics that read the time stamp counter
23872/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
23873/// READCYCLECOUNTER nodes.
23874static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
23875 SelectionDAG &DAG,
23876 const X86Subtarget &Subtarget,
23877 SmallVectorImpl<SDValue> &Results) {
23878 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
23879 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
23880 // and the EAX register is loaded with the low-order 32 bits.
23881 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
23882 /* NoRegister */0, Subtarget,
23883 Results);
23884 if (Opcode != X86::RDTSCP)
23885 return;
23886
23887 SDValue Chain = Results[1];
23888 // The RDTSCP instruction also loads the IA32_TSC_AUX MSR (address C000_0103H)
23889 // into the ECX register. Add 'ecx' explicitly to the chain.
23890 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
23891 Results[1] = ecx;
23892 Results.push_back(ecx.getValue(1));
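  // Results now holds { TSC value, TSC_AUX (from ECX), chain }.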
23893}
23894
23895static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
23896 SelectionDAG &DAG) {
23897 SmallVector<SDValue, 3> Results;
23898 SDLoc DL(Op);
23899 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
23900 Results);
23901 return DAG.getMergeValues(Results, DL);
23902}
23903
23904static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
23905 MachineFunction &MF = DAG.getMachineFunction();
23906 SDValue Chain = Op.getOperand(0);
23907 SDValue RegNode = Op.getOperand(2);
23908 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
23909 if (!EHInfo)
23910 report_fatal_error("EH registrations only live in functions using WinEH");
23911
23912 // Cast the operand to an alloca, and remember the frame index.
23913 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
23914 if (!FINode)
23915 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
23916 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
23917
23918 // Return the chain operand without making any DAG nodes.
23919 return Chain;
23920}
23921
23922static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
23923 MachineFunction &MF = DAG.getMachineFunction();
23924 SDValue Chain = Op.getOperand(0);
23925 SDValue EHGuard = Op.getOperand(2);
23926 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
23927 if (!EHInfo)
23928 report_fatal_error("EHGuard only live in functions using WinEH");
23929
23930 // Cast the operand to an alloca, and remember the frame index.
23931 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
23932 if (!FINode)
23933 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
23934 EHInfo->EHGuardFrameIndex = FINode->getIndex();
23935
23936 // Return the chain operand without making any DAG nodes.
23937 return Chain;
23938}
23939
23940/// Emit Truncating Store with signed or unsigned saturation.
23941static SDValue
23942EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
23943 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
23944 SelectionDAG &DAG) {
23945
23946 SDVTList VTs = DAG.getVTList(MVT::Other);
23947 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
23948 SDValue Ops[] = { Chain, Val, Ptr, Undef };
23949 return SignedSat ?
23950 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
23951 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
23952}
23953
23954/// Emit Masked Truncating Store with signed or unsigned saturation.
23955static SDValue
23956EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
23957 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
23958 MachineMemOperand *MMO, SelectionDAG &DAG) {
23959
23960 SDVTList VTs = DAG.getVTList(MVT::Other);
23961 SDValue Ops[] = { Chain, Val, Ptr, Mask };
23962 return SignedSat ?
23963 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
23964 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
23965}
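
The two store flavours built above differ from a plain truncating store in that the value is saturated to the target type's range rather than having its high bits dropped. A scalar sketch of the i32-to-i8 cases (illustrative only, not the DAG node semantics verbatim):

#include <algorithm>
#include <cstdint>

// Signed saturation: clamp to [-128, 127], then truncate.
inline int8_t truncStoreS8(int32_t V) {
  return static_cast<int8_t>(std::min(std::max(V, -128), 127));
}

// Unsigned saturation: clamp to [0, 255], then truncate.
inline uint8_t truncStoreU8(int32_t V) {
  return static_cast<uint8_t>(std::min(std::max(V, 0), 255));
}

The masked variant additionally stores only the lanes whose mask bit is set.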
23966
23967static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
23968 SelectionDAG &DAG) {
23969 unsigned IntNo = Op.getConstantOperandVal(1);
23970 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
23971 if (!IntrData) {
23972 switch (IntNo) {
23973 case llvm::Intrinsic::x86_seh_ehregnode:
23974 return MarkEHRegistrationNode(Op, DAG);
23975 case llvm::Intrinsic::x86_seh_ehguard:
23976 return MarkEHGuard(Op, DAG);
23977 case llvm::Intrinsic::x86_rdpkru: {
23978 SDLoc dl(Op);
23979 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
23980 // Create a RDPKRU node and pass 0 to the ECX parameter.
23981 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
23982 DAG.getConstant(0, dl, MVT::i32));
23983 }
23984 case llvm::Intrinsic::x86_wrpkru: {
23985 SDLoc dl(Op);
23986 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
23987 // to the EDX and ECX parameters.
23988 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
23989 Op.getOperand(0), Op.getOperand(2),
23990 DAG.getConstant(0, dl, MVT::i32),
23991 DAG.getConstant(0, dl, MVT::i32));
23992 }
23993 case llvm::Intrinsic::x86_flags_read_u32:
23994 case llvm::Intrinsic::x86_flags_read_u64:
23995 case llvm::Intrinsic::x86_flags_write_u32:
23996 case llvm::Intrinsic::x86_flags_write_u64: {
23997 // We need a frame pointer because this will get lowered to a PUSH/POP
23998 // sequence.
23999 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
24000 MFI.setHasCopyImplyingStackAdjustment(true);
24001 // Don't do anything here, we will expand these intrinsics out later
24002 // during FinalizeISel in EmitInstrWithCustomInserter.
24003 return SDValue();
24004 }
24005 case Intrinsic::x86_lwpins32:
24006 case Intrinsic::x86_lwpins64:
24007 case Intrinsic::x86_umwait:
24008 case Intrinsic::x86_tpause: {
24009 SDLoc dl(Op);
24010 SDValue Chain = Op->getOperand(0);
24011 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
24012 unsigned Opcode;
24013
24014 switch (IntNo) {
24015 default: llvm_unreachable("Impossible intrinsic");
24016 case Intrinsic::x86_umwait:
24017 Opcode = X86ISD::UMWAIT;
24018 break;
24019 case Intrinsic::x86_tpause:
24020 Opcode = X86ISD::TPAUSE;
24021 break;
24022 case Intrinsic::x86_lwpins32:
24023 case Intrinsic::x86_lwpins64:
24024 Opcode = X86ISD::LWPINS;
24025 break;
24026 }
24027
24028 SDValue Operation =
24029 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
24030 Op->getOperand(3), Op->getOperand(4));
24031 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
24032 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
24033 Operation.getValue(1));
24034 }
24035 case Intrinsic::x86_enqcmd:
24036 case Intrinsic::x86_enqcmds: {
24037 SDLoc dl(Op);
24038 SDValue Chain = Op.getOperand(0);
24039 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
24040 unsigned Opcode;
24041 switch (IntNo) {
24042 default: llvm_unreachable("Impossible intrinsic!");
24043 case Intrinsic::x86_enqcmd:
24044 Opcode = X86ISD::ENQCMD;
24045 break;
24046 case Intrinsic::x86_enqcmds:
24047 Opcode = X86ISD::ENQCMDS;
24048 break;
24049 }
24050 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
24051 Op.getOperand(3));
24052 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
24053 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
24054 Operation.getValue(1));
24055 }
24056 }
24057 return SDValue();
24058 }
24059
24060 SDLoc dl(Op);
24061 switch(IntrData->Type) {
24062 default: llvm_unreachable("Unknown Intrinsic Type");
24063 case RDSEED:
24064 case RDRAND: {
24065 // Emit the node with the right value type.
24066 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
24067 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
24068
24069 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
24070 // Otherwise return the value from Rand, which is always 0, cast to i32.
24071 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
24072 DAG.getConstant(1, dl, Op->getValueType(1)),
24073 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
24074 SDValue(Result.getNode(), 1)};
24075 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
24076
24077 // Return { result, isValid, chain }.
24078 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
24079 SDValue(Result.getNode(), 2));
24080 }
24081 case GATHER_AVX2: {
24082 SDValue Chain = Op.getOperand(0);
24083 SDValue Src = Op.getOperand(2);
24084 SDValue Base = Op.getOperand(3);
24085 SDValue Index = Op.getOperand(4);
24086 SDValue Mask = Op.getOperand(5);
24087 SDValue Scale = Op.getOperand(6);
24088 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
24089 Scale, Chain, Subtarget);
24090 }
24091 case GATHER: {
24092 //gather(v1, mask, index, base, scale);
24093 SDValue Chain = Op.getOperand(0);
24094 SDValue Src = Op.getOperand(2);
24095 SDValue Base = Op.getOperand(3);
24096 SDValue Index = Op.getOperand(4);
24097 SDValue Mask = Op.getOperand(5);
24098 SDValue Scale = Op.getOperand(6);
24099 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
24100 Chain, Subtarget);
24101 }
24102 case SCATTER: {
24103 //scatter(base, mask, index, v1, scale);
24104 SDValue Chain = Op.getOperand(0);
24105 SDValue Base = Op.getOperand(2);
24106 SDValue Mask = Op.getOperand(3);
24107 SDValue Index = Op.getOperand(4);
24108 SDValue Src = Op.getOperand(5);
24109 SDValue Scale = Op.getOperand(6);
24110 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
24111 Scale, Chain, Subtarget);
24112 }
24113 case PREFETCH: {
24114 const APInt &HintVal = Op.getConstantOperandAPInt(6);
24115 assert((HintVal == 2 || HintVal == 3) &&
24116 "Wrong prefetch hint in intrinsic: should be 2 or 3");
24117 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
24118 SDValue Chain = Op.getOperand(0);
24119 SDValue Mask = Op.getOperand(2);
24120 SDValue Index = Op.getOperand(3);
24121 SDValue Base = Op.getOperand(4);
24122 SDValue Scale = Op.getOperand(5);
24123 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
24124 Subtarget);
24125 }
24126 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
24127 case RDTSC: {
24128 SmallVector<SDValue, 2> Results;
24129 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
24130 Results);
24131 return DAG.getMergeValues(Results, dl);
24132 }
24133 // Read Performance Monitoring Counters.
24134 case RDPMC:
24135 // GetExtended Control Register.
24136 case XGETBV: {
24137 SmallVector<SDValue, 2> Results;
24138
24139 // RDPMC uses ECX to select the index of the performance counter to read.
24140 // XGETBV uses ECX to select the index of the XCR register to return.
24141 // The result is stored into registers EDX:EAX.
24142 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
24143 Subtarget, Results);
24144 return DAG.getMergeValues(Results, dl);
24145 }
24146 // XTEST intrinsics.
24147 case XTEST: {
24148 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
24149 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
24150
24151 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
24152 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
24153 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
24154 Ret, SDValue(InTrans.getNode(), 1));
24155 }
24156 case TRUNCATE_TO_MEM_VI8:
24157 case TRUNCATE_TO_MEM_VI16:
24158 case TRUNCATE_TO_MEM_VI32: {
24159 SDValue Mask = Op.getOperand(4);
24160 SDValue DataToTruncate = Op.getOperand(3);
24161 SDValue Addr = Op.getOperand(2);
24162 SDValue Chain = Op.getOperand(0);
24163
24164 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
24165 assert(MemIntr && "Expected MemIntrinsicSDNode!");
24166
24167 EVT MemVT = MemIntr->getMemoryVT();
24168
24169 uint16_t TruncationOp = IntrData->Opc0;
24170 switch (TruncationOp) {
24171 case X86ISD::VTRUNC: {
24172 if (isAllOnesConstant(Mask)) // return just a truncate store
24173 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
24174 MemIntr->getMemOperand());
24175
24176 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
24177 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
24178
24179 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
24180 MemIntr->getMemOperand(), true /* truncating */);
24181 }
24182 case X86ISD::VTRUNCUS:
24183 case X86ISD::VTRUNCS: {
24184 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
24185 if (isAllOnesConstant(Mask))
24186 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
24187 MemIntr->getMemOperand(), DAG);
24188
24189 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
24190 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
24191
24192 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
24193 VMask, MemVT, MemIntr->getMemOperand(), DAG);
24194 }
24195 default:
24196 llvm_unreachable("Unsupported truncstore intrinsic")::llvm::llvm_unreachable_internal("Unsupported truncstore intrinsic"
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24196)
;
24197 }
24198 }
24199 }
24200}
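
For the RDRAND/RDSEED case above, the separate "isValid" result corresponds to the carry flag that the *_step intrinsics expose at the source level. A minimal usage sketch, assuming a CPU and build with RDRAND support (e.g. compiled with -mrdrnd):

#include <immintrin.h>
#include <cstdio>

int main() {
  unsigned int R;
  // _rdrand32_step returns 1 when the hardware delivered a valid value
  // (CF = 1 after RDRAND), 0 otherwise -- the CMOV-produced flag above.
  if (_rdrand32_step(&R))
    std::printf("random: %u\n", R);
  else
    std::printf("RDRAND returned no valid value; retry\n");
  return 0;
}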
24201
24202SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
24203 SelectionDAG &DAG) const {
24204 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
24205 MFI.setReturnAddressIsTaken(true);
24206
24207 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
24208 return SDValue();
24209
24210 unsigned Depth = Op.getConstantOperandVal(0);
24211 SDLoc dl(Op);
24212 EVT PtrVT = getPointerTy(DAG.getDataLayout());
24213
24214 if (Depth > 0) {
24215 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
24216 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24217 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
24218 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
24219 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
24220 MachinePointerInfo());
24221 }
24222
24223 // Just load the return address.
24224 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
24225 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
24226 MachinePointerInfo());
24227}
24228
24229SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
24230 SelectionDAG &DAG) const {
24231 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
24232 return getReturnAddressFrameIndex(DAG);
24233}
24234
24235SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
24236 MachineFunction &MF = DAG.getMachineFunction();
24237 MachineFrameInfo &MFI = MF.getFrameInfo();
24238 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
24239 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24240 EVT VT = Op.getValueType();
24241
24242 MFI.setFrameAddressIsTaken(true);
24243
24244 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
24245 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
24246 // is not possible to crawl up the stack without looking at the unwind codes
24247 // simultaneously.
24248 int FrameAddrIndex = FuncInfo->getFAIndex();
24249 if (!FrameAddrIndex) {
24250 // Set up a frame object for the return address.
24251 unsigned SlotSize = RegInfo->getSlotSize();
24252 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
24253 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
24254 FuncInfo->setFAIndex(FrameAddrIndex);
24255 }
24256 return DAG.getFrameIndex(FrameAddrIndex, VT);
24257 }
24258
24259 unsigned FrameReg =
24260 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
24261 SDLoc dl(Op); // FIXME probably not meaningful
24262 unsigned Depth = Op.getConstantOperandVal(0);
24263 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
24264 (FrameReg == X86::EBP && VT == MVT::i32)) &&
24265 "Invalid Frame Register!");
24266 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
24267 while (Depth--)
24268 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
24269 MachinePointerInfo());
24270 return FrameAddr;
24271}
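
The Depth loop at the end of LowerFRAMEADDR is a plain frame-pointer chain walk: each saved frame pointer sits at offset 0 of the current frame, so walking one level is a single load. A sketch with the GCC/Clang builtin, valid only when every frame in the chain keeps a frame pointer:

// Mirror of "while (Depth--) FrameAddr = load(FrameAddr)".
static void *frameAddressAtDepth(unsigned Depth) {
  void **FP = static_cast<void **>(__builtin_frame_address(0));
  while (Depth--)
    FP = static_cast<void **>(*FP); // saved caller EBP/RBP lives at slot 0
  return FP;
}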
24272
24273// FIXME? Maybe this could be a TableGen attribute on some registers and
24274// this table could be generated automatically from RegInfo.
24275Register X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
24276 const MachineFunction &MF) const {
24277 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
24278
24279 Register Reg = StringSwitch<unsigned>(RegName)
24280 .Case("esp", X86::ESP)
24281 .Case("rsp", X86::RSP)
24282 .Case("ebp", X86::EBP)
24283 .Case("rbp", X86::RBP)
24284 .Default(0);
24285
24286 if (Reg == X86::EBP || Reg == X86::RBP) {
24287 if (!TFI.hasFP(MF))
24288 report_fatal_error("register " + StringRef(RegName) +
24289 " is allocatable: function has no frame pointer");
24290#ifndef NDEBUG
24291 else {
24292 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24293 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
24294 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
24295 "Invalid Frame Register!");
24296 }
24297#endif
24298 }
24299
24300 if (Reg)
24301 return Reg;
24302
24303 report_fatal_error("Invalid register name global variable");
24304}
24305
24306SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
24307 SelectionDAG &DAG) const {
24308 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24309 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
24310}
24311
24312unsigned X86TargetLowering::getExceptionPointerRegister(
24313 const Constant *PersonalityFn) const {
24314 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
24315 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
24316
24317 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
24318}
24319
24320unsigned X86TargetLowering::getExceptionSelectorRegister(
24321 const Constant *PersonalityFn) const {
24322 // Funclet personalities don't use selectors (the runtime does the selection).
24323 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
24324 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
24325}
24326
24327bool X86TargetLowering::needsFixedCatchObjects() const {
24328 return Subtarget.isTargetWin64();
24329}
24330
24331SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
24332 SDValue Chain = Op.getOperand(0);
24333 SDValue Offset = Op.getOperand(1);
24334 SDValue Handler = Op.getOperand(2);
24335 SDLoc dl (Op);
24336
24337 EVT PtrVT = getPointerTy(DAG.getDataLayout());
24338 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24339 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
24340 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
24341 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
24342 "Invalid Frame Register!");
24343 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
24344 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
24345
24346 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
24347 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
24348 dl));
24349 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
24350 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
24351 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
24352
24353 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
24354 DAG.getRegister(StoreAddrReg, PtrVT));
24355}
24356
24357SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
24358 SelectionDAG &DAG) const {
24359 SDLoc DL(Op);
24360 // If the subtarget is not 64bit, we may need the global base reg
24361 // after isel expand pseudo, i.e., after CGBR pass ran.
24362 // Therefore, ask for the GlobalBaseReg now, so that the pass
24363 // inserts the code for us in case we need it.
24364 // Otherwise, we will end up in a situation where we will
24365 // reference a virtual register that is not defined!
24366 if (!Subtarget.is64Bit()) {
24367 const X86InstrInfo *TII = Subtarget.getInstrInfo();
24368 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
24369 }
24370 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
24371 DAG.getVTList(MVT::i32, MVT::Other),
24372 Op.getOperand(0), Op.getOperand(1));
24373}
24374
24375SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
24376 SelectionDAG &DAG) const {
24377 SDLoc DL(Op);
24378 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
24379 Op.getOperand(0), Op.getOperand(1));
24380}
24381
24382SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
24383 SelectionDAG &DAG) const {
24384 SDLoc DL(Op);
24385 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
24386 Op.getOperand(0));
24387}
24388
24389static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
24390 return Op.getOperand(0);
24391}
24392
24393SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
24394 SelectionDAG &DAG) const {
24395 SDValue Root = Op.getOperand(0);
24396 SDValue Trmp = Op.getOperand(1); // trampoline
24397 SDValue FPtr = Op.getOperand(2); // nested function
24398 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
24399 SDLoc dl (Op);
24400
24401 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
24402 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
24403
24404 if (Subtarget.is64Bit()) {
24405 SDValue OutChains[6];
24406
24407 // Large code-model.
24408 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
24409 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
24410
24411 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
24412 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
24413
24414 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
24415
24416 // Load the pointer to the nested function into R11.
24417 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
24418 SDValue Addr = Trmp;
24419 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
24420 Addr, MachinePointerInfo(TrmpAddr));
24421
24422 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
24423 DAG.getConstant(2, dl, MVT::i64));
24424 OutChains[1] =
24425 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
24426 /* Alignment = */ 2);
24427
24428 // Load the 'nest' parameter value into R10.
24429 // R10 is specified in X86CallingConv.td
24430 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
24431 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
24432 DAG.getConstant(10, dl, MVT::i64));
24433 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
24434 Addr, MachinePointerInfo(TrmpAddr, 10));
24435
24436 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
24437 DAG.getConstant(12, dl, MVT::i64));
24438 OutChains[3] =
24439 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
24440 /* Alignment = */ 2);
24441
24442 // Jump to the nested function.
24443 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
24444 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
24445 DAG.getConstant(20, dl, MVT::i64));
24446 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
24447 Addr, MachinePointerInfo(TrmpAddr, 20));
24448
24449 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
24450 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
24451 DAG.getConstant(22, dl, MVT::i64));
24452 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
24453 Addr, MachinePointerInfo(TrmpAddr, 22));
24454
24455 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
24456 } else {
24457 const Function *Func =
24458 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
24459 CallingConv::ID CC = Func->getCallingConv();
24460 unsigned NestReg;
24461
24462 switch (CC) {
24463 default:
24464 llvm_unreachable("Unsupported calling convention")::llvm::llvm_unreachable_internal("Unsupported calling convention"
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24464)
;
24465 case CallingConv::C:
24466 case CallingConv::X86_StdCall: {
24467 // Pass 'nest' parameter in ECX.
24468 // Must be kept in sync with X86CallingConv.td
24469 NestReg = X86::ECX;
24470
24471 // Check that ECX wasn't needed by an 'inreg' parameter.
24472 FunctionType *FTy = Func->getFunctionType();
24473 const AttributeList &Attrs = Func->getAttributes();
24474
24475 if (!Attrs.isEmpty() && !Func->isVarArg()) {
24476 unsigned InRegCount = 0;
24477 unsigned Idx = 1;
24478
24479 for (FunctionType::param_iterator I = FTy->param_begin(),
24480 E = FTy->param_end(); I != E; ++I, ++Idx)
24481 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
24482 auto &DL = DAG.getDataLayout();
24483 // FIXME: should only count parameters that are lowered to integers.
24484 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
24485 }
24486
24487 if (InRegCount > 2) {
24488 report_fatal_error("Nest register in use - reduce number of inreg"
24489 " parameters!");
24490 }
24491 }
24492 break;
24493 }
24494 case CallingConv::X86_FastCall:
24495 case CallingConv::X86_ThisCall:
24496 case CallingConv::Fast:
24497 case CallingConv::Tail:
24498 // Pass 'nest' parameter in EAX.
24499 // Must be kept in sync with X86CallingConv.td
24500 NestReg = X86::EAX;
24501 break;
24502 }
24503
24504 SDValue OutChains[4];
24505 SDValue Addr, Disp;
24506
24507 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
24508 DAG.getConstant(10, dl, MVT::i32));
24509 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
24510
24511 // This is storing the opcode for MOV32ri.
24512 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
24513 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
24514 OutChains[0] =
24515 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
24516 Trmp, MachinePointerInfo(TrmpAddr));
24517
24518 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
24519 DAG.getConstant(1, dl, MVT::i32));
24520 OutChains[1] =
24521 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
24522 /* Alignment = */ 1);
24523
24524 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
24525 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
24526 DAG.getConstant(5, dl, MVT::i32));
24527 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
24528 Addr, MachinePointerInfo(TrmpAddr, 5),
24529 /* Alignment = */ 1);
24530
24531 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
24532 DAG.getConstant(6, dl, MVT::i32));
24533 OutChains[3] =
24534 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
24535 /* Alignment = */ 1);
24536
24537 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
24538 }
24539}
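
For reference, the six 64-bit stores above write a 23-byte trampoline that should decode to three instructions: movabsq of the nested function pointer into R11, movabsq of the nest value into R10, and an indirect jump through R11. A hedged sketch of the fixed opcode bytes, derived from the encodings used in the code; the concrete byte values are an interpretation, not taken verbatim from the source:

#include <cstdint>

// offset  0: 49 BB <imm64>  movabsq $FPtr, %r11
// offset 10: 49 BA <imm64>  movabsq $Nest, %r10
// offset 20: 49 FF E3       jmpq   *%r11
constexpr uint8_t kMovR11[] = {0x49, 0xBB};       // REX.WB + (MOV64ri | r11)
constexpr uint8_t kMovR10[] = {0x49, 0xBA};       // REX.WB + (MOV64ri | r10)
constexpr uint8_t kJmpR11[] = {0x49, 0xFF, 0xE3}; // REX.WB + JMP64r + ModRM(r11)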
24540
24541SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
24542 SelectionDAG &DAG) const {
24543 /*
24544 The rounding mode is in bits 11:10 of FPSR, and has the following
24545 settings:
24546 00 Round to nearest
24547 01 Round to -inf
24548 10 Round to +inf
24549 11 Round to 0
24550
24551 FLT_ROUNDS, on the other hand, expects the following:
24552 -1 Undefined
24553 0 Round to 0
24554 1 Round to nearest
24555 2 Round to +inf
24556 3 Round to -inf
24557
24558 To perform the conversion, we do:
24559 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
24560 */
24561
24562 MachineFunction &MF = DAG.getMachineFunction();
24563 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
24564 const Align StackAlignment(TFI.getStackAlignment());
24565 MVT VT = Op.getSimpleValueType();
24566 SDLoc DL(Op);
24567
24568 // Save FP Control Word to stack slot
24569 int SSFI =
24570 MF.getFrameInfo().CreateStackObject(2, StackAlignment.value(), false);
24571 SDValue StackSlot =
24572 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
24573
24574 MachineMemOperand *MMO =
24575 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
24576 MachineMemOperand::MOStore, 2, 2);
24577
24578 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
24579 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
24580 DAG.getVTList(MVT::Other),
24581 Ops, MVT::i16, MMO);
24582
24583 // Load FP Control Word from stack slot
24584 SDValue CWD =
24585 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
24586
24587 // Transform as necessary
24588 SDValue CWD1 =
24589 DAG.getNode(ISD::SRL, DL, MVT::i16,
24590 DAG.getNode(ISD::AND, DL, MVT::i16,
24591 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
24592 DAG.getConstant(11, DL, MVT::i8));
24593 SDValue CWD2 =
24594 DAG.getNode(ISD::SRL, DL, MVT::i16,
24595 DAG.getNode(ISD::AND, DL, MVT::i16,
24596 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
24597 DAG.getConstant(9, DL, MVT::i8));
24598
24599 SDValue RetVal =
24600 DAG.getNode(ISD::AND, DL, MVT::i16,
24601 DAG.getNode(ISD::ADD, DL, MVT::i16,
24602 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
24603 DAG.getConstant(1, DL, MVT::i16)),
24604 DAG.getConstant(3, DL, MVT::i16));
24605
24606 return DAG.getNode((VT.getSizeInBits() < 16 ?
24607 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
24608}
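
The bit-twiddling in the comment can be sanity-checked against all four x87 rounding-control encodings (control-word bits 11:10). A compile-time sketch:

#include <cstdint>

// Same expression as the comment above:
// (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
constexpr unsigned fltRoundsFromCW(unsigned CW) {
  return ((((CW & 0x800) >> 11) | ((CW & 0x400) >> 9)) + 1) & 3;
}

static_assert(fltRoundsFromCW(0x000) == 1, "RC=00 round to nearest -> 1");
static_assert(fltRoundsFromCW(0x400) == 3, "RC=01 round to -inf    -> 3");
static_assert(fltRoundsFromCW(0x800) == 2, "RC=10 round to +inf    -> 2");
static_assert(fltRoundsFromCW(0xC00) == 0, "RC=11 round to zero    -> 0");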
24609
24610 // Split a unary integer op into 2 half-sized ops.
24611static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
24612 MVT VT = Op.getSimpleValueType();
24613 unsigned NumElems = VT.getVectorNumElements();
24614 unsigned SizeInBits = VT.getSizeInBits();
24615 MVT EltVT = VT.getVectorElementType();
24616 SDValue Src = Op.getOperand(0);
24617 assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&
24618 "Src and Op should have the same element type!");
24619
24620 // Extract the Lo/Hi vectors
24621 SDLoc dl(Op);
24622 SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
24623 SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
24624
24625 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
24626 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
24627 DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
24628 DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
24629}
24630
24631// Decompose 256-bit ops into smaller 128-bit ops.
24632static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
24633 assert(Op.getSimpleValueType().is256BitVector() &&
24634 Op.getSimpleValueType().isInteger() &&
24635 "Only handle AVX 256-bit vector integer operation");
24636 return LowerVectorIntUnary(Op, DAG);
24637}
24638
24639// Decompose 512-bit ops into smaller 256-bit ops.
24640static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
24641 assert(Op.getSimpleValueType().is512BitVector() &&
24642 Op.getSimpleValueType().isInteger() &&
24643 "Only handle AVX 512-bit vector integer operation");
24644 return LowerVectorIntUnary(Op, DAG);
24645}
24646
24647/// Lower a vector CTLZ using native supported vector CTLZ instruction.
24648//
24649 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
24650 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
24651 // split the vector, perform the operation on its Lo and Hi parts and
24652 // concatenate the results.
24653static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
24654 const X86Subtarget &Subtarget) {
24655 assert(Op.getOpcode() == ISD::CTLZ);
24656 SDLoc dl(Op);
24657 MVT VT = Op.getSimpleValueType();
24658 MVT EltVT = VT.getVectorElementType();
24659 unsigned NumElems = VT.getVectorNumElements();
24660
24661 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
24662 "Unsupported element type");
24663
24664 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
24665 if (NumElems > 16 ||
24666 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
24667 return LowerVectorIntUnary(Op, DAG);
24668
24669 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
24670 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
24671 "Unsupported value type for operation");
24672
24673 // Use native supported vector instruction vplzcntd.
24674 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
24675 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
24676 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
24677 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
24678
24679 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
24680}
24681
24682// Lower CTLZ using a PSHUFB lookup table implementation.
24683static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
24684 const X86Subtarget &Subtarget,
24685 SelectionDAG &DAG) {
24686 MVT VT = Op.getSimpleValueType();
24687 int NumElts = VT.getVectorNumElements();
24688 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
24689 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
24690
24691 // Per-nibble leading zero PSHUFB lookup table.
24692 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
24693 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
24694 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
24695 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
24696
24697 SmallVector<SDValue, 64> LUTVec;
24698 for (int i = 0; i < NumBytes; ++i)
24699 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
24700 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
24701
24702 // Begin by bitcasting the input to byte vector, then split those bytes
24703 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
24704 // If the hi input nibble is zero then we add both results together, otherwise
24705 // we just take the hi result (by masking the lo result to zero before the
24706 // add).
24707 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
24708 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
24709
24710 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
24711 SDValue Lo = Op0;
24712 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
24713 SDValue HiZ;
24714 if (CurrVT.is512BitVector()) {
24715 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
24716 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
24717 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
24718 } else {
24719 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
24720 }
24721
24722 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
24723 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
24724 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
24725 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
24726
24727 // Merge result back from vXi8 back to VT, working on the lo/hi halves
24728 // of the current vector width in the same way we did for the nibbles.
24729 // If the upper half of the input element is zero then add the halves'
24730 // leading zero counts together, otherwise just use the upper half's.
24731 // Double the width of the result until we are at target width.
24732 while (CurrVT != VT) {
24733 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
24734 int CurrNumElts = CurrVT.getVectorNumElements();
24735 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
24736 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
24737 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
24738
24739 // Check if the upper half of the input element is zero.
24740 if (CurrVT.is512BitVector()) {
24741 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
24742 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
24743 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
24744 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
24745 } else {
24746 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
24747 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
24748 }
24749 HiZ = DAG.getBitcast(NextVT, HiZ);
24750
24751 // Move the upper/lower halves to the lower bits as we'll be extending to
24752 // NextVT. Mask the lower result to zero if HiZ is true and add the results
24753 // together.
24754 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
24755 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
24756 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
24757 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
24758 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
24759 CurrVT = NextVT;
24760 }
24761
24762 return Res;
24763}
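
The LUT-based scheme above has a straightforward scalar analogue: look up the leading-zero count of each nibble, only add the low nibble's count when the high nibble is zero, and repeat the same "use the high half unless it is zero" step at each doubling of the element width. A sketch with hypothetical helper names:

#include <cstdint>

// Per-nibble leading-zero counts -- the same table as the PSHUFB LUT.
constexpr uint8_t NibbleLZ[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                  0, 0, 0, 0, 0, 0, 0, 0};

// CTLZ of one byte: high nibble's count, plus the low nibble's count only
// when the high nibble is zero (the HiZ masking in the vector code).
constexpr unsigned ctlz8(uint8_t X) {
  return (X >> 4) ? NibbleLZ[X >> 4] : NibbleLZ[0] + NibbleLZ[X & 0xF];
}

// One widening step, i8 -> i16, matching the loop that doubles CurrVT.
constexpr unsigned ctlz16(uint16_t X) {
  return (X >> 8) ? ctlz8(uint8_t(X >> 8)) : 8 + ctlz8(uint8_t(X & 0xFF));
}

static_assert(ctlz16(0x0001) == 15, "only the lowest bit set");
static_assert(ctlz16(0x8000) == 0, "top bit set");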
24764
24765static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
24766 const X86Subtarget &Subtarget,
24767 SelectionDAG &DAG) {
24768 MVT VT = Op.getSimpleValueType();
24769
24770 if (Subtarget.hasCDI() &&
24771 // vXi8 vectors need to be promoted to 512-bits for vXi32.
24772 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
24773 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
24774
24775 // Decompose 256-bit ops into smaller 128-bit ops.
24776 if (VT.is256BitVector() && !Subtarget.hasInt256())
24777 return Lower256IntUnary(Op, DAG);
24778
24779 // Decompose 512-bit ops into smaller 256-bit ops.
24780 if (VT.is512BitVector() && !Subtarget.hasBWI())
24781 return Lower512IntUnary(Op, DAG);
24782
24783 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
24784 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
24785}
24786
24787static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
24788 SelectionDAG &DAG) {
24789 MVT VT = Op.getSimpleValueType();
24790 MVT OpVT = VT;
24791 unsigned NumBits = VT.getSizeInBits();
24792 SDLoc dl(Op);
24793 unsigned Opc = Op.getOpcode();
24794
24795 if (VT.isVector())
24796 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
24797
24798 Op = Op.getOperand(0);
24799 if (VT == MVT::i8) {
24800 // Zero extend to i32 since there is not an i8 bsr.
24801 OpVT = MVT::i32;
24802 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
24803 }
24804
24805 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
24806 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
24807 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
24808
24809 if (Opc == ISD::CTLZ) {
24810 // If src is zero (i.e. bsr sets ZF), returns NumBits.
24811 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
24812 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
24813 Op.getValue(1)};
24814 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
24815 }
24816
24817 // Finally xor with NumBits-1.
24818 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
24819 DAG.getConstant(NumBits - 1, dl, OpVT));
24820
24821 if (VT == MVT::i8)
24822 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
24823 return Op;
24824}
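
The final XOR works because BSR yields the index of the highest set bit and, for indices in [0, NumBits-1], subtracting from NumBits-1 is the same as XOR-ing with NumBits-1; the zero case is patched by the CMOV, which substitutes 2*NumBits-1 so the XOR produces NumBits. A small consistency check (the __builtin_clz call only models BSR here; it is not what the lowering emits):

#include <cassert>
#include <cstdint>
#include <initializer_list>

static unsigned ctlz32ViaBsr(uint32_t X) {
  assert(X != 0 && "zero is handled by the CMOV path in the lowering");
  unsigned BsrIdx = 31u - unsigned(__builtin_clz(X)); // what BSR would return
  return 31u ^ BsrIdx;                                // the final XOR above
}

int main() {
  for (uint32_t X : {1u, 2u, 0x80u, 0x12345u, 0x80000000u})
    assert(ctlz32ViaBsr(X) == unsigned(__builtin_clz(X)));
  return 0;
}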
24825
24826static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
24827 SelectionDAG &DAG) {
24828 MVT VT = Op.getSimpleValueType();
24829 unsigned NumBits = VT.getScalarSizeInBits();
24830 SDValue N0 = Op.getOperand(0);
24831 SDLoc dl(Op);
24832
24833 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
24834 "Only scalar CTTZ requires custom lowering");
24835
24836 // Issue a bsf (scan bits forward) which also sets EFLAGS.
24837 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
24838 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
24839
24840 // If src is zero (i.e. bsf sets ZF), returns NumBits.
24841 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
24842 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
24843 Op.getValue(1)};
24844 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
24845}
24846
24847/// Break a 256-bit integer operation into two new 128-bit ones and then
24848/// concatenate the result back.
24849static SDValue split256IntArith(SDValue Op, SelectionDAG &DAG) {
24850 MVT VT = Op.getSimpleValueType();
24851
24852 assert(VT.is256BitVector() && VT.isInteger() &&
24853 "Unsupported value type for operation");
24854
24855 unsigned NumElems = VT.getVectorNumElements();
24856 SDLoc dl(Op);
24857
24858 // Extract the LHS vectors
24859 SDValue LHS = Op.getOperand(0);
24860 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
24861 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
24862
24863 // Extract the RHS vectors
24864 SDValue RHS = Op.getOperand(1);
24865 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
24866 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
24867
24868 MVT EltVT = VT.getVectorElementType();
24869 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
24870
24871 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
24872 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
24873 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
24874}
24875
24876/// Break a 512-bit integer operation into two new 256-bit ones and then
24877/// concatenate the result back.
24878static SDValue split512IntArith(SDValue Op, SelectionDAG &DAG) {
24879 MVT VT = Op.getSimpleValueType();
24880
24881 assert(VT.is512BitVector() && VT.isInteger() &&
24882 "Unsupported value type for operation");
24883
24884 unsigned NumElems = VT.getVectorNumElements();
24885 SDLoc dl(Op);
24886
24887 // Extract the LHS vectors
24888 SDValue LHS = Op.getOperand(0);
24889 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
24890 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
24891
24892 // Extract the RHS vectors
24893 SDValue RHS = Op.getOperand(1);
24894 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
24895 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
24896
24897 MVT EltVT = VT.getVectorElementType();
24898 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
24899
24900 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
24901 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
24902 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
24903}
24904
24905static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
24906 const X86Subtarget &Subtarget) {
24907 MVT VT = Op.getSimpleValueType();
24908 if (VT == MVT::i16 || VT == MVT::i32)
24909 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
24910
24911 if (VT.getScalarType() == MVT::i1)
24912 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
24913 Op.getOperand(0), Op.getOperand(1));
24914
24915 assert(Op.getSimpleValueType().is256BitVector() &&
24916 Op.getSimpleValueType().isInteger() &&
24917 "Only handle AVX 256-bit vector integer operation");
24918 return split256IntArith(Op, DAG);
24919}
24920
24921static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
24922 const X86Subtarget &Subtarget) {
24923 MVT VT = Op.getSimpleValueType();
24924 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
24925 unsigned Opcode = Op.getOpcode();
24926 if (VT.getScalarType() == MVT::i1) {
24927 SDLoc dl(Op);
24928 switch (Opcode) {
24929 default: llvm_unreachable("Expected saturated arithmetic opcode");
24930 case ISD::UADDSAT:
24931 case ISD::SADDSAT:
24932 // *addsat i1 X, Y --> X | Y
24933 return DAG.getNode(ISD::OR, dl, VT, X, Y);
24934 case ISD::USUBSAT:
24935 case ISD::SSUBSAT:
24936 // *subsat i1 X, Y --> X & ~Y
24937 return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT));
24938 }
24939 }
24940
24941 if (VT.is128BitVector()) {
24942 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
24943 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24944 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
24945 *DAG.getContext(), VT);
24946 SDLoc DL(Op);
24947 if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) {
24948 // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
24949 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y);
24950 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT);
24951 return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add);
24952 }
24953 if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
24954 // usubsat X, Y --> (X >u Y) ? X - Y : 0
24955 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
24956 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
24957 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
24958 }
24959 // Use default expansion.
24960 return SDValue();
24961 }
24962
24963 assert(Op.getSimpleValueType().is256BitVector() &&
24964 Op.getSimpleValueType().isInteger() &&
24965 "Only handle AVX 256-bit vector integer operation");
24966 return split256IntArith(Op, DAG);
24967}
24968
24969static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
24970 SelectionDAG &DAG) {
24971 MVT VT = Op.getSimpleValueType();
24972 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
24973 // Since X86 does not have CMOV for 8-bit integer, we don't convert
24974 // 8-bit integer abs to NEG and CMOV.
24975 SDLoc DL(Op);
24976 SDValue N0 = Op.getOperand(0);
24977 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
24978 DAG.getConstant(0, DL, VT), N0);
24979 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
24980 SDValue(Neg.getNode(), 1)};
24981 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
24982 }
24983
24984 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
24985 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
24986 SDLoc DL(Op);
24987 SDValue Src = Op.getOperand(0);
24988 SDValue Sub =
24989 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
24990 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
24991 }
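// BLENDV keys only off the sign bit of the selector (X itself here), so lanes where
// X is negative take 0-X and the rest keep X, giving |X| per i64 lane without a
// 64-bit compare.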
24992
24993 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
24994 assert(VT.isInteger() &&
24995        "Only handle AVX 256-bit vector integer operation");
24996 return Lower256IntUnary(Op, DAG);
24997 }
24998
24999 // Default to expand.
25000 return SDValue();
25001}
25002
25003static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
25004 MVT VT = Op.getSimpleValueType();
25005
25006 // For AVX1 cases, split to use legal ops (everything but v4i64).
25007 if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
25008 return split256IntArith(Op, DAG);
25009
25010 SDLoc DL(Op);
25011 unsigned Opcode = Op.getOpcode();
25012 SDValue N0 = Op.getOperand(0);
25013 SDValue N1 = Op.getOperand(1);
25014
25015 // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
25016 // using the SMIN/SMAX instructions and flipping the signbit back.
25017 if (VT == MVT::v8i16) {
25018 assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
25019        "Unexpected MIN/MAX opcode");
25020 SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
25021 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
25022 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
25023 Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
25024 SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
25025 return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
25026 }
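// Worked example (u16): umin(0x0001, 0xFFFF) -> after the sign flip the operands are
// 0x8001 and 0x7FFF; SMIN picks 0x8001, and flipping the sign bit back gives 0x0001,
// the unsigned minimum.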
25027
25028 // Else, expand to a compare/select.
25029 ISD::CondCode CC;
25030 switch (Opcode) {
25031 case ISD::SMIN: CC = ISD::CondCode::SETLT; break;
25032 case ISD::SMAX: CC = ISD::CondCode::SETGT; break;
25033 case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
25034 case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
25035 default: llvm_unreachable("Unknown MINMAX opcode");
25036 }
25037
25038 SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
25039 return DAG.getSelect(DL, VT, Cond, N0, N1);
25040}
25041
25042static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
25043 SelectionDAG &DAG) {
25044 SDLoc dl(Op);
25045 MVT VT = Op.getSimpleValueType();
25046
25047 if (VT.getScalarType() == MVT::i1)
25048 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
25049
25050 // Decompose 256-bit ops into 128-bit ops.
25051 if (VT.is256BitVector() && !Subtarget.hasInt256())
25052 return split256IntArith(Op, DAG);
25053
25054 SDValue A = Op.getOperand(0);
25055 SDValue B = Op.getOperand(1);
25056
25057 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
25058 // vector pairs, multiply and truncate.
25059 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
25060 unsigned NumElts = VT.getVectorNumElements();
25061
25062 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
25063 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
25064 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
25065 return DAG.getNode(
25066 ISD::TRUNCATE, dl, VT,
25067 DAG.getNode(ISD::MUL, dl, ExVT,
25068 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
25069 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
25070 }
25071
25072 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
25073
25074 // Extract the lo/hi parts to any extend to i16.
25075 // We're going to mask off the low byte of each result element of the
25076 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
25077 // element.
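// The widened halves are multiplied with a 16-bit MUL below; keeping only the low
// byte of each product and re-packing with PACKUS is exactly the truncating i8
// multiply.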
25078 SDValue Undef = DAG.getUNDEF(VT);
25079 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
25080 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
25081
25082 SDValue BLo, BHi;
25083 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
25084 // If the RHS is a constant, manually unpackl/unpackh.
25085 SmallVector<SDValue, 16> LoOps, HiOps;
25086 for (unsigned i = 0; i != NumElts; i += 16) {
25087 for (unsigned j = 0; j != 8; ++j) {
25088 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
25089 MVT::i16));
25090 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
25091 MVT::i16));
25092 }
25093 }
25094
25095 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
25096 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
25097 } else {
25098 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
25099 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
25100 }
25101
25102 // Multiply, mask the lower 8bits of the lo/hi results and pack.
25103 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
25104 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
25105 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
25106 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
25107 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
25108 }
25109
25110 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
25111 if (VT == MVT::v4i32) {
25112 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
25113        "Should not custom lower when pmulld is available!");
25114
25115 // Extract the odd parts.
25116 static const int UnpackMask[] = { 1, -1, 3, -1 };
25117 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
25118 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
25119
25120 // Multiply the even parts.
25121 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
25122 DAG.getBitcast(MVT::v2i64, A),
25123 DAG.getBitcast(MVT::v2i64, B));
25124 // Now multiply odd parts.
25125 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
25126 DAG.getBitcast(MVT::v2i64, Aodds),
25127 DAG.getBitcast(MVT::v2i64, Bodds));
25128
25129 Evens = DAG.getBitcast(VT, Evens);
25130 Odds = DAG.getBitcast(VT, Odds);
25131
25132 // Merge the two vectors back together with a shuffle. This expands into 2
25133 // shuffles.
25134 static const int ShufMask[] = { 0, 4, 2, 6 };
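// With A = <a0|a1|a2|a3> and B = <b0|b1|b2|b3>, Evens (as v4i32) holds
// <lo(a0b0)|hi(a0b0)|lo(a2b2)|hi(a2b2)> and Odds holds
// <lo(a1b1)|hi(a1b1)|lo(a3b3)|hi(a3b3)>, so picking lanes {0,4,2,6} yields
// <a0b0|a1b1|a2b2|a3b3> (the low 32 bits of each product), i.e. the v4i32 multiply.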
25135 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
25136 }
25137
25138 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
25139        "Only know how to lower V2I64/V4I64/V8I64 multiply");
25140 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
25141
25142 // Ahi = psrlqi(a, 32);
25143 // Bhi = psrlqi(b, 32);
25144 //
25145 // AloBlo = pmuludq(a, b);
25146 // AloBhi = pmuludq(a, Bhi);
25147 // AhiBlo = pmuludq(Ahi, b);
25148 //
25149 // Hi = psllqi(AloBhi + AhiBlo, 32);
25150 // return AloBlo + Hi;
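// Derivation: with A = 2^32*Ahi + Alo and B = 2^32*Bhi + Blo,
// A*B mod 2^64 = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo), since the 2^64*Ahi*Bhi term
// vanishes modulo 2^64; each remaining 32x32->64 partial product is one PMULUDQ.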
25151 KnownBits AKnown = DAG.computeKnownBits(A);
25152 KnownBits BKnown = DAG.computeKnownBits(B);
25153
25154 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
25155 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
25156 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
25157
25158 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
25159 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
25160 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
25161
25162 SDValue Zero = DAG.getConstant(0, dl, VT);
25163
25164 // Only multiply lo/hi halves that aren't known to be zero.
25165 SDValue AloBlo = Zero;
25166 if (!ALoIsZero && !BLoIsZero)
25167 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
25168
25169 SDValue AloBhi = Zero;
25170 if (!ALoIsZero && !BHiIsZero) {
25171 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
25172 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
25173 }
25174
25175 SDValue AhiBlo = Zero;
25176 if (!AHiIsZero && !BLoIsZero) {
25177 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
25178 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
25179 }
25180
25181 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
25182 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
25183
25184 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
25185}
25186
25187static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
25188 SelectionDAG &DAG) {
25189 SDLoc dl(Op);
25190 MVT VT = Op.getSimpleValueType();
25191 bool IsSigned = Op->getOpcode() == ISD::MULHS;
25192 unsigned NumElts = VT.getVectorNumElements();
25193 SDValue A = Op.getOperand(0);
25194 SDValue B = Op.getOperand(1);
25195
25196 // Decompose 256-bit ops into 128-bit ops.
25197 if (VT.is256BitVector() && !Subtarget.hasInt256())
25198 return split256IntArith(Op, DAG);
25199
25200 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
25201 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
25202        (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
25203        (VT == MVT::v16i32 && Subtarget.hasAVX512()));
25204
25205 // PMULxD operations multiply each even value (starting at 0) of LHS with
25206 // the corresponding value of RHS and produce a widened result.
25207 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
25208 // => <2 x i64> <ae|cg>
25209 //
25210 // In other words, to have all the results, we need to perform two PMULxD:
25211 // 1. one with the even values.
25212 // 2. one with the odd values.
25213 // To achieve #2, we need to place the odd values at an even position.
25214 //
25215 // Place the odd value at an even position (basically, shift all values 1
25216 // step to the left):
25217 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
25218 9, -1, 11, -1, 13, -1, 15, -1};
25219 // <a|b|c|d> => <b|undef|d|undef>
25220 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
25221 makeArrayRef(&Mask[0], NumElts));
25222 // <e|f|g|h> => <f|undef|h|undef>
25223 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
25224 makeArrayRef(&Mask[0], NumElts));
25225
25226 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
25227 // ints.
25228 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
25229 unsigned Opcode =
25230 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
25231 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
25232 // => <2 x i64> <ae|cg>
25233 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
25234 DAG.getBitcast(MulVT, A),
25235 DAG.getBitcast(MulVT, B)));
25236 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
25237 // => <2 x i64> <bf|dh>
25238 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
25239 DAG.getBitcast(MulVT, Odd0),
25240 DAG.getBitcast(MulVT, Odd1)));
25241
25242 // Shuffle it back into the right order.
25243 SmallVector<int, 16> ShufMask(NumElts);
25244 for (int i = 0; i != (int)NumElts; ++i)
25245 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
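// For NumElts == 4 this mask is {1, 5, 3, 7}: Mul1 viewed as v4i32 is
// <lo(ae)|hi(ae)|lo(cg)|hi(cg)> and Mul2 is <lo(bf)|hi(bf)|lo(dh)|hi(dh)>, so the
// shuffle collects the high halves <hi(ae)|hi(bf)|hi(cg)|hi(dh)> in element order.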
25246
25247 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
25248
25249 // If we have a signed multiply but no PMULDQ fix up the result of an
25250 // unsigned multiply.
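// This is the standard identity mulhs(a,b) = mulhu(a,b) - (a<0 ? b : 0) - (b<0 ? a : 0);
// the SETGT results are all-ones masks for negative operands, so the ANDs select
// b and a respectively before the subtraction.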
25251 if (IsSigned && !Subtarget.hasSSE41()) {
25252 SDValue Zero = DAG.getConstant(0, dl, VT);
25253 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
25254 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
25255 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
25256 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
25257
25258 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
25259 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
25260 }
25261
25262 return Res;
25263 }
25264
25265 // Only i8 vectors should need custom lowering after this.
25266 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
25267         (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
25268        "Unsupported vector type");
25269
25270 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
25271 // logical shift down the upper half and pack back to i8.
25272
25273 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
25274 // and then ashr/lshr the upper bits down to the lower bits before multiply.
25275 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
25276
25277 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
25278 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
25279 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
25280 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
25281 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
25282 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
25283 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
25284 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
25285 }
25286
25287 // For signed 512-bit vectors, split into 256-bit vectors to allow the
25288 // sign-extension to occur.
25289 if (VT == MVT::v64i8 && IsSigned)
25290 return split512IntArith(Op, DAG);
25291
25292 // Signed AVX2 implementation - extend xmm subvectors to ymm.
25293 if (VT == MVT::v32i8 && IsSigned) {
25294 MVT ExVT = MVT::v16i16;
25295 SDValue ALo = extract128BitVector(A, 0, DAG, dl);
25296 SDValue BLo = extract128BitVector(B, 0, DAG, dl);
25297 SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl);
25298 SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl);
25299 ALo = DAG.getNode(ExAVX, dl, ExVT, ALo);
25300 BLo = DAG.getNode(ExAVX, dl, ExVT, BLo);
25301 AHi = DAG.getNode(ExAVX, dl, ExVT, AHi);
25302 BHi = DAG.getNode(ExAVX, dl, ExVT, BHi);
25303 SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
25304 SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
25305 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
25306 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);
25307
25308 // Bitcast back to VT and then pack all the even elements from Lo and Hi.
25309 // Shuffle lowering should turn this into PACKUS+PERMQ
25310 Lo = DAG.getBitcast(VT, Lo);
25311 Hi = DAG.getBitcast(VT, Hi);
25312 return DAG.getVectorShuffle(VT, dl, Lo, Hi,
25313 { 0, 2, 4, 6, 8, 10, 12, 14,
25314 16, 18, 20, 22, 24, 26, 28, 30,
25315 32, 34, 36, 38, 40, 42, 44, 46,
25316 48, 50, 52, 54, 56, 58, 60, 62});
25317 }
25318
25319 // For signed v16i8 and all unsigned vXi8 we will unpack the low and high
25320 // half of each 128 bit lane to widen to a vXi16 type. Do the multiplies,
25321 // shift the results and pack the half lane results back together.
25322
25323 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
25324
25325 static const int PSHUFDMask[] = { 8, 9, 10, 11, 12, 13, 14, 15,
25326 -1, -1, -1, -1, -1, -1, -1, -1};
25327
25328 // Extract the lo parts and zero/sign extend to i16.
25329 // Only use SSE4.1 instructions for signed v16i8 where using unpack requires
25330 // shifts to sign extend. Using unpack for unsigned only requires an xor to
25331 // create zeros and a copy due to tied register constraints pre-AVX. But using
25332 // zero_extend_vector_inreg would require an additional pshufd for the high
25333 // part.
25334
25335 SDValue ALo, AHi;
25336 if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
25337 ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A);
25338
25339 AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask);
25340 AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi);
25341 } else if (IsSigned) {
25342 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A));
25343 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A));
25344
25345 ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG);
25346 AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG);
25347 } else {
25348 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A,
25349 DAG.getConstant(0, dl, VT)));
25350 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A,
25351 DAG.getConstant(0, dl, VT)));
25352 }
25353
25354 SDValue BLo, BHi;
25355 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
25356 // If the RHS is a constant, manually unpackl/unpackh and extend.
25357 SmallVector<SDValue, 16> LoOps, HiOps;
25358 for (unsigned i = 0; i != NumElts; i += 16) {
25359 for (unsigned j = 0; j != 8; ++j) {
25360 SDValue LoOp = B.getOperand(i + j);
25361 SDValue HiOp = B.getOperand(i + j + 8);
25362
25363 if (IsSigned) {
25364 LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16);
25365 HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16);
25366 } else {
25367 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
25368 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
25369 }
25370
25371 LoOps.push_back(LoOp);
25372 HiOps.push_back(HiOp);
25373 }
25374 }
25375
25376 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
25377 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
25378 } else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
25379 BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B);
25380
25381 BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask);
25382 BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi);
25383 } else if (IsSigned) {
25384 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B));
25385 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B));
25386
25387 BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG);
25388 BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG);
25389 } else {
25390 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B,
25391 DAG.getConstant(0, dl, VT)));
25392 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B,
25393 DAG.getConstant(0, dl, VT)));
25394 }
25395
25396 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
25397 // pack back to vXi8.
25398 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
25399 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
25400 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
25401 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
25402
25403 // Bitcast back to VT and then pack all the even elements from Lo and Hi.
25404 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
25405}
25406
25407SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
25408 assert(Subtarget.isTargetWin64() && "Unexpected target");
25409 EVT VT = Op.getValueType();
25410 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
25411        "Unexpected return type for lowering");
25412
25413 RTLIB::Libcall LC;
25414 bool isSigned;
25415 switch (Op->getOpcode()) {
25416 default: llvm_unreachable("Unexpected request for libcall!");
25417 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
25418 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
25419 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
25420 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
25421 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
25422 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
25423 }
25424
25425 SDLoc dl(Op);
25426 SDValue InChain = DAG.getEntryNode();
25427
25428 TargetLowering::ArgListTy Args;
25429 TargetLowering::ArgListEntry Entry;
25430 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
25431 EVT ArgVT = Op->getOperand(i).getValueType();
25432 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
25433        "Unexpected argument type for lowering");
25434 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
25435 Entry.Node = StackPtr;
25436 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
25437 MachinePointerInfo(), /* Alignment = */ 16);
25438 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25439 Entry.Ty = PointerType::get(ArgTy,0);
25440 Entry.IsSExt = false;
25441 Entry.IsZExt = false;
25442 Args.push_back(Entry);
25443 }
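// Each i128 operand was spilled above to a 16-byte-aligned stack slot and is passed
// to the libcall by pointer; the call itself returns the i128 result as v2i64, which
// is bitcast back to the original type below.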
25444
25445 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
25446 getPointerTy(DAG.getDataLayout()));
25447
25448 TargetLowering::CallLoweringInfo CLI(DAG);
25449 CLI.setDebugLoc(dl)
25450 .setChain(InChain)
25451 .setLibCallee(
25452 getLibcallCallingConv(LC),
25453 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
25454 std::move(Args))
25455 .setInRegister()
25456 .setSExtResult(isSigned)
25457 .setZExtResult(!isSigned);
25458
25459 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
25460 return DAG.getBitcast(VT, CallInfo.first);
25461}
25462
25463// Return true if the required (according to Opcode) shift-imm form is natively
25464// supported by the Subtarget
25465static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
25466 unsigned Opcode) {
25467 if (VT.getScalarSizeInBits() < 16)
25468 return false;
25469
25470 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
25471 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
25472 return true;
25473
25474 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
25475 (VT.is256BitVector() && Subtarget.hasInt256());
25476
25477 bool AShift = LShift && (Subtarget.hasAVX512() ||
25478 (VT != MVT::v2i64 && VT != MVT::v4i64));
25479 return (Opcode == ISD::SRA) ? AShift : LShift;
25480}
25481
25482// The shift amount is a variable, but it is the same for all vector lanes.
25483// These instructions are defined together with shift-immediate.
25484static
25485bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
25486 unsigned Opcode) {
25487 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
25488}
25489
25490// Return true if the required (according to Opcode) variable-shift form is
25491// natively supported by the Subtarget
25492static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
25493 unsigned Opcode) {
25494
25495 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
25496 return false;
25497
25498 // vXi16 supported only on AVX-512, BWI
25499 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
25500 return false;
25501
25502 if (Subtarget.hasAVX512())
25503 return true;
25504
25505 bool LShift = VT.is128BitVector() || VT.is256BitVector();
25506 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
25507 return (Opcode == ISD::SRA) ? AShift : LShift;
25508}
25509
25510static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
25511 const X86Subtarget &Subtarget) {
25512 MVT VT = Op.getSimpleValueType();
25513 SDLoc dl(Op);
25514 SDValue R = Op.getOperand(0);
25515 SDValue Amt = Op.getOperand(1);
25516 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
25517
25518 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
25519 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
25520 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
25521 SDValue Ex = DAG.getBitcast(ExVT, R);
25522
25523 // ashr(R, 63) === cmp_slt(R, 0)
25524 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
25525 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
25526        "Unsupported PCMPGT op");
25527 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
25528 }
25529
25530 if (ShiftAmt >= 32) {
25531 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
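// E.g. for ShiftAmt == 40 the low i32 half of each lane becomes hi >>s 8 and the
// high i32 half becomes hi >>s 31 (the splatted sign), which together equal the
// original i64 value arithmetically shifted right by 40.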
25532 SDValue Upper =
25533 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
25534 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
25535 ShiftAmt - 32, DAG);
25536 if (VT == MVT::v2i64)
25537 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
25538 if (VT == MVT::v4i64)
25539 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
25540 {9, 1, 11, 3, 13, 5, 15, 7});
25541 } else {
25542 // SRA upper i32, SRL whole i64 and select lower i32.
25543 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
25544 ShiftAmt, DAG);
25545 SDValue Lower =
25546 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
25547 Lower = DAG.getBitcast(ExVT, Lower);
25548 if (VT == MVT::v2i64)
25549 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
25550 if (VT == MVT::v4i64)
25551 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
25552 {8, 1, 10, 3, 12, 5, 14, 7});
25553 }
25554 return DAG.getBitcast(VT, Ex);
25555 };
25556
25557 // Optimize shl/srl/sra with constant shift amount.
25558 APInt APIntShiftAmt;
25559 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
25560 return SDValue();
25561
25562 // If the shift amount is out of range, return undef.
25563 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
25564 return DAG.getUNDEF(VT);
25565
25566 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
25567
25568 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
25569 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
25570
25571 // i64 SRA needs to be performed as partial shifts.
25572 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
25573 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
25574 Op.getOpcode() == ISD::SRA)
25575 return ArithmeticShiftRight64(ShiftAmt);
25576
25577 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
25578 VT == MVT::v64i8) {
25579 unsigned NumElts = VT.getVectorNumElements();
25580 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
25581
25582 // Simple i8 add case
25583 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
25584 return DAG.getNode(ISD::ADD, dl, VT, R, R);
25585
25586 // ashr(R, 7) === cmp_slt(R, 0)
25587 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
25588 SDValue Zeros = DAG.getConstant(0, dl, VT);
25589 if (VT.is512BitVector()) {
25590 assert(VT == MVT::v64i8 && "Unexpected element type!");
25591 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
25592 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
25593 }
25594 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
25595 }
25596
25597 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
25598 if (VT == MVT::v16i8 && Subtarget.hasXOP())
25599 return SDValue();
25600
25601 if (Op.getOpcode() == ISD::SHL) {
25602 // Make a large shift.
25603 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
25604 ShiftAmt, DAG);
25605 SHL = DAG.getBitcast(VT, SHL);
25606 // Zero out the rightmost bits.
25607 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
25608 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
25609 }
25610 if (Op.getOpcode() == ISD::SRL) {
25611 // Make a large shift.
25612 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
25613 ShiftAmt, DAG);
25614 SRL = DAG.getBitcast(VT, SRL);
25615 // Zero out the leftmost bits.
25616 return DAG.getNode(ISD::AND, dl, VT, SRL,
25617 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
25618 }
25619 if (Op.getOpcode() == ISD::SRA) {
25620 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
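// Mask is the logically shifted sign bit, so xor+sub sign-extends the shifted value.
// e.g. ShiftAmt == 1, R == 0x80 (-128): lshr gives 0x40, Mask == 0x40, xor gives
// 0x00 and the sub gives 0xC0 == -64, the arithmetic-shift result.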
25621 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
25622
25623 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
25624 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
25625 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
25626 return Res;
25627 }
25628 llvm_unreachable("Unknown shift opcode.")::llvm::llvm_unreachable_internal("Unknown shift opcode.", "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25628)
;
25629 }
25630
25631 return SDValue();
25632}
25633
25634static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
25635 const X86Subtarget &Subtarget) {
25636 MVT VT = Op.getSimpleValueType();
25637 SDLoc dl(Op);
25638 SDValue R = Op.getOperand(0);
25639 SDValue Amt = Op.getOperand(1);
25640 unsigned Opcode = Op.getOpcode();
25641 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
25642 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
25643
25644 if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
25645 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
25646 MVT EltVT = VT.getVectorElementType();
25647 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
25648 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
25649 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
25650 else if (EltVT.bitsLT(MVT::i32))
25651 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
25652
25653 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
25654 }
25655
25656 // vXi8 shifts - shift as v8i16 + mask result.
25657 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
25658 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
25659 VT == MVT::v64i8) &&
25660 !Subtarget.hasXOP()) {
25661 unsigned NumElts = VT.getVectorNumElements();
25662 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
25663 if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
25664 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
25665 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
25666 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
25667
25668 // Create the mask using vXi16 shifts. For shift-rights we need to move
25669 // the upper byte down before splatting the vXi8 mask.
25670 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
25671 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
25672 BaseShAmt, Subtarget, DAG);
25673 if (Opcode != ISD::SHL)
25674 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
25675 8, DAG);
25676 BitMask = DAG.getBitcast(VT, BitMask);
25677 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
25678 SmallVector<int, 64>(NumElts, 0));
25679
25680 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
25681 DAG.getBitcast(ExtVT, R), BaseShAmt,
25682 Subtarget, DAG);
25683 Res = DAG.getBitcast(VT, Res);
25684 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
25685
25686 if (Opcode == ISD::SRA) {
25687 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
25688 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
25689 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
25690 SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
25691 BaseShAmt, Subtarget, DAG);
25692 SignMask = DAG.getBitcast(VT, SignMask);
25693 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
25694 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
25695 }
25696 return Res;
25697 }
25698 }
25699 }
25700
25701 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
25702 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
25703 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
25704 Amt = Amt.getOperand(0);
25705 unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
25706 std::vector<SDValue> Vals(Ratio);
25707 for (unsigned i = 0; i != Ratio; ++i)
25708 Vals[i] = Amt.getOperand(i);
25709 for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
25710 for (unsigned j = 0; j != Ratio; ++j)
25711 if (Vals[j] != Amt.getOperand(i + j))
25712 return SDValue();
25713 }
25714
25715 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
25716 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
25717 }
25718 return SDValue();
25719}
25720
25721// Convert a shift/rotate left amount to a multiplication scale factor.
25722static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
25723 const X86Subtarget &Subtarget,
25724 SelectionDAG &DAG) {
25725 MVT VT = Amt.getSimpleValueType();
25726 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
25727 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
25728 (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
25729 return SDValue();
25730
25731 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
25732 SmallVector<SDValue, 8> Elts;
25733 MVT SVT = VT.getVectorElementType();
25734 unsigned SVTBits = SVT.getSizeInBits();
25735 APInt One(SVTBits, 1);
25736 unsigned NumElems = VT.getVectorNumElements();
25737
25738 for (unsigned i = 0; i != NumElems; ++i) {
25739 SDValue Op = Amt->getOperand(i);
25740 if (Op->isUndef()) {
25741 Elts.push_back(Op);
25742 continue;
25743 }
25744
25745 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
25746 APInt C(SVTBits, ND->getZExtValue());
25747 uint64_t ShAmt = C.getZExtValue();
25748 if (ShAmt >= SVTBits) {
25749 Elts.push_back(DAG.getUNDEF(SVT));
25750 continue;
25751 }
25752 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
25753 }
25754 return DAG.getBuildVector(VT, dl, Elts);
25755 }
25756
25757 // If the target doesn't support variable shifts, use either FP conversion
25758 // or integer multiplication to avoid shifting each element individually.
25759 if (VT == MVT::v4i32) {
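// 0x3f800000 is 1.0f (biased exponent 127); adding Amt << 23 bumps the exponent
// field by Amt, so for in-range shift amounts the bitcast float equals 2^Amt and
// FP_TO_SINT recovers the integer scale factor.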
25760 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
25761 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
25762 DAG.getConstant(0x3f800000U, dl, VT));
25763 Amt = DAG.getBitcast(MVT::v4f32, Amt);
25764 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
25765 }
25766
25767 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
25768 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
25769 SDValue Z = DAG.getConstant(0, dl, VT);
25770 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
25771 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
25772 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
25773 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
25774 if (Subtarget.hasSSE41())
25775 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
25776
25777 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
25778 DAG.getBitcast(VT, Hi),
25779 {0, 2, 4, 6, 8, 10, 12, 14});
25780 }
25781
25782 return SDValue();
25783}
25784
25785static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
25786 SelectionDAG &DAG) {
25787 MVT VT = Op.getSimpleValueType();
25788 SDLoc dl(Op);
25789 SDValue R = Op.getOperand(0);
25790 SDValue Amt = Op.getOperand(1);
25791 unsigned EltSizeInBits = VT.getScalarSizeInBits();
25792 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
25793
25794 unsigned Opc = Op.getOpcode();
25795 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
25796 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
25797
25798 assert(VT.isVector() && "Custom lowering only for vector shifts!");
25799 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
25800
25801 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
25802 return V;
25803
25804 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
25805 return V;
25806
25807 if (SupportedVectorVarShift(VT, Subtarget, Opc))
25808 return Op;
25809
25810 // XOP has 128-bit variable logical/arithmetic shifts.
25811 // +ve/-ve Amt = shift left/right.
25812 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
25813 VT == MVT::v8i16 || VT == MVT::v16i8)) {
25814 if (Opc == ISD::SRL || Opc == ISD::SRA) {
25815 SDValue Zero = DAG.getConstant(0, dl, VT);
25816 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
25817 }
25818 if (Opc == ISD::SHL || Opc == ISD::SRL)
25819 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
25820 if (Opc == ISD::SRA)
25821 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
25822 }
25823
25824 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
25825 // shifts per-lane and then shuffle the partial results back together.
25826 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
25827 // Splat the shift amounts so the scalar shifts above will catch it.
25828 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
25829 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
25830 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
25831 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
25832 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
25833 }
25834
25835 // i64 vector arithmetic shift can be emulated with the transform:
25836 // M = lshr(SIGN_MASK, Amt)
25837 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
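// M has only the logically shifted sign bit set; xoring the logically shifted value
// with M and then subtracting M replicates the sign through the vacated upper bits,
// turning the logical shift into an arithmetic one.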
25838 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
25839 Opc == ISD::SRA) {
25840 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
25841 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
25842 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
25843 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
25844 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
25845 return R;
25846 }
25847
25848 // If possible, lower this shift as a sequence of two shifts by
25849 // constant plus a BLENDing shuffle instead of scalarizing it.
25850 // Example:
25851 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
25852 //
25853 // Could be rewritten as:
25854 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
25855 //
25856 // The advantage is that the two shifts from the example would be
25857 // lowered as X86ISD::VSRLI nodes in parallel before blending.
25858 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
25859 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
25860 SDValue Amt1, Amt2;
25861 unsigned NumElts = VT.getVectorNumElements();
25862 SmallVector<int, 8> ShuffleMask;
25863 for (unsigned i = 0; i != NumElts; ++i) {
25864 SDValue A = Amt->getOperand(i);
25865 if (A.isUndef()) {
25866 ShuffleMask.push_back(SM_SentinelUndef);
25867 continue;
25868 }
25869 if (!Amt1 || Amt1 == A) {
25870 ShuffleMask.push_back(i);
25871 Amt1 = A;
25872 continue;
25873 }
25874 if (!Amt2 || Amt2 == A) {
25875 ShuffleMask.push_back(i + NumElts);
25876 Amt2 = A;
25877 continue;
25878 }
25879 break;
25880 }
25881
25882 // Only perform this blend if we can perform it without loading a mask.
25883 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
25884 (VT != MVT::v16i16 ||
25885 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
25886 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
25887 canWidenShuffleElements(ShuffleMask))) {
25888 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
25889 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
25890 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
25891 Cst2->getAPIntValue().ult(EltSizeInBits)) {
25892 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
25893 Cst1->getZExtValue(), DAG);
25894 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
25895 Cst2->getZExtValue(), DAG);
25896 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
25897 }
25898 }
25899 }
25900
25901 // If possible, lower this packed shift into a vector multiply instead of
25902 // expanding it into a sequence of scalar shifts.
25903 if (Opc == ISD::SHL)
25904 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
25905 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
25906
25907 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
25908 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
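// mulhu(X, 2^(16-Amt)) == (X * 2^(16-Amt)) >> 16 == X >> Amt for 0 < Amt < 16;
// Amt == 0 would need a scale of 2^16, which does not fit in i16, hence the select
// below that keeps R unchanged for zero shift amounts.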
25909 if (Opc == ISD::SRL && ConstantAmt &&
25910 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
25911 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
25912 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
25913 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
25914 SDValue Zero = DAG.getConstant(0, dl, VT);
25915 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
25916 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
25917 return DAG.getSelect(dl, VT, ZAmt, R, Res);
25918 }
25919 }
25920
25921 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
25922 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
25923 // TODO: Special case handling for shift by 0/1, really we can afford either
25924 // of these cases in pre-SSE41/XOP/AVX512 but not both.
25925 if (Opc == ISD::SRA && ConstantAmt &&
25926 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
25927 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
25928 !Subtarget.hasAVX512()) ||
25929 DAG.isKnownNeverZero(Amt))) {
25930 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
25931 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
25932 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
25933 SDValue Amt0 =
25934 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
25935 SDValue Amt1 =
25936 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
25937 SDValue Sra1 =
25938 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
25939 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
25940 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
25941 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
25942 }
25943 }
25944
25945 // v4i32 Non Uniform Shifts.
25946 // If the shift amount is constant we can shift each lane using the SSE2
25947 // immediate shifts, else we need to zero-extend each lane to the lower i64
25948 // and shift using the SSE2 variable shifts.
25949 // The separate results can then be blended together.
25950 if (VT == MVT::v4i32) {
25951 SDValue Amt0, Amt1, Amt2, Amt3;
25952 if (ConstantAmt) {
25953 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
25954 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
25955 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
25956 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
25957 } else {
25958 // The SSE2 shifts use the lower i64 as the same shift amount for
25959 // all lanes and the upper i64 is ignored. On AVX we're better off
25960 // just zero-extending, but for SSE just duplicating the top 16-bits is
25961 // cheaper and has the same effect for out of range values.
25962 if (Subtarget.hasAVX()) {
25963 SDValue Z = DAG.getConstant(0, dl, VT);
25964 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
25965 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
25966 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
25967 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
25968 } else {
25969 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
25970 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
25971 {4, 5, 6, 7, -1, -1, -1, -1});
25972 Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
25973 {0, 1, 1, 1, -1, -1, -1, -1});
25974 Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
25975 {2, 3, 3, 3, -1, -1, -1, -1});
25976 Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
25977 {0, 1, 1, 1, -1, -1, -1, -1});
25978 Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
25979 {2, 3, 3, 3, -1, -1, -1, -1});
25980 }
25981 }
25982
25983 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
25984 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
25985 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
25986 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
25987 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
25988
25989 // Merge the shifted lane results optimally with/without PBLENDW.
25990 // TODO - ideally shuffle combining would handle this.
25991 if (Subtarget.hasSSE41()) {
25992 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
25993 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
25994 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
25995 }
25996 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
25997 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
25998 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
25999 }
26000
26001 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
26002 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
26003 // make the existing SSE solution better.
26004 // NOTE: We honor preferred vector width before promoting to 512 bits.
26005 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
26006 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
26007 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
26008 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
26009 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
26010 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
26011        "Unexpected vector type");
26012 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
26013 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
26014 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
26015 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
26016 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
26017 return DAG.getNode(ISD::TRUNCATE, dl, VT,
26018 DAG.getNode(Opc, dl, ExtVT, R, Amt));
26019 }
26020
26021 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
26022 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
26023 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
26024 (VT == MVT::v16i8 || VT == MVT::v64i8 ||
26025 (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
26026 !Subtarget.hasXOP()) {
26027 int NumElts = VT.getVectorNumElements();
26028 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
26029
26030 // Extend constant shift amount to vXi16 (it doesn't matter if the type
26031 // isn't legal).
26032 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
26033 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
26034 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
26035 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
26036 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
26037        "Constant build vector expected");
26038
26039 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
26040 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
26041 : DAG.getZExtOrTrunc(R, dl, ExVT);
26042 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
26043 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
26044 return DAG.getZExtOrTrunc(R, dl, VT);
26045 }
26046
26047 SmallVector<SDValue, 16> LoAmt, HiAmt;
26048 for (int i = 0; i != NumElts; i += 16) {
26049 for (int j = 0; j != 8; ++j) {
26050 LoAmt.push_back(Amt.getOperand(i + j));
26051 HiAmt.push_back(Amt.getOperand(i + j + 8));
26052 }
26053 }
26054
26055 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
26056 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
26057 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
26058
26059 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
26060 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
26061 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
26062 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
26063 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
26064 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
26065 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
26066 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
26067 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
26068 }
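As a sanity check of the multiply trick used in the block above, the scalar equivalent is: to shift an i8 right by a constant c, widen it to i16 (sign- or zero-extended, depending on SRA vs. SRL), multiply by 2^(8-c), and take the high byte. A minimal standalone sketch, illustrative only and not part of X86ISelLowering.cpp:

#include <cassert>
#include <cstdint>

// x >> c computed as the high byte of the 16-bit product x16 * 2^(8 - c),
// where x16 is the zero- or sign-extension of x, mirroring the SRL/SRA paths.
static uint8_t srlViaMul(uint8_t x, unsigned c) {
  uint16_t Scale = static_cast<uint16_t>(1u << (8 - c)); // c in [1, 7]
  return static_cast<uint8_t>((static_cast<uint16_t>(x) * Scale) >> 8);
}

static uint8_t sraViaMul(uint8_t x, unsigned c) {
  uint16_t Scale = static_cast<uint16_t>(1u << (8 - c));
  uint16_t X16 = static_cast<uint16_t>(
      static_cast<int16_t>(static_cast<int8_t>(x))); // sign-extend to i16
  return static_cast<uint8_t>((X16 * Scale) >> 8);
}

int main() {
  // Reference uses the usual arithmetic behaviour of >> on negative values.
  for (unsigned v = 0; v != 256; ++v)
    for (unsigned c = 1; c != 8; ++c) {
      assert(srlViaMul(uint8_t(v), c) == uint8_t(v >> c));
      assert(sraViaMul(uint8_t(v), c) == uint8_t(int8_t(v) >> c));
    }
  return 0;
}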
26069
26070 if (VT == MVT::v16i8 ||
26071 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
26072 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
26073 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
26074
26075 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
26076 if (VT.is512BitVector()) {
26077 // On AVX512BW targets we make use of the fact that VSELECT lowers
26078 // to a masked blend which selects bytes based just on the sign bit
26079 // extracted to a mask.
26080 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
26081 V0 = DAG.getBitcast(VT, V0);
26082 V1 = DAG.getBitcast(VT, V1);
26083 Sel = DAG.getBitcast(VT, Sel);
26084 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
26085 ISD::SETGT);
26086 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
26087 } else if (Subtarget.hasSSE41()) {
26088 // On SSE41 targets we make use of the fact that VSELECT lowers
26089 // to PBLENDVB which selects bytes based just on the sign bit.
26090 V0 = DAG.getBitcast(VT, V0);
26091 V1 = DAG.getBitcast(VT, V1);
26092 Sel = DAG.getBitcast(VT, Sel);
26093 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
26094 }
26095 // On pre-SSE41 targets we test for the sign bit by comparing to
26096 // zero - a negative value will set all bits of the lanes to true
26097 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
26098 SDValue Z = DAG.getConstant(0, dl, SelVT);
26099 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
26100 return DAG.getSelect(dl, SelVT, C, V0, V1);
26101 };
26102
26103 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
26104 // We can safely do this using i16 shifts as we're only interested in
26105 // the 3 lower bits of each byte.
26106 Amt = DAG.getBitcast(ExtVT, Amt);
26107 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
26108 Amt = DAG.getBitcast(VT, Amt);
26109
26110 if (Opc == ISD::SHL || Opc == ISD::SRL) {
26111 // r = VSELECT(r, shift(r, 4), a);
26112 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
26113 R = SignBitSelect(VT, Amt, M, R);
26114
26115 // a += a
26116 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
26117
26118 // r = VSELECT(r, shift(r, 2), a);
26119 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
26120 R = SignBitSelect(VT, Amt, M, R);
26121
26122 // a += a
26123 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
26124
26125 // return VSELECT(r, shift(r, 1), a);
26126 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
26127 R = SignBitSelect(VT, Amt, M, R);
26128 return R;
26129 }
26130
26131 if (Opc == ISD::SRA) {
26132 // For SRA we need to unpack each byte to the higher byte of a i16 vector
26133 // so we can correctly sign extend. We don't care what happens to the
26134 // lower byte.
26135 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
26136 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
26137 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
26138 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
26139 ALo = DAG.getBitcast(ExtVT, ALo);
26140 AHi = DAG.getBitcast(ExtVT, AHi);
26141 RLo = DAG.getBitcast(ExtVT, RLo);
26142 RHi = DAG.getBitcast(ExtVT, RHi);
26143
26144 // r = VSELECT(r, shift(r, 4), a);
26145 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
26146 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
26147 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
26148 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
26149
26150 // a += a
26151 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
26152 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
26153
26154 // r = VSELECT(r, shift(r, 2), a);
26155 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
26156 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
26157 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
26158 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
26159
26160 // a += a
26161 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
26162 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
26163
26164 // r = VSELECT(r, shift(r, 1), a);
26165 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
26166 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
26167 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
26168 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
26169
26170 // Logical shift the result back to the lower byte, leaving a zero upper
26171 // byte meaning that we can safely pack with PACKUSWB.
26172 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
26173 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
26174 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
26175 }
26176 }
26177
26178 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
26179 MVT ExtVT = MVT::v8i32;
26180 SDValue Z = DAG.getConstant(0, dl, VT);
26181 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
26182 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
26183 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
26184 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
26185 ALo = DAG.getBitcast(ExtVT, ALo);
26186 AHi = DAG.getBitcast(ExtVT, AHi);
26187 RLo = DAG.getBitcast(ExtVT, RLo);
26188 RHi = DAG.getBitcast(ExtVT, RHi);
26189 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
26190 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
26191 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
26192 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
26193 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
26194 }
26195
26196 if (VT == MVT::v8i16) {
26197 // If we have a constant shift amount, the non-SSE41 path is best as
26198 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
26199 bool UseSSE41 = Subtarget.hasSSE41() &&
26200 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
26201
26202 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
26203 // On SSE41 targets we make use of the fact that VSELECT lowers
26204 // to PBLENDVB which selects bytes based just on the sign bit.
26205 if (UseSSE41) {
26206 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
26207 V0 = DAG.getBitcast(ExtVT, V0);
26208 V1 = DAG.getBitcast(ExtVT, V1);
26209 Sel = DAG.getBitcast(ExtVT, Sel);
26210 return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
26211 }
26212 // On pre-SSE41 targets we splat the sign bit - a negative value will
26213 // set all bits of the lanes to true and VSELECT uses that in
26214 // its OR(AND(V0,C),AND(V1,~C)) lowering.
26215 SDValue C =
26216 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
26217 return DAG.getSelect(dl, VT, C, V0, V1);
26218 };
26219
26220 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
26221 if (UseSSE41) {
26222 // On SSE41 targets we need to replicate the shift mask in both
26223 // bytes for PBLENDVB.
26224 Amt = DAG.getNode(
26225 ISD::OR, dl, VT,
26226 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
26227 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
26228 } else {
26229 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
26230 }
26231
26232 // r = VSELECT(r, shift(r, 8), a);
26233 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
26234 R = SignBitSelect(Amt, M, R);
26235
26236 // a += a
26237 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
26238
26239 // r = VSELECT(r, shift(r, 4), a);
26240 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
26241 R = SignBitSelect(Amt, M, R);
26242
26243 // a += a
26244 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
26245
26246 // r = VSELECT(r, shift(r, 2), a);
26247 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
26248 R = SignBitSelect(Amt, M, R);
26249
26250 // a += a
26251 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
26252
26253 // return VSELECT(r, shift(r, 1), a);
26254 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
26255 R = SignBitSelect(Amt, M, R);
26256 return R;
26257 }
26258
26259 // Decompose 256-bit shifts into 128-bit shifts.
26260 if (VT.is256BitVector())
26261 return split256IntArith(Op, DAG);
26262
26263 return SDValue();
26264}
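For reference, the vXi8/vXi16 paths above reduce a variable shift to a cascade of conditional shifts by successively smaller powers of two, selected by the bits of the amount after it has been moved into the sign-bit position (the initial a << 5 for bytes). A scalar model of the byte SHL case, a sketch only and not part of X86ISelLowering.cpp:

#include <cassert>
#include <cstdint>

// Each stage tests the current top bit of the pre-shifted amount and
// conditionally shifts by 4, then 2, then 1, doubling the amount in between.
static uint8_t shlByCascade(uint8_t r, uint8_t amt) {
  uint8_t a = static_cast<uint8_t>(amt << 5);     // move the 3 low bits up
  if (a & 0x80) r = static_cast<uint8_t>(r << 4); // amount bit 2
  a = static_cast<uint8_t>(a << 1);               // a += a
  if (a & 0x80) r = static_cast<uint8_t>(r << 2); // amount bit 1
  a = static_cast<uint8_t>(a << 1);               // a += a
  if (a & 0x80) r = static_cast<uint8_t>(r << 1); // amount bit 0
  return r;
}

int main() {
  for (unsigned v = 0; v != 256; ++v)
    for (unsigned s = 0; s != 8; ++s)
      assert(shlByCascade(uint8_t(v), uint8_t(s)) == uint8_t(v << s));
  return 0;
}

In the vector code the per-element "if" is the VSELECT/PBLENDVB keyed off the sign bit, which is why the amount is staged into that bit first.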
26265
26266static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
26267 SelectionDAG &DAG) {
26268 MVT VT = Op.getSimpleValueType();
26269 assert(VT.isVector() && "Custom lowering only for vector rotates!");
26270
26271 SDLoc DL(Op);
26272 SDValue R = Op.getOperand(0);
26273 SDValue Amt = Op.getOperand(1);
26274 unsigned Opcode = Op.getOpcode();
26275 unsigned EltSizeInBits = VT.getScalarSizeInBits();
26276 int NumElts = VT.getVectorNumElements();
26277
26278 // Check for constant splat rotation amount.
26279 APInt UndefElts;
26280 SmallVector<APInt, 32> EltBits;
26281 int CstSplatIndex = -1;
26282 if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits))
26283 for (int i = 0; i != NumElts; ++i)
26284 if (!UndefElts[i]) {
26285 if (CstSplatIndex < 0 || EltBits[i] == EltBits[CstSplatIndex]) {
26286 CstSplatIndex = i;
26287 continue;
26288 }
26289 CstSplatIndex = -1;
26290 break;
26291 }
26292
26293 // AVX512 implicitly uses modulo rotation amounts.
26294 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
26295 // Attempt to rotate by immediate.
26296 if (0 <= CstSplatIndex) {
26297 unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
26298 uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
26299 return DAG.getNode(Op, DL, VT, R,
26300 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
26301 }
26302
26303 // Else, fall-back on VPROLV/VPRORV.
26304 return Op;
26305 }
26306
26307 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
26308
26309 // XOP has 128-bit vector variable + immediate rotates.
26310 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
26311 // XOP implicitly uses modulo rotation amounts.
26312 if (Subtarget.hasXOP()) {
26313 if (VT.is256BitVector())
26314 return split256IntArith(Op, DAG);
26315 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
26316
26317 // Attempt to rotate by immediate.
26318 if (0 <= CstSplatIndex) {
26319 uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
26320 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
26321 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
26322 }
26323
26324 // Use general rotate by variable (per-element).
26325 return Op;
26326 }
26327
26328 // Split 256-bit integers on pre-AVX2 targets.
26329 if (VT.is256BitVector() && !Subtarget.hasAVX2())
26330 return split256IntArith(Op, DAG);
26331
26332 assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
26333         ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
26334          Subtarget.hasAVX2())) &&
26335        "Only vXi32/vXi16/vXi8 vector rotates supported");
26336
26337 // Rotate by a uniform constant - expand back to shifts.
26338 if (0 <= CstSplatIndex)
26339 return SDValue();
26340
26341 bool IsSplatAmt = DAG.isSplatValue(Amt);
26342
26343 // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
26344 // the amount bit.
26345 if (EltSizeInBits == 8 && !IsSplatAmt) {
26346 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
26347 return SDValue();
26348
26349 // We don't need ModuloAmt here as we just peek at individual bits.
26350 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
26351
26352 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
26353 if (Subtarget.hasSSE41()) {
26354 // On SSE41 targets we make use of the fact that VSELECT lowers
26355 // to PBLENDVB which selects bytes based just on the sign bit.
26356 V0 = DAG.getBitcast(VT, V0);
26357 V1 = DAG.getBitcast(VT, V1);
26358 Sel = DAG.getBitcast(VT, Sel);
26359 return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1));
26360 }
26361 // On pre-SSE41 targets we test for the sign bit by comparing to
26362 // zero - a negative value will set all bits of the lanes to true
26363 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
26364 SDValue Z = DAG.getConstant(0, DL, SelVT);
26365 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
26366 return DAG.getSelect(DL, SelVT, C, V0, V1);
26367 };
26368
26369 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
26370 // We can safely do this using i16 shifts as we're only interested in
26371 // the 3 lower bits of each byte.
26372 Amt = DAG.getBitcast(ExtVT, Amt);
26373 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
26374 Amt = DAG.getBitcast(VT, Amt);
26375
26376 // r = VSELECT(r, rot(r, 4), a);
26377 SDValue M;
26378 M = DAG.getNode(
26379 ISD::OR, DL, VT,
26380 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
26381 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
26382 R = SignBitSelect(VT, Amt, M, R);
26383
26384 // a += a
26385 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
26386
26387 // r = VSELECT(r, rot(r, 2), a);
26388 M = DAG.getNode(
26389 ISD::OR, DL, VT,
26390 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
26391 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
26392 R = SignBitSelect(VT, Amt, M, R);
26393
26394 // a += a
26395 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
26396
26397 // return VSELECT(r, rot(r, 1), a);
26398 M = DAG.getNode(
26399 ISD::OR, DL, VT,
26400 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
26401 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
26402 return SignBitSelect(VT, Amt, M, R);
26403 }
26404
26405 // ISD::ROT* uses modulo rotate amounts.
26406 Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
26407 DAG.getConstant(EltSizeInBits - 1, DL, VT));
26408
26409 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
26410 bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
26411 SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
26412
26413 // Fallback for splats + all supported variable shifts.
26414 // Also fall back for non-constant AVX2 vXi16 amounts.
26415 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
26416 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
26417 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
26418 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
26419 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
26420 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
26421 }
26422
26423 // As with shifts, convert the rotation amount to a multiplication factor.
26424 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
26425 assert(Scale && "Failed to convert ROTL amount to scale");
26426
26427 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
26428 if (EltSizeInBits == 16) {
26429 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
26430 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
26431 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
26432 }
26433
26434 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
26435 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
26436 // that can then be OR'd with the lower 32-bits.
26437 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
26438 static const int OddMask[] = {1, -1, 3, -1};
26439 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
26440 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
26441
26442 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
26443 DAG.getBitcast(MVT::v2i64, R),
26444 DAG.getBitcast(MVT::v2i64, Scale));
26445 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
26446 DAG.getBitcast(MVT::v2i64, R13),
26447 DAG.getBitcast(MVT::v2i64, Scale13));
26448 Res02 = DAG.getBitcast(VT, Res02);
26449 Res13 = DAG.getBitcast(VT, Res13);
26450
26451 return DAG.getNode(ISD::OR, DL, VT,
26452 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
26453 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
26454}
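The two rotate fallbacks above can be checked with small scalar models: a rotate with a modulo-reduced amount expressed as SHL/SRL plus OR, and the i16 rotate expressed as a single widening multiply whose low and high halves are OR'd together (the MUL/MULHU pair). A sketch only, not part of X86ISelLowering.cpp:

#include <cassert>
#include <cstdint>

// (a) rotl as shifts, with the amount reduced modulo the element width.
static uint16_t rotl16Shifts(uint16_t x, unsigned a) {
  a &= 15;
  if (a == 0) return x;
  return static_cast<uint16_t>((x << a) | (x >> (16 - a)));
}

// (b) rotl as one widening multiply by Scale = 1 << a: the low 16 bits of the
// product are the shifted-left part, the high 16 bits are the wrapped bits.
static uint16_t rotl16Mul(uint16_t x, unsigned a) {
  a &= 15;
  uint32_t Prod = static_cast<uint32_t>(x) << a; // x * Scale
  return static_cast<uint16_t>(Prod) | static_cast<uint16_t>(Prod >> 16);
}

int main() {
  for (unsigned v = 0; v != 65536; ++v)
    for (unsigned a = 0; a != 32; ++a)
      assert(rotl16Mul(uint16_t(v), a) == rotl16Shifts(uint16_t(v), a));
  return 0;
}

The v4i32 path is the same idea, except PMULUDQ produces the full 64-bit product directly, so the wrapped bits already sit in the upper 32 bits of each lane.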
26455
26456/// Returns true if the operand type is exactly twice the native width, and
26457/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
26458/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
26459/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
26460bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
26461 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
26462
26463 if (OpWidth == 64)
26464 return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
26465 if (OpWidth == 128)
26466 return Subtarget.hasCmpxchg16b();
26467
26468 return false;
26469}
26470
26471// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
26472// TODO: In 32-bit mode, use FISTP when X87 is available?
26473bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
26474 Type *MemType = SI->getValueOperand()->getType();
26475
26476 bool NoImplicitFloatOps =
26477 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
26478 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
26479 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2())
26480 return false;
26481
26482 return needsCmpXchgNb(MemType);
26483}
26484
26485// Note: this turns large loads into lock cmpxchg8b/16b.
26486// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
26487TargetLowering::AtomicExpansionKind
26488X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
26489 Type *MemType = LI->getType();
26490
26491 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
26492 // can use movq to do the load. If we have X87 we can load into an 80-bit
26493 // X87 register and store it to a stack temporary.
26494 bool NoImplicitFloatOps =
26495 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
26496 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
26497 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
26498 (Subtarget.hasSSE2() || Subtarget.hasX87()))
26499 return AtomicExpansionKind::None;
26500
26501 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
26502 : AtomicExpansionKind::None;
26503}
26504
26505TargetLowering::AtomicExpansionKind
26506X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
26507 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
26508 Type *MemType = AI->getType();
26509
26510 // If the operand is too big, we must see if cmpxchg8/16b is available
26511 // and default to library calls otherwise.
26512 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
26513 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
26514 : AtomicExpansionKind::None;
26515 }
26516
26517 AtomicRMWInst::BinOp Op = AI->getOperation();
26518 switch (Op) {
26519 default:
26520   llvm_unreachable("Unknown atomic operation");
26521 case AtomicRMWInst::Xchg:
26522 case AtomicRMWInst::Add:
26523 case AtomicRMWInst::Sub:
26524 // It's better to use xadd, xsub or xchg for these in all cases.
26525 return AtomicExpansionKind::None;
26526 case AtomicRMWInst::Or:
26527 case AtomicRMWInst::And:
26528 case AtomicRMWInst::Xor:
26529 // If the atomicrmw's result isn't actually used, we can just add a "lock"
26530 // prefix to a normal instruction for these operations.
26531 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
26532 : AtomicExpansionKind::None;
26533 case AtomicRMWInst::Nand:
26534 case AtomicRMWInst::Max:
26535 case AtomicRMWInst::Min:
26536 case AtomicRMWInst::UMax:
26537 case AtomicRMWInst::UMin:
26538 case AtomicRMWInst::FAdd:
26539 case AtomicRMWInst::FSub:
26540 // These always require a non-trivial set of data operations on x86. We must
26541 // use a cmpxchg loop.
26542 return AtomicExpansionKind::CmpXChg;
26543 }
26544}
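To make the distinction above concrete, here is a hedged C++-level illustration; it assumes a typical x86-64 compiler and the helper names are mine. An atomic OR whose result is ignored can be emitted as a single lock-prefixed instruction, while an operation such as max has no single x86 RMW form and is expressed as the compare-exchange retry loop that AtomicExpansionKind::CmpXChg requests:

#include <atomic>

// Result unused: a plain `lock or` suffices, no cmpxchg loop is needed.
void setFlag(std::atomic<unsigned> &Flags) {
  Flags.fetch_or(1u, std::memory_order_seq_cst);
}

// No single x86 instruction computes an atomic max, so it becomes a loop.
unsigned atomicMax(std::atomic<unsigned> &V, unsigned X) {
  unsigned Cur = V.load(std::memory_order_relaxed);
  while (!V.compare_exchange_weak(Cur, Cur < X ? X : Cur,
                                  std::memory_order_seq_cst,
                                  std::memory_order_relaxed)) {
    // Cur is reloaded by compare_exchange_weak on failure; retry.
  }
  return Cur; // value observed when the exchange succeeded
}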
26545
26546LoadInst *
26547X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
26548 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
26549 Type *MemType = AI->getType();
26550 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
26551 // there is no benefit in turning such RMWs into loads, and it is actually
26552 // harmful as it introduces a mfence.
26553 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
26554 return nullptr;
26555
26556 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
26557 // lowering available in lowerAtomicArith.
26558 // TODO: push more cases through this path.
26559 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
26560 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
26561 AI->use_empty())
26562 return nullptr;
26563
26564 auto Builder = IRBuilder<>(AI);
26565 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
26566 auto SSID = AI->getSyncScopeID();
26567 // We must restrict the ordering to avoid generating loads with Release or
26568 // ReleaseAcquire orderings.
26569 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
26570
26571 // Before the load we need a fence. Here is an example lifted from
26572 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
26573 // is required:
26574 // Thread 0:
26575 // x.store(1, relaxed);
26576 // r1 = y.fetch_add(0, release);
26577 // Thread 1:
26578 // y.fetch_add(42, acquire);
26579 // r2 = x.load(relaxed);
26580 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
26581 // lowered to just a load without a fence. An mfence flushes the store buffer,
26582 // making the optimization clearly correct.
26583 // FIXME: it is required if isReleaseOrStronger(Order), but it is not clear
26584 // otherwise; we might be able to be more aggressive on relaxed idempotent
26585 // rmw. In practice, they do not look useful, so we don't try to be
26586 // especially clever.
26587 if (SSID == SyncScope::SingleThread)
26588 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
26589 // the IR level, so we must wrap it in an intrinsic.
26590 return nullptr;
26591
26592 if (!Subtarget.hasMFence())
26593 // FIXME: it might make sense to use a locked operation here but on a
26594 // different cache-line to prevent cache-line bouncing. In practice it
26595 // is probably a small win, and x86 processors without mfence are rare
26596 // enough that we do not bother.
26597 return nullptr;
26598
26599 Function *MFence =
26600 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
26601 Builder.CreateCall(MFence, {});
26602
26603 // Finally we can emit the atomic load.
26604 LoadInst *Loaded =
26605 Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(),
26606 AI->getType()->getPrimitiveSizeInBits());
26607 Loaded->setAtomic(Order, SSID);
26608 AI->replaceAllUsesWith(Loaded);
26609 AI->eraseFromParent();
26610 return Loaded;
26611}
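Informally, the rewrite performed by this hook corresponds to the following source-level transformation. This is a sketch under the assumption of an SSE2 target; the function names are mine, and seq_cst is shown for simplicity where the real code picks the ordering via getStrongestFailureOrdering:

#include <atomic>
#include <immintrin.h>

// An idempotent RMW whose only purpose is ordering, e.g. fetch_add(0)...
unsigned idempotentRMW(std::atomic<unsigned> &X) {
  return X.fetch_add(0, std::memory_order_seq_cst);
}

// ...is replaced by an mfence followed by a plain atomic load.
unsigned fencedLoad(std::atomic<unsigned> &X) {
  _mm_mfence();                             // flush the store buffer first
  return X.load(std::memory_order_seq_cst); // then read the location
}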
26612
26613bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
26614 if (!SI.isUnordered())
26615 return false;
26616 return ExperimentalUnorderedISEL;
26617}
26618bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
26619 if (!LI.isUnordered())
26620 return false;
26621 return ExperimentalUnorderedISEL;
26622}
26623
26624
26625/// Emit a locked operation on a stack location which does not change any
26626/// memory location, but does involve a lock prefix. Location is chosen to be
26627/// a) very likely accessed only by a single thread to minimize cache traffic,
26628/// and b) definitely dereferenceable. Returns the new Chain result.
26629static SDValue emitLockedStackOp(SelectionDAG &DAG,
26630 const X86Subtarget &Subtarget,
26631 SDValue Chain, SDLoc DL) {
26632 // Implementation notes:
26633 // 1) LOCK prefix creates a full read/write reordering barrier for memory
26634 // operations issued by the current processor. As such, the location
26635 // referenced is not relevant for the ordering properties of the instruction.
26636 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
26637 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
26638 // 2) Using an immediate operand appears to be the best encoding choice
26639 // here since it doesn't require an extra register.
26640 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
26641 // is small enough it might just be measurement noise.)
26642 // 4) When choosing offsets, there are several contributing factors:
26643 // a) If there's no redzone, we default to TOS. (We could allocate a cache
26644 // line aligned stack object to improve this case.)
26645 // b) To minimize our chances of introducing a false dependence, we prefer
26646 // to offset the stack usage from TOS slightly.
26647 // c) To minimize concerns about cross thread stack usage - in particular,
26648 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
26649 // captures state in the TOS frame and accesses it from many threads -
26650 // we want to use an offset such that the offset is in a distinct cache
26651 // line from the TOS frame.
26652 //
26653 // For a general discussion of the tradeoffs and benchmark results, see:
26654 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
26655
26656 auto &MF = DAG.getMachineFunction();
26657 auto &TFL = *Subtarget.getFrameLowering();
26658 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
26659
26660 if (Subtarget.is64Bit()) {
26661 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
26662 SDValue Ops[] = {
26663 DAG.getRegister(X86::RSP, MVT::i64), // Base
26664 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
26665 DAG.getRegister(0, MVT::i64), // Index
26666 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
26667 DAG.getRegister(0, MVT::i16), // Segment.
26668 Zero,
26669 Chain};
26670 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
26671 MVT::Other, Ops);
26672 return SDValue(Res, 1);
26673 }
26674
26675 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
26676 SDValue Ops[] = {
26677 DAG.getRegister(X86::ESP, MVT::i32), // Base
26678 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
26679 DAG.getRegister(0, MVT::i32), // Index
26680 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
26681 DAG.getRegister(0, MVT::i16), // Segment.
26682 Zero,
26683 Chain
26684 };
26685 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
26686 MVT::Other, Ops);
26687 return SDValue(Res, 1);
26688}
26689
26690static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
26691 SelectionDAG &DAG) {
26692 SDLoc dl(Op);
26693 AtomicOrdering FenceOrdering =
26694 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
26695 SyncScope::ID FenceSSID =
26696 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
26697
26698 // The only fence that needs an instruction is a sequentially-consistent
26699 // cross-thread fence.
26700 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
26701 FenceSSID == SyncScope::System) {
26702 if (Subtarget.hasMFence())
26703 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
26704
26705 SDValue Chain = Op.getOperand(0);
26706 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
26707 }
26708
26709 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
26710 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
26711}
26712
26713static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
26714 SelectionDAG &DAG) {
26715 MVT T = Op.getSimpleValueType();
26716 SDLoc DL(Op);
26717 unsigned Reg = 0;
26718 unsigned size = 0;
26719 switch(T.SimpleTy) {
26720 default: llvm_unreachable("Invalid value type!");
26721 case MVT::i8: Reg = X86::AL; size = 1; break;
26722 case MVT::i16: Reg = X86::AX; size = 2; break;
26723 case MVT::i32: Reg = X86::EAX; size = 4; break;
26724 case MVT::i64:
26725   assert(Subtarget.is64Bit() && "Node not type legal!");
26726 Reg = X86::RAX; size = 8;
26727 break;
26728 }
26729 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
26730 Op.getOperand(2), SDValue());
26731 SDValue Ops[] = { cpIn.getValue(0),
26732 Op.getOperand(1),
26733 Op.getOperand(3),
26734 DAG.getTargetConstant(size, DL, MVT::i8),
26735 cpIn.getValue(1) };
26736 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26737 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
26738 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
26739 Ops, T, MMO);
26740
26741 SDValue cpOut =
26742 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
26743 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
26744 MVT::i32, cpOut.getValue(2));
26745 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
26746
26747 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26748 cpOut, Success, EFLAGS.getValue(1));
26749}
26750
26751// Create MOVMSKB, taking into account whether we need to split for AVX1.
26752static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
26753 const X86Subtarget &Subtarget) {
26754 MVT InVT = V.getSimpleValueType();
26755
26756 if (InVT == MVT::v64i8) {
26757 SDValue Lo, Hi;
26758 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
26759 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
26760 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
26761 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
26762 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
26763 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
26764 DAG.getConstant(32, DL, MVT::i8));
26765 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
26766 }
26767 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
26768 SDValue Lo, Hi;
26769 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
26770 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
26771 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
26772 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
26773 DAG.getConstant(16, DL, MVT::i8));
26774 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
26775 }
26776
26777 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
26778}
26779
26780static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
26781 SelectionDAG &DAG) {
26782 SDValue Src = Op.getOperand(0);
26783 MVT SrcVT = Src.getSimpleValueType();
26784 MVT DstVT = Op.getSimpleValueType();
26785
26786 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
26787 // half to v32i1 and concatenating the result.
26788 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
26789 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
26790 assert(Subtarget.hasBWI() && "Expected BWI target");
26791 SDLoc dl(Op);
26792 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
26793 DAG.getIntPtrConstant(0, dl));
26794 Lo = DAG.getBitcast(MVT::v32i1, Lo);
26795 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
26796 DAG.getIntPtrConstant(1, dl));
26797 Hi = DAG.getBitcast(MVT::v32i1, Hi);
26798 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
26799 }
26800
26801 // Custom splitting for BWI types when AVX512F is available but BWI isn't.
26802 if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
26803 DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
26804 SDLoc dl(Op);
26805 SDValue Lo, Hi;
26806 std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
26807 MVT CastVT = DstVT.getHalfNumVectorElementsVT();
26808 Lo = DAG.getBitcast(CastVT, Lo);
26809 Hi = DAG.getBitcast(CastVT, Hi);
26810 return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
26811 }
26812
26813 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
26814 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
26815 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
26816 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
26817 SDLoc DL(Op);
26818 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
26819 V = getPMOVMSKB(DL, V, DAG, Subtarget);
26820 return DAG.getZExtOrTrunc(V, DL, DstVT);
26821 }
26822
26823 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
26824         SrcVT == MVT::i64) && "Unexpected VT!");
26825
26826 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
26827 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
26828 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
26829 // This conversion needs to be expanded.
26830 return SDValue();
26831
26832 SDLoc dl(Op);
26833 if (SrcVT.isVector()) {
26834 // Widen the input vector in the case of MVT::v2i32.
26835 // Example: from MVT::v2i32 to MVT::v4i32.
26836 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
26837 SrcVT.getVectorNumElements() * 2);
26838 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
26839 DAG.getUNDEF(SrcVT));
26840 } else {
26841 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
26842        "Unexpected source type in LowerBITCAST");
26843 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
26844 }
26845
26846 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
26847 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
26848
26849 if (DstVT == MVT::x86mmx)
26850 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
26851
26852 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
26853 DAG.getIntPtrConstant(0, dl));
26854}
26855
26856/// Compute the horizontal sum of bytes in V for the elements of VT.
26857///
26858/// Requires V to be a byte vector and VT to be an integer vector type with
26859/// wider elements than V's type. The width of the elements of VT determines
26860/// how many bytes of V are summed horizontally to produce each element of the
26861/// result.
26862static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
26863 const X86Subtarget &Subtarget,
26864 SelectionDAG &DAG) {
26865 SDLoc DL(V);
26866 MVT ByteVecVT = V.getSimpleValueType();
26867 MVT EltVT = VT.getVectorElementType();
26868 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
26869        "Expected value to have byte element type.");
26870 assert(EltVT != MVT::i8 &&
26871        "Horizontal byte sum only makes sense for wider elements!");
26872 unsigned VecSize = VT.getSizeInBits();
26873 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
26874
26875 // The PSADBW instruction horizontally adds all bytes and leaves the result in
26876 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
26877 if (EltVT == MVT::i64) {
26878 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
26879 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
26880 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
26881 return DAG.getBitcast(VT, V);
26882 }
26883
26884 if (EltVT == MVT::i32) {
26885 // We unpack the low half and high half into i32s interleaved with zeros so
26886 // that we can use PSADBW to horizontally sum them. The most useful part of
26887 // this is that it lines up the results of two PSADBW instructions to be
26888 // two v2i64 vectors which concatenated are the 4 population counts. We can
26889 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
26890 SDValue Zeros = DAG.getConstant(0, DL, VT);
26891 SDValue V32 = DAG.getBitcast(VT, V);
26892 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
26893 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
26894
26895 // Do the horizontal sums into two v2i64s.
26896 Zeros = DAG.getConstant(0, DL, ByteVecVT);
26897 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
26898 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
26899 DAG.getBitcast(ByteVecVT, Low), Zeros);
26900 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
26901 DAG.getBitcast(ByteVecVT, High), Zeros);
26902
26903 // Merge them together.
26904 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
26905 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
26906 DAG.getBitcast(ShortVecVT, Low),
26907 DAG.getBitcast(ShortVecVT, High));
26908
26909 return DAG.getBitcast(VT, V);
26910 }
26911
26912 // The only element type left is i16.
26913 assert(EltVT == MVT::i16 && "Unknown how to handle type");
26914
26915 // To obtain the pop count for each i16 element starting from the pop count for
26916 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
26917 // right by 8. It is important to shift as i16s as i8 vector shift isn't
26918 // directly supported.
26919 SDValue ShifterV = DAG.getConstant(8, DL, VT);
26920 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
26921 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
26922 DAG.getBitcast(ByteVecVT, V));
26923 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
26924}
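The i16 tail above can be modelled in scalar code: with per-byte pop counts already in place, a byte-wise add of (v << 8) and v followed by a logical i16 right shift by 8 yields the element's pop count, and the byte-wise add keeps carries from crossing the byte boundary. A standalone sketch, illustrative only:

#include <bitset>
#include <cassert>
#include <cstdint>

// PerBytePopCnt holds the pop count of each byte of the original i16 element.
static uint16_t byteSumToI16(uint16_t PerBytePopCnt) {
  uint8_t Lo = PerBytePopCnt & 0xFF;
  uint8_t Hi = PerBytePopCnt >> 8;
  uint8_t HiByteOfSum = static_cast<uint8_t>(Hi + Lo); // vXi8 ADD of (v<<8)+v
  return HiByteOfSum;                                  // logical i16 SRL by 8
}

int main() {
  for (unsigned v = 0; v != 65536; ++v) {
    uint16_t PerByte =
        static_cast<uint16_t>((std::bitset<8>(v >> 8).count() << 8) |
                              std::bitset<8>(v & 0xFF).count());
    assert(byteSumToI16(PerByte) == std::bitset<16>(v).count());
  }
  return 0;
}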
26925
26926static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
26927 const X86Subtarget &Subtarget,
26928 SelectionDAG &DAG) {
26929 MVT VT = Op.getSimpleValueType();
26930 MVT EltVT = VT.getVectorElementType();
26931 int NumElts = VT.getVectorNumElements();
26932 (void)EltVT;
26933 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
26934
26935 // Implement a lookup table in register by using an algorithm based on:
26936 // http://wm.ite.pl/articles/sse-popcount.html
26937 //
26938 // The general idea is that every lower byte nibble in the input vector is an
26939 // index into an in-register pre-computed pop count table. We then split up the
26940 // input vector into two new ones: (1) a vector with only the shifted-right
26941 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
26942 // masked out higher ones) for each byte. PSHUFB is used separately with both
26943 // to index the in-register table. Next, both are added and the result is an
26944 // i8 vector where each element contains the pop count for the input byte.
26945 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
26946 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
26947 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
26948 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
26949
26950 SmallVector<SDValue, 64> LUTVec;
26951 for (int i = 0; i < NumElts; ++i)
26952 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
26953 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
26954 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
26955
26956 // High nibbles
26957 SDValue FourV = DAG.getConstant(4, DL, VT);
26958 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
26959
26960 // Low nibbles
26961 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
26962
26963 // The input vector is used as the shuffle mask that index elements into the
26964 // LUT. After counting low and high nibbles, add the vector to obtain the
26965 // final pop count per i8 element.
26966 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
26967 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
26968 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
26969}
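The scalar equivalent of this LUT scheme is simply popcount(b) = LUT[b >> 4] + LUT[b & 0xF]; the vector code performs both lookups with PSHUFB and adds the results. A minimal sketch that verifies the table, illustrative only and not part of this file:

#include <bitset>
#include <cassert>
#include <cstdint>

// Pop count of each 4-bit nibble value, same table as the vector lowering.
static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                1, 2, 2, 3, 2, 3, 3, 4};

static uint8_t popcnt8ViaLUT(uint8_t B) {
  return static_cast<uint8_t>(LUT[B >> 4] + LUT[B & 0x0F]);
}

int main() {
  for (unsigned v = 0; v != 256; ++v)
    assert(popcnt8ViaLUT(static_cast<uint8_t>(v)) == std::bitset<8>(v).count());
  return 0;
}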
26970
26971// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
26972// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
26973static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
26974 SelectionDAG &DAG) {
26975 MVT VT = Op.getSimpleValueType();
26976 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
26977        "Unknown CTPOP type to handle");
26978 SDLoc DL(Op.getNode());
26979 SDValue Op0 = Op.getOperand(0);
26980
26981 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
26982 if (Subtarget.hasVPOPCNTDQ()) {
26983 unsigned NumElems = VT.getVectorNumElements();
26984 assert((VT.getVectorElementType() == MVT::i8 ||
26985         VT.getVectorElementType() == MVT::i16) && "Unexpected type");
26986 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
26987 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
26988 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
26989 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
26990 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
26991 }
26992 }
26993
26994 // Decompose 256-bit ops into smaller 128-bit ops.
26995 if (VT.is256BitVector() && !Subtarget.hasInt256())
26996 return Lower256IntUnary(Op, DAG);
26997
26998 // Decompose 512-bit ops into smaller 256-bit ops.
26999 if (VT.is512BitVector() && !Subtarget.hasBWI())
27000 return Lower512IntUnary(Op, DAG);
27001
27002 // For element types greater than i8, do vXi8 pop counts and a bytesum.
27003 if (VT.getScalarType() != MVT::i8) {
27004 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
27005 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
27006 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
27007 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
27008 }
27009
27010 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
27011 if (!Subtarget.hasSSSE3())
27012 return SDValue();
27013
27014 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
27015}
27016
27017static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
27018 SelectionDAG &DAG) {
27019 assert(Op.getSimpleValueType().isVector() &&
27020        "We only do custom lowering for vector population count.");
27021 return LowerVectorCTPOP(Op, Subtarget, DAG);
27022}
27023
27024static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
27025 MVT VT = Op.getSimpleValueType();
27026 SDValue In = Op.getOperand(0);
27027 SDLoc DL(Op);
27028
27029 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
27030 // perform the BITREVERSE.
27031 if (!VT.isVector()) {
27032 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
27033 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
27034 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
27035 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
27036 DAG.getIntPtrConstant(0, DL));
27037 }
27038
27039 int NumElts = VT.getVectorNumElements();
27040 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
27041
27042 // Decompose 256-bit ops into smaller 128-bit ops.
27043 if (VT.is256BitVector())
27044 return Lower256IntUnary(Op, DAG);
27045
27046   assert(VT.is128BitVector() &&
27047          "Only 128-bit vector bitreverse lowering supported.");
27048
27049 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
27050 // perform the BSWAP in the shuffle.
27051   // It's best to shuffle using the second operand, as this will implicitly
27052   // allow memory folding for multiple vectors.
27053 SmallVector<SDValue, 16> MaskElts;
27054 for (int i = 0; i != NumElts; ++i) {
27055 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
27056 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
27057 int PermuteByte = SourceByte | (2 << 5);
27058 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
27059 }
27060 }
27061
27062 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
27063 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
27064 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
27065 Res, Mask);
27066 return DAG.getBitcast(VT, Res);
27067}
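To make the mask construction above concrete, the following standalone sketch replays the same loop for a hypothetical v4i32 input and prints the sixteen VPPERM control bytes; it is purely illustrative and uses no LLVM APIs.

#include <cstdio>

int main() {
  // Same loop as LowerBITREVERSE_XOP, specialized to v4i32: bytes come from
  // the second source (offset 16), in reverse order within each element (the
  // implicit BSWAP), with (2 << 5) selecting VPPERM's "reverse bits" control.
  const int NumElts = 4, ScalarSizeInBytes = 4;
  for (int i = 0; i != NumElts; ++i) {
    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
      int PermuteByte = SourceByte | (2 << 5);
      std::printf("0x%02X ", PermuteByte);
    }
  }
  std::printf("\n");
  return 0;
}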
27068
27069static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
27070 SelectionDAG &DAG) {
27071 MVT VT = Op.getSimpleValueType();
27072
27073 if (Subtarget.hasXOP() && !VT.is512BitVector())
27074 return LowerBITREVERSE_XOP(Op, DAG);
27075
27076   assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
27077
27078 SDValue In = Op.getOperand(0);
27079 SDLoc DL(Op);
27080
27081 // Split v8i64/v16i32 without BWI so that we can still use the PSHUFB
27082 // lowering.
27083 if (VT == MVT::v8i64 || VT == MVT::v16i32) {
27084     assert(!Subtarget.hasBWI() && "BWI should Expand BITREVERSE");
27085 return Lower512IntUnary(Op, DAG);
27086 }
27087
27088 unsigned NumElts = VT.getVectorNumElements();
27089   assert(VT.getScalarType() == MVT::i8 &&
27090          "Only byte vector BITREVERSE supported");
27091
27092 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
27093 if (VT.is256BitVector() && !Subtarget.hasInt256())
27094 return Lower256IntUnary(Op, DAG);
27095
27096   // Perform BITREVERSE using PSHUFB lookups. Each byte is split into its two
27097   // nibbles, and a PSHUFB lookup finds the bit-reverse of each 0-15 value,
27098   // with the result moved to the other nibble.
27099 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
27100 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
27101 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
27102
27103 const int LoLUT[16] = {
27104 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
27105 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
27106 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
27107 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
27108 const int HiLUT[16] = {
27109 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
27110 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
27111 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
27112 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
27113
27114 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
27115 for (unsigned i = 0; i < NumElts; ++i) {
27116 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
27117 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
27118 }
27119
27120 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
27121 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
27122 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
27123 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
27124 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
27125}
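As a cross-check on the two lookup tables above, here is a scalar C++ sketch of the same nibble-split bit reversal applied to a single byte; the tables are copied from LoLUT/HiLUT, and the function name is only for illustration.

#include <cstdint>
#include <cstdio>

// Scalar model of the PSHUFB-based lowering: look up the bit-reverse of the
// low nibble (result lands in the high nibble) and of the high nibble
// (result lands in the low nibble), then OR the two halves together.
static const uint8_t LoLUT[16] = {0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0,
                                  0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0};
static const uint8_t HiLUT[16] = {0x00, 0x08, 0x04, 0x0C, 0x02, 0x0A, 0x06, 0x0E,
                                  0x01, 0x09, 0x05, 0x0D, 0x03, 0x0B, 0x07, 0x0F};

static uint8_t reverseByte(uint8_t B) {
  return LoLUT[B & 0xF] | HiLUT[B >> 4];
}

int main() {
  // 0xB4 = 1011'0100 reversed is 0010'1101 = 0x2D.
  std::printf("%#x\n", reverseByte(0xB4));
  return 0;
}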
27126
27127static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
27128 const X86Subtarget &Subtarget) {
27129 unsigned NewOpc = 0;
27130 switch (N->getOpcode()) {
27131 case ISD::ATOMIC_LOAD_ADD:
27132 NewOpc = X86ISD::LADD;
27133 break;
27134 case ISD::ATOMIC_LOAD_SUB:
27135 NewOpc = X86ISD::LSUB;
27136 break;
27137 case ISD::ATOMIC_LOAD_OR:
27138 NewOpc = X86ISD::LOR;
27139 break;
27140 case ISD::ATOMIC_LOAD_XOR:
27141 NewOpc = X86ISD::LXOR;
27142 break;
27143 case ISD::ATOMIC_LOAD_AND:
27144 NewOpc = X86ISD::LAND;
27145 break;
27146 default:
27147 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode")::llvm::llvm_unreachable_internal("Unknown ATOMIC_LOAD_ opcode"
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 27147)
;
27148 }
27149
27150 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
27151
27152 return DAG.getMemIntrinsicNode(
27153 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
27154 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
27155 /*MemVT=*/N->getSimpleValueType(0), MMO);
27156}
27157
27158/// Lower atomic_load_ops into LOCK-prefixed operations.
27159static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
27160 const X86Subtarget &Subtarget) {
27161 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
27162 SDValue Chain = N->getOperand(0);
27163 SDValue LHS = N->getOperand(1);
27164 SDValue RHS = N->getOperand(2);
27165 unsigned Opc = N->getOpcode();
27166 MVT VT = N->getSimpleValueType(0);
27167 SDLoc DL(N);
27168
27169 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
27170 // can only be lowered when the result is unused. They should have already
27171 // been transformed into a cmpxchg loop in AtomicExpand.
27172 if (N->hasAnyUseOfValue(0)) {
27173 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
27174 // select LXADD if LOCK_SUB can't be selected.
27175 if (Opc == ISD::ATOMIC_LOAD_SUB) {
27176 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
27177 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
27178 RHS, AN->getMemOperand());
27179 }
27180     assert(Opc == ISD::ATOMIC_LOAD_ADD &&
27181            "Used AtomicRMW ops other than Add should have been expanded!");
27182 return N;
27183 }
27184
27185   // Specialized lowering for the canonical form of an idempotent atomicrmw.
27186 // The core idea here is that since the memory location isn't actually
27187 // changing, all we need is a lowering for the *ordering* impacts of the
27188   // atomicrmw. As such, we can choose a different operation and memory
27189 // location to minimize impact on other code.
27190 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
27191     // On X86, the only ordering that actually requires an instruction is a
27192     // seq_cst ordering that isn't SingleThread; everything else just needs to
27193     // be preserved during codegen and then dropped. Note that we expect (but
27194     // don't assume) that orderings other than seq_cst and acq_rel have been
27195     // canonicalized to a store or load.
27196 if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
27197 AN->getSyncScopeID() == SyncScope::System) {
27198 // Prefer a locked operation against a stack location to minimize cache
27199 // traffic. This assumes that stack locations are very likely to be
27200 // accessed only by the owning thread.
27201 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
27202       assert(!N->hasAnyUseOfValue(0));
27203 // NOTE: The getUNDEF is needed to give something for the unused result 0.
27204 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
27205 DAG.getUNDEF(VT), NewChain);
27206 }
27207 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
27208 SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
27209     assert(!N->hasAnyUseOfValue(0));
27210 // NOTE: The getUNDEF is needed to give something for the unused result 0.
27211 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
27212 DAG.getUNDEF(VT), NewChain);
27213 }
27214
27215 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
27216 // RAUW the chain, but don't worry about the result, as it's unused.
27217   assert(!N->hasAnyUseOfValue(0));
27218 // NOTE: The getUNDEF is needed to give something for the unused result 0.
27219 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
27220 DAG.getUNDEF(VT), LockOp.getValue(1));
27221}
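The ATOMIC_LOAD_SUB rewrite above rests on the identity that subtracting v is the same as adding -v; a minimal, target-independent sketch with std::atomic (all names local to this example):

#include <atomic>
#include <cassert>

int main() {
  std::atomic<int> A{10}, B{10};
  int V = 3;
  // fetch_sub(V) and fetch_add(-V) leave the memory in the same state, which
  // is why the lowering can turn (atomic_load_sub p, v) into
  // (atomic_load_add p, 0 - v) and still select LXADD when the result is used.
  A.fetch_sub(V);
  B.fetch_add(-V);
  assert(A.load() == B.load());
  return 0;
}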
27222
27223static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
27224 const X86Subtarget &Subtarget) {
27225 auto *Node = cast<AtomicSDNode>(Op.getNode());
27226 SDLoc dl(Node);
27227 EVT VT = Node->getMemoryVT();
27228
27229 bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
27230 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
27231
27232 // If this store is not sequentially consistent and the type is legal
27233 // we can just keep it.
27234 if (!IsSeqCst && IsTypeLegal)
27235 return Op;
27236
27237 if (VT == MVT::i64 && !IsTypeLegal) {
27238 // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled.
27239 // FIXME: Use movlps with SSE1.
27240 // FIXME: Use fist with X87.
27241 bool NoImplicitFloatOps =
27242 DAG.getMachineFunction().getFunction().hasFnAttribute(
27243 Attribute::NoImplicitFloat);
27244 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
27245 Subtarget.hasSSE2()) {
27246 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
27247 Node->getOperand(2));
27248 SDVTList Tys = DAG.getVTList(MVT::Other);
27249 SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() };
27250 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys,
27251 Ops, MVT::i64,
27252 Node->getMemOperand());
27253
27254 // If this is a sequentially consistent store, also emit an appropriate
27255 // barrier.
27256 if (IsSeqCst)
27257 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
27258
27259 return Chain;
27260 }
27261 }
27262
27263 // Convert seq_cst store -> xchg
27264 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
27265 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
27266 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
27267 Node->getMemoryVT(),
27268 Node->getOperand(0),
27269 Node->getOperand(1), Node->getOperand(2),
27270 Node->getMemOperand());
27271 return Swap.getValue(1);
27272}
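At the source level, the seq_cst-store-to-xchg conversion corresponds to the fact that a sequentially consistent store and an exchange whose result is discarded give the same ordering guarantees; a small sketch using only standard std::atomic calls (the note on machine code reflects the usual x86-64 behaviour rather than anything this file guarantees):

#include <atomic>

std::atomic<long> G{0};

void store_seq_cst(long V) {
  // Typically lowered on x86-64 to an xchg (or mov + mfence).
  G.store(V, std::memory_order_seq_cst);
}

void store_via_swap(long V) {
  // The same ordering, expressed as a swap whose result is unused,
  // mirroring the ATOMIC_SWAP node built in LowerATOMIC_STORE.
  (void)G.exchange(V, std::memory_order_seq_cst);
}

int main() {
  store_seq_cst(1);
  store_via_swap(2);
  return G.load() == 2 ? 0 : 1;
}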
27273
27274static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
27275 SDNode *N = Op.getNode();
27276 MVT VT = N->getSimpleValueType(0);
27277
27278 // Let legalize expand this if it isn't a legal type yet.
27279 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
27280 return SDValue();
27281
27282 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
27283 SDLoc DL(N);
27284
27285 // Set the carry flag.
27286 SDValue Carry = Op.getOperand(2);
27287 EVT CarryVT = Carry.getValueType();
27288 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
27289 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
27290 Carry, DAG.getConstant(NegOne, DL, CarryVT));
27291
27292 unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
27293 SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
27294 Op.getOperand(1), Carry.getValue(1));
27295
27296 SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
27297 if (N->getValueType(1) == MVT::i1)
27298 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
27299
27300 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
27301}
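The Carry + all-ones addition above is the standard way to rematerialize CF: in n-bit unsigned arithmetic, c + (2^n - 1) produces a carry-out exactly when c != 0. A quick scalar check using the GCC/Clang __builtin_add_overflow builtin (the helper name is illustrative):

#include <cassert>
#include <cstdint>

// Returns the carry-out of C + 0xFFFFFFFF, which is 1 iff C != 0.
// This is the scalar identity behind materializing the incoming carry
// with X86ISD::ADD(Carry, -1) before feeding it to ADC/SBB.
static bool carryOutOfAddAllOnes(uint32_t C) {
  uint32_t Unused;
  return __builtin_add_overflow(C, UINT32_MAX, &Unused);
}

int main() {
  assert(carryOutOfAddAllOnes(0) == false);
  assert(carryOutOfAddAllOnes(1) == true);
  assert(carryOutOfAddAllOnes(123) == true);
  return 0;
}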
27302
27303static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
27304 SelectionDAG &DAG) {
27305   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
27306
27307 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
27308 // which returns the values as { float, float } (in XMM0) or
27309 // { double, double } (which is returned in XMM0, XMM1).
27310 SDLoc dl(Op);
27311 SDValue Arg = Op.getOperand(0);
27312 EVT ArgVT = Arg.getValueType();
27313 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
27314
27315 TargetLowering::ArgListTy Args;
27316 TargetLowering::ArgListEntry Entry;
27317
27318 Entry.Node = Arg;
27319 Entry.Ty = ArgTy;
27320 Entry.IsSExt = false;
27321 Entry.IsZExt = false;
27322 Args.push_back(Entry);
27323
27324 bool isF64 = ArgVT == MVT::f64;
27325 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
27326 // the small struct {f32, f32} is returned in (eax, edx). For f64,
27327 // the results are returned via SRet in memory.
27328 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27329 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
27330 const char *LibcallName = TLI.getLibcallName(LC);
27331 SDValue Callee =
27332 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
27333
27334 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
27335 : (Type *)VectorType::get(ArgTy, 4);
27336
27337 TargetLowering::CallLoweringInfo CLI(DAG);
27338 CLI.setDebugLoc(dl)
27339 .setChain(DAG.getEntryNode())
27340 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
27341
27342 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
27343
27344 if (isF64)
27345 // Returned in xmm0 and xmm1.
27346 return CallResult.first;
27347
27348   // Returned in bits 0:31 and 32:63 of xmm0.
27349 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
27350 CallResult.first, DAG.getIntPtrConstant(0, dl));
27351 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
27352 CallResult.first, DAG.getIntPtrConstant(1, dl));
27353 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
27354 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
27355}
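For the f64 path, the comments above say the { double, double } result of __sincos_stret comes back in XMM0/XMM1; the prototype below is an assumption inferred from that description (the symbol is Darwin-specific), shown only to illustrate the shape of the libcall.

#include <cstdio>

// Assumed prototype for Darwin's __sincos_stret on x86-64, per the comments
// in LowerFSINCOS: the { double, double } result is returned in XMM0/XMM1.
struct SinCos {
  double Sin;
  double Cos;
};
extern "C" SinCos __sincos_stret(double);

int main() {
  SinCos SC = __sincos_stret(0.0);  // links on Darwin only
  std::printf("sin=%f cos=%f\n", SC.Sin, SC.Cos);
  return 0;
}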
27356
27357/// Widen a vector input to a vector of NVT. The
27358/// input vector must have the same element type as NVT.
27359static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
27360 bool FillWithZeroes = false) {
27361 // Check if InOp already has the right width.
27362 MVT InVT = InOp.getSimpleValueType();
27363 if (InVT == NVT)
27364 return InOp;
27365
27366 if (InOp.isUndef())
27367 return DAG.getUNDEF(NVT);
27368
27369   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
27370          "input and widen element type must match");
27371
27372 unsigned InNumElts = InVT.getVectorNumElements();
27373 unsigned WidenNumElts = NVT.getVectorNumElements();
27374   assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
27375          "Unexpected request for vector widening");
27376
27377 SDLoc dl(InOp);
27378 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
27379 InOp.getNumOperands() == 2) {
27380 SDValue N1 = InOp.getOperand(1);
27381 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
27382 N1.isUndef()) {
27383 InOp = InOp.getOperand(0);
27384 InVT = InOp.getSimpleValueType();
27385 InNumElts = InVT.getVectorNumElements();
27386 }
27387 }
27388 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
27389 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
27390 SmallVector<SDValue, 16> Ops;
27391 for (unsigned i = 0; i < InNumElts; ++i)
27392 Ops.push_back(InOp.getOperand(i));
27393
27394 EVT EltVT = InOp.getOperand(0).getValueType();
27395
27396 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
27397 DAG.getUNDEF(EltVT);
27398 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
27399 Ops.push_back(FillVal);
27400 return DAG.getBuildVector(NVT, dl, Ops);
27401 }
27402 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
27403 DAG.getUNDEF(NVT);
27404 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
27405 InOp, DAG.getIntPtrConstant(0, dl));
27406}
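Conceptually, ExtendToType pads the input vector out to the wider type and fills the new lanes with zero or undef; a scalar model of the zero-fill case, with std::array standing in for a DAG vector value (all names local to this sketch):

#include <array>
#include <cstddef>
#include <cstdio>

// Scalar model of ExtendToType with FillWithZeroes = true: copy the input
// lanes and zero-fill the rest (undef-fill would leave them unspecified).
template <std::size_t InN, std::size_t WideN>
std::array<int, WideN> extendToType(const std::array<int, InN> &In) {
  static_assert(WideN > InN && WideN % InN == 0,
                "Unexpected request for vector widening");
  std::array<int, WideN> Out{}; // value-initialized: new lanes are zero
  for (std::size_t I = 0; I < InN; ++I)
    Out[I] = In[I];
  return Out;
}

int main() {
  auto Wide = extendToType<4, 16>({1, 2, 3, 4}); // v4i32 -> v16i32, zero-filled
  std::printf("%d %d\n", Wide[3], Wide[4]);      // prints "4 0"
  return 0;
}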
27407
27408static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
27409 SelectionDAG &DAG) {
27410   assert(Subtarget.hasAVX512() &&
27411          "MGATHER/MSCATTER are supported on AVX-512 arch only");
27412
27413 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
27414 SDValue Src = N->getValue();
27415 MVT VT = Src.getSimpleValueType();
27416   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
27417 SDLoc dl(Op);
27418
27419 SDValue Scale = N->getScale();
27420 SDValue Index = N->getIndex();
27421 SDValue Mask = N->getMask();
27422 SDValue Chain = N->getChain();
27423 SDValue BasePtr = N->getBasePtr();
27424
27425 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
27426     assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
27427 // If the index is v2i64 and we have VLX we can use xmm for data and index.
27428 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
27429 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27430 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
27431 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
27432 SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
27433 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
27434 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
27435 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
27436 return SDValue(NewScatter.getNode(), 1);
27437 }
27438 return SDValue();
27439 }
27440
27441 MVT IndexVT = Index.getSimpleValueType();
27442 MVT MaskVT = Mask.getSimpleValueType();
27443
27444 // If the index is v2i32, we're being called by type legalization and we
27445 // should just let the default handling take care of it.
27446 if (IndexVT == MVT::v2i32)
27447 return SDValue();
27448
27449   // If we don't have VLX and neither the passthru nor the index is 512 bits,
27450   // we need to widen until one is.
27451 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
27452 !Index.getSimpleValueType().is512BitVector()) {
27453 // Determine how much we need to widen by to get a 512-bit type.
27454 unsigned Factor = std::min(512/VT.getSizeInBits(),
27455 512/IndexVT.getSizeInBits());
27456 unsigned NumElts = VT.getVectorNumElements() * Factor;
27457
27458 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
27459 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
27460 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
27461
27462 Src = ExtendToType(Src, VT, DAG);
27463 Index = ExtendToType(Index, IndexVT, DAG);
27464 Mask = ExtendToType(Mask, MaskVT, DAG, true);
27465 }
27466
27467 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
27468 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
27469 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
27470 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
27471 return SDValue(NewScatter.getNode(), 1);
27472}
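The widening factor above is easiest to see with concrete numbers; the snippet below replays that arithmetic for a hypothetical v4f32 scatter with a v4i64 index on an AVX-512 target without VLX (the printed types are simply the outcome of the calculation):

#include <algorithm>
#include <cstdio>

int main() {
  // Hypothetical scatter: v4f32 data (128 bits) with a v4i64 index (256 bits).
  unsigned DataBits = 128, IndexBits = 256, NumElts = 4;

  // Same arithmetic as LowerMSCATTER: widen both by the largest factor that
  // keeps them within 512 bits, so at least one of them becomes 512-bit.
  unsigned Factor = std::min(512 / DataBits, 512 / IndexBits); // min(4, 2) = 2
  unsigned WideElts = NumElts * Factor;                        // 8

  std::printf("widen to v%uf32 data, v%ui64 index, v%ui1 mask\n",
              WideElts, WideElts, WideElts);
  // -> v8f32 data (256 bits), v8i64 index (512 bits), v8i1 mask.
  return 0;
}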
27473
27474static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
27475 SelectionDAG &DAG) {
27476
27477 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
27478 MVT VT = Op.getSimpleValueType();
27479 MVT ScalarVT = VT.getScalarType();
27480 SDValue Mask = N->getMask();
27481 MVT MaskVT = Mask.getSimpleValueType();
27482 SDValue PassThru = N->getPassThru();
27483 SDLoc dl(Op);
27484
27485 // Handle AVX masked loads which don't support passthru other than 0.
27486 if (MaskVT.getVectorElementType() != MVT::i1) {
27487 // We also allow undef in the isel pattern.
27488 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
27489 return Op;
27490
27491 SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(),
27492 N->getBasePtr(), Mask,
27493 getZeroVector(VT, Subtarget, DAG, dl),
27494 N->getMemoryVT(), N->getMemOperand(),
27495 N->getExtensionType(),
27496 N->isExpandingLoad());
27497 // Emit a blend.
27498 SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad,
27499 PassThru);
27500 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
27501 }
27502
27503   assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
27504          "Expanding masked load is supported on AVX-512 target only!");
27505
27506   assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
27507          "Expanding masked load is supported for 32 and 64-bit types only!");
27508
27509   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
27510          "Cannot lower masked load op.");
27511
27512   assert((ScalarVT.getSizeInBits() >= 32 ||
27513           (Subtarget.hasBWI() &&
27514            (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
27515          "Unsupported masked load op.");
27516
27517   // This operation is legal for targets with VLX, but without
27518   // VLX the vector should be widened to 512 bits.
27519 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
27520 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
27521 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
27522
27523 // Mask element has to be i1.
27524   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
27525          "Unexpected mask type");
27526
27527 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
27528
27529 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
27530 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
27531 N->getBasePtr(), Mask, PassThru,
27532 N->getMemoryVT(), N->getMemOperand(),
27533 N->getExtensionType(),
27534 N->isExpandingLoad());
27535
27536   SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
27537                                 NewLoad.getValue(0),
27538                                 DAG.getIntPtrConstant(0, dl));
27539   SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
27540 return DAG.getMergeValues(RetOps, dl);
27541}
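The blend emitted on the AVX (non-i1 mask) path implements the usual masked-load contract: lanes whose mask bit is clear take the passthru value. A scalar sketch of that equivalence, with plain loops standing in for the masked-load and VSELECT nodes (all names local to this example):

#include <array>
#include <cstddef>
#include <cstdio>

// Scalar model of the AVX path in LowerMLOAD: perform the masked load with a
// zero passthru, then blend the real passthru back in with a per-lane select.
template <std::size_t N>
std::array<int, N> maskedLoadWithPassThru(const std::array<int, N> &Mem,
                                          const std::array<bool, N> &Mask,
                                          const std::array<int, N> &PassThru) {
  std::array<int, N> ZeroLoaded{};
  for (std::size_t I = 0; I < N; ++I)
    ZeroLoaded[I] = Mask[I] ? Mem[I] : 0; // masked load with zero passthru
  std::array<int, N> Result{};
  for (std::size_t I = 0; I < N; ++I)
    Result[I] = Mask[I] ? ZeroLoaded[I] : PassThru[I]; // the VSELECT blend
  return Result;
}

int main() {
  auto R = maskedLoadWithPassThru<4>({10, 20, 30, 40},
                                     {true, false, true, false},
                                     {-1, -2, -3, -4});
  std::printf("%d %d %d %d\n", R[0], R[1], R[2], R[3]); // 10 -2 30 -4
  return 0;
}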
27542
27543static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
27544 SelectionDAG &DAG) {
27545 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
27546 SDValue DataToStore = N->getValue();
27547 MVT VT = DataToStore.getSimpleValueType();
27548 MVT ScalarVT = VT.getScalarType();
27549 SDValue Mask = N->getMask();
27550 SDLoc dl(Op);
27551
27552   assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
27553          "Compressing masked store is supported on AVX-512 target only!");
27554
27555   assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
27556          "Compressing masked store is supported for 32 and 64-bit types only!");
27557
27558   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
27559          "Cannot lower masked store op.");
27560
27561   assert((ScalarVT.getSizeInBits() >= 32 ||
27562           (Subtarget.hasBWI() &&
27563            (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
27564          "Unsupported masked store op.");
27565
27566   // This operation is legal for targets with VLX, but without
27567   // VLX the vector should be widened to 512 bits.
27568 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
27569 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
27570
27571 // Mask element has to be i1.
27572   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
27573          "Unexpected mask type");
27574
27575 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
27576
27577 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
27578 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
27579 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
27580 Mask, N->getMemoryVT(), N->getMemOperand(),
27581 N->isTruncatingStore(), N->isCompressingStore());
27582}
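Both LowerMLOAD and LowerMSTORE widen to a full 512-bit vector when VLX is unavailable; replaying that computation for a hypothetical v8i32 masked store shows the resulting types (numbers only, no LLVM APIs):

#include <cstdio>

int main() {
  // Hypothetical v8i32 masked store on an AVX-512 target without VLX.
  unsigned ScalarBits = 32, NumElts = 8;
  unsigned NumEltsInWideVec = 512 / ScalarBits; // 16
  std::printf("widen v%ui32 -> v%ui32, mask v%ui1 -> v%ui1 (zero-padded)\n",
              NumElts, NumEltsInWideVec, NumElts, NumEltsInWideVec);
  // The mask is extended with zeroes so the extra lanes never store.
  return 0;
}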
27583
27584static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
27585 SelectionDAG &DAG) {
27586   assert(Subtarget.hasAVX2() &&
27587          "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
27588
27589 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
27590 SDLoc dl(Op);
27591 MVT VT = Op.getSimpleValueType();
27592 SDValue Index = N->getIndex();
27593 SDValue Mask = N->getMask();
27594 SDValue PassThru = N->getPassThru();
27595 MVT IndexVT = Index.getSimpleValueType();
27596 MVT MaskVT = Mask.getSimpleValueType();
27597
27598   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
27599
27600 // If the index is v2i32, we're being called by type legalization.
27601 if (IndexVT == MVT::v2i32)
27602 return SDValue();
27603
27604   // If we don't have VLX and neither the passthru nor the index is 512 bits,
27605   // we need to widen until one is.
27606 MVT OrigVT = VT;
27607 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
27608 !IndexVT.is512BitVector()) {
27609 // Determine how much we need to widen by to get a 512-bit type.
27610 unsigned Factor = std::min(512/VT.getSizeInBits(),
27611 512/IndexVT.getSizeInBits());
27612
27613 unsigned NumElts = VT.getVectorNumElements() * Factor;
27614
27615 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
27616 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
27617 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
27618
27619 PassThru = ExtendToType(PassThru, VT, DAG);
27620 Index = ExtendToType(Index, IndexVT, DAG);
27621 Mask = ExtendToType(Mask, MaskVT, DAG, true);
27622 }
27623
27624 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
27625 N->getScale() };
27626 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
27627 DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
27628 N->getMemOperand());
27629 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
27630 NewGather, DAG.getIntPtrConstant(0, dl));
27631 return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
27632}
27633
27634SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
27635 SelectionDAG &DAG) const {
27636 // TODO: Eventually, the lowering of these nodes should be informed by or
27637 // deferred to the GC strategy for the function in which they appear. For
27638 // now, however, they must be lowered to something. Since they are logically
27639 // no-ops in the case of a null GC strategy (or a GC strategy which does not
27640 // require special handling for these nodes), lower them as literal NOOPs for
27641 // the time being.
27642 SmallVector<SDValue, 2> Ops;
27643
27644 Ops.push_back(Op.getOperand(0));
27645 if (Op->getGluedNode())
27646 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
27647
27648 SDLoc OpDL(Op);
27649 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
27650 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
27651
27652 return NOOP;
27653}
27654
27655SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
27656 SelectionDAG &DAG) const {
27657 // TODO: Eventually, the lowering of these nodes should be informed by or
27658 // deferred to the GC strategy for the function in which they appear. For
27659 // now, however, they must be lowered to something. Since they are logically
27660 // no-ops in the case of a null GC strategy (or a GC strategy which does not
27661 // require special handling for these nodes), lower them as literal NOOPs for
27662 // the time being.
27663 SmallVector<SDValue, 2> Ops;
27664
27665 Ops.push_back(Op.getOperand(0));
27666 if (Op->getGluedNode())
27667 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
27668
27669 SDLoc OpDL(Op);
27670 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
27671 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
27672
27673 return NOOP;
27674}
27675
27676SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
27677 RTLIB::Libcall Call) const {
27678 SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
27679 MakeLibCallOptions CallOptions;
27680 return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first;
27681}
27682
27683/// Provide custom lowering hooks for some operations.
27684SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
27685 switch (Op.getOpcode()) {
27686   default: llvm_unreachable("Should not custom lower this!");
27687 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
27688 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
27689 return LowerCMP_SWAP(Op, Subtarget, DAG);
27690 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
27691 case ISD::ATOMIC_LOAD_ADD:
27692 case ISD::ATOMIC_LOAD_SUB:
27693 case ISD::ATOMIC_LOAD_OR:
27694 case ISD::ATOMIC_LOAD_XOR:
27695 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
27696 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
27697 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
27698 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
27699 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
27700 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
27701 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
27702 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
27703 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
27704 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
27705 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
27706 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
27707 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
27708 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
27709 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
27710 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
27711 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
27712 case ISD::SHL_PARTS:
27713 case ISD::SRA_PARTS:
27714 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
27715 case ISD::FSHL:
27716 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
27717 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
27718 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
27719 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
27720 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
27721 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
27722 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
27723 case ISD::ZERO_EXTEND_VECTOR_INREG:
27724 case ISD::SIGN_EXTEND_VECTOR_INREG:
27725 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
27726 case ISD::FP_TO_SINT:
27727 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
27728 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
27729 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
27730 case ISD::STRICT_FP_ROUND: return LowerSTRICT_FP_ROUND(Op, DAG);
27731 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
27732 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
27733 case ISD::FADD:
27734 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
27735 case ISD::FMUL: return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
27736 case ISD::FDIV: return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
27737 case ISD::FABS:
27738 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
27739 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
27740 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
27741 case ISD::SETCC: return LowerSETCC(Op, DAG);
27742 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
27743 case ISD::SELECT: return LowerSELECT(Op, DAG);
27744 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
27745 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
27746 case ISD::VASTART: return LowerVASTART(Op, DAG);
27747 case ISD::VAARG: return LowerVAARG(Op, DAG);
27748 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
27749 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
27750 case ISD::INTRINSIC_VOID:
27751 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
27752 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
27753 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
27754 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
27755 case ISD::FRAME_TO_ARGS_OFFSET:
27756 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
27757 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
27758 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
27759 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
27760 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
27761 case ISD::EH_SJLJ_SETUP_DISPATCH:
27762 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
27763 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
27764 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
27765 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
27766 case ISD::CTLZ:
27767 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
27768 case ISD::CTTZ:
27769 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
27770 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
27771 case ISD::MULHS:
27772 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
27773 case ISD::ROTL:
27774 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
27775 case ISD::SRA:
27776 case ISD::SRL:
27777 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
27778 case ISD::SADDO:
27779 case ISD::UADDO:
27780 case ISD::SSUBO:
27781 case ISD::USUBO:
27782 case ISD::SMULO:
27783 case ISD::UMULO: return LowerXALUO(Op, DAG);
27784 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
27785 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
27786 case ISD::ADDCARRY:
27787 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
27788 case ISD::ADD:
27789 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
27790 case ISD::UADDSAT:
27791 case ISD::SADDSAT:
27792 case ISD::USUBSAT:
27793 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
27794 case ISD::SMAX:
27795 case ISD::SMIN:
27796 case ISD::UMAX:
27797 case ISD::UMIN: return LowerMINMAX(Op, DAG);
27798 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
27799 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
27800 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
27801 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
27802 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
27803 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
27804 case ISD::GC_TRANSITION_START:
27805 return LowerGC_TRANSITION_START(Op, DAG);
27806 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
27807 }
27808}
27809
27810/// Places new result values for the node in Results (their number
27811/// and types must exactly match those of the original return values of
27812/// the node), or leaves Results empty, which indicates that the node is not
27813/// to be custom lowered after all.
27814void X86TargetLowering::LowerOperationWrapper(SDNode *N,
27815 SmallVectorImpl<SDValue> &Results,
27816 SelectionDAG &DAG) const {
27817 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
27818
27819 if (!Res.getNode())
27820 return;
27821
27822 // If the original node has one result, take the return value from
27823 // LowerOperation as is. It might not be result number 0.
27824 if (N->getNumValues() == 1) {
27825 Results.push_back(Res);
27826 return;
27827 }
27828
27829 // If the original node has multiple results, then the return node should
27830 // have the same number of results.
27831   assert((N->getNumValues() == Res->getNumValues()) &&
27832          "Lowering returned the wrong number of results!");
27833
27834   // Places new result values based on N's result numbers.
27835 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
27836 Results.push_back(Res.getValue(I));
27837}
27838
27839/// Replace a node with an illegal result type with a new node built out of
27840/// custom code.
27841void X86TargetLowering::ReplaceNodeResults(SDNode *N,
27842 SmallVectorImpl<SDValue>&Results,
27843 SelectionDAG &DAG) const {
27844 SDLoc dl(N);
27845 switch (N->getOpcode()) {
27846 default:
27847#ifndef NDEBUG
27848 dbgs() << "ReplaceNodeResults: ";
27849 N->dump(&DAG);
27850#endif
27851 llvm_unreachable("Do not know how to custom type legalize this operation!")::llvm::llvm_unreachable_internal("Do not know how to custom type legalize this operation!"
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 27851)
;
27852 case ISD::CTPOP: {
27853     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
27854 // Use a v2i64 if possible.
27855 bool NoImplicitFloatOps =
27856 DAG.getMachineFunction().getFunction().hasFnAttribute(
27857 Attribute::NoImplicitFloat);
27858 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
27859 SDValue Wide =
27860 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
27861 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
27862       // The bit count fits in 32 bits; extract it as an i32 and then zero-extend
27863       // to i64. Otherwise we end up extracting bits 63:32 separately.
27864 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
27865 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
27866 DAG.getIntPtrConstant(0, dl));
27867 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
27868 Results.push_back(Wide);
27869 }
27870 return;
27871 }
27872 case ISD::MUL: {
27873 EVT VT = N->getValueType(0);
27874     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
27875            VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
27876 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
27877 // elements are needed.
27878 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
27879 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
27880 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
27881 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
27882 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
27883 unsigned NumConcats = 16 / VT.getVectorNumElements();
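// Illustrative example (assuming VT == MVT::v2i8): MulVT is v2i16, the
// operands are any-extended to v2i16, multiplied, and the product is
// truncated back to v2i8. NumConcats is then 16 / 2 == 8, so Res is
// concatenated with seven undef v2i8 values below to produce the legal
// v16i8 result.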
27884 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
27885 ConcatOps[0] = Res;
27886 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
27887 Results.push_back(Res);
27888 return;
27889 }
27890 case X86ISD::VPMADDWD:
27891 case X86ISD::AVG: {
27892 // Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and
27893 // X86ISD::AVG/VPMADDWD by widening.
27894 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
27895
27896 EVT VT = N->getValueType(0);
27897 EVT InVT = N->getOperand(0).getValueType();
27898 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
27899        "Expected a VT that divides into 128 bits.");
27900 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
27901        "Unexpected type action!");
27902 unsigned NumConcat = 128 / InVT.getSizeInBits();
27903
27904 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
27905 InVT.getVectorElementType(),
27906 NumConcat * InVT.getVectorNumElements());
27907 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
27908 VT.getVectorElementType(),
27909 NumConcat * VT.getVectorNumElements());
27910
27911 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
27912 Ops[0] = N->getOperand(0);
27913 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
27914 Ops[0] = N->getOperand(1);
27915 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
27916
27917 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
27918 Results.push_back(Res);
27919 return;
27920 }
27921 case ISD::ABS: {
27922 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27923 assert(N->getValueType(0) == MVT::i64 &&
27924        "Unexpected type (!= i64) on ABS.");
27925 MVT HalfT = MVT::i32;
27926 SDValue Lo, Hi, Tmp;
27927 SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
27928
27929 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
27930 DAG.getConstant(0, dl, HalfT));
27931 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
27932 DAG.getConstant(1, dl, HalfT));
27933 Tmp = DAG.getNode(
27934 ISD::SRA, dl, HalfT, Hi,
27935 DAG.getConstant(HalfT.getSizeInBits() - 1, dl,
27936 TLI.getShiftAmountTy(HalfT, DAG.getDataLayout())));
27937 Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
27938 Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
27939 SDValue(Lo.getNode(), 1));
27940 Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
27941 Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
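// Illustrative example (assuming the input is -5, i.e. 0xFFFFFFFFFFFFFFFB):
//   Tmp      = Hi >>s 31   = 0xFFFFFFFF (the replicated sign bit)
//   {Hi,Lo} += {Tmp,Tmp}   = 0xFFFFFFFF:0xFFFFFFFA   (x - 1)
//   {Hi,Lo} ^= {Tmp,Tmp}   = 0x00000000:0x00000005   (|x| == 5)
// i.e. the usual abs(x) == (x + (x >> 63)) ^ (x >> 63) identity, split into
// 32-bit halves using UADDO/ADDCARRY for the add.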
27942 Results.push_back(Lo);
27943 Results.push_back(Hi);
27944 return;
27945 }
27946 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
27947 case X86ISD::FMINC:
27948 case X86ISD::FMIN:
27949 case X86ISD::FMAXC:
27950 case X86ISD::FMAX: {
27951 EVT VT = N->getValueType(0);
27952 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
27953 SDValue UNDEF = DAG.getUNDEF(VT);
27954 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
27955 N->getOperand(0), UNDEF);
27956 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
27957 N->getOperand(1), UNDEF);
27958 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
27959 return;
27960 }
27961 case ISD::SDIV:
27962 case ISD::UDIV:
27963 case ISD::SREM:
27964 case ISD::UREM: {
27965 EVT VT = N->getValueType(0);
27966 if (VT.isVector()) {
27967 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
27968        "Unexpected type action!");
27969 // If the RHS is a constant splat vector we can widen this and let the
27970 // division/remainder-by-constant combines optimize it.
27971 // TODO: Can we do something for non-splat?
27972 APInt SplatVal;
27973 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
27974 unsigned NumConcats = 128 / VT.getSizeInBits();
27975 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
27976 Ops0[0] = N->getOperand(0);
27977 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
27978 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
27979 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
27980 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
27981 Results.push_back(Res);
27982 }
27983 return;
27984 }
27985
27986 LLVM_FALLTHROUGH;
27987 }
27988 case ISD::SDIVREM:
27989 case ISD::UDIVREM: {
27990 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
27991 Results.push_back(V);
27992 return;
27993 }
27994 case ISD::TRUNCATE: {
27995 MVT VT = N->getSimpleValueType(0);
27996 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
27997 return;
27998
27999 // The generic legalizer will try to widen the input type to the same
28000 // number of elements as the widened result type. But this isn't always
28001 // the best choice, so do some custom legalization to avoid certain cases.
28002 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
28003 SDValue In = N->getOperand(0);
28004 EVT InVT = In.getValueType();
28005
28006 unsigned InBits = InVT.getSizeInBits();
28007 if (128 % InBits == 0) {
28008 // 128-bit and smaller inputs should avoid the truncate altogether and
28009 // just use a build_vector that will become a shuffle.
28010 // TODO: Widen and use a shuffle directly?
28011 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
28012 EVT EltVT = VT.getVectorElementType();
28013 unsigned WidenNumElts = WidenVT.getVectorNumElements();
28014 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
28015 // Use the original element count so we don't do more scalar opts than
28016 // necessary.
28017 unsigned MinElts = VT.getVectorNumElements();
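// Illustrative example (assuming a v2i64 -> v2i8 truncate): WidenVT is
// v16i8, so Ops holds 16 elements. Only the first MinElts == 2 lanes get
// the extracted-and-truncated i8 values; the remaining 14 lanes stay undef
// and the build_vector below becomes a shuffle.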
28018 for (unsigned i=0; i < MinElts; ++i) {
28019 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
28020 DAG.getIntPtrConstant(i, dl));
28021 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
28022 }
28023 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
28024 return;
28025 }
28026 // With AVX512 there are some cases that can use a target-specific
28027 // truncate node to go from 256/512 bits to fewer than 128 bits, with zeros
28028 // in the upper elements of the 128-bit result.
28029 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
28030 // We can use VTRUNC directly for 256-bit inputs with VLX, or for any 512-bit input.
28031 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
28032 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
28033 return;
28034 }
28035 // There's one case we can widen to 512 bits and use VTRUNC.
28036 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
28037 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
28038 DAG.getUNDEF(MVT::v4i64));
28039 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
28040 return;
28041 }
28042 }
28043 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
28044 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
28045 isTypeLegal(MVT::v4i64)) {
28046 // The input needs to be split and the output needs to be widened. Use two
28047 // VTRUNCs, and shuffle their results together into the wider type.
28048 SDValue Lo, Hi;
28049 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
28050
28051 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
28052 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
28053 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
28054 { 0, 1, 2, 3, 16, 17, 18, 19,
28055 -1, -1, -1, -1, -1, -1, -1, -1 });
28056 Results.push_back(Res);
28057 return;
28058 }
28059
28060 return;
28061 }
28062 case ISD::ANY_EXTEND:
28063 // Right now, only MVT::v8i8 has Custom action for an illegal type.
28064 // It's intended to custom handle the input type.
28065 assert(N->getValueType(0) == MVT::v8i8 &&
28066        "Do not know how to legalize this Node");
28067 return;
28068 case ISD::SIGN_EXTEND:
28069 case ISD::ZERO_EXTEND: {
28070 EVT VT = N->getValueType(0);
28071 SDValue In = N->getOperand(0);
28072 EVT InVT = In.getValueType();
28073 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
28074 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
28075 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
28076        "Unexpected type action!");
28077 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
28078 // Custom split this so we can extend i8/i16->i32 in-vector. This is better
28079 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
28080 // sra, followed by extending from i32 to i64 using pcmpgt. By custom
28081 // splitting we allow the sra from the extend to i32 to be shared by the split.
28082 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
28083
28084 // Fill a vector with sign bits for each element.
28085 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
28086 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
28087
28088 // Create an unpackl and unpackh to interleave the sign bits then bitcast
28089 // to v2i64.
28090 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
28091 {0, 4, 1, 5});
28092 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
28093 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
28094 {2, 6, 3, 7});
28095 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
28096
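// Illustrative example (assuming In == [a, b, c, d] after the v4i32
// sign_extend above): SignBits is [sa, sb, sc, sd], where each sX is 0 or
// -1 depending on the sign of the element. The {0,4,1,5} shuffle interleaves
// to [a, sa, b, sb], which read as a little-endian v2i64 is exactly sext(a)
// and sext(b); the {2,6,3,7} shuffle does the same for c and d in Hi.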
28097 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28098 Results.push_back(Res);
28099 return;
28100 }
28101
28102 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
28103 if (!InVT.is128BitVector()) {
28104 // Not a 128 bit vector, but maybe type legalization will promote
28105 // it to 128 bits.
28106 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
28107 return;
28108 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
28109 if (!InVT.is128BitVector())
28110 return;
28111
28112 // Promote the input to 128 bits. Type legalization will turn this into
28113 // zext_inreg/sext_inreg.
28114 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
28115 }
28116
28117 // Perform custom splitting instead of the two stage extend we would get
28118 // by default.
28119 EVT LoVT, HiVT;
28120 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
28121 assert(isTypeLegal(LoVT) && "Split VT not legal?");
28122
28123 SDValue Lo = getExtendInVec(N->getOpcode(), dl, LoVT, In, DAG);
28124
28125 // We need to shift the input over by half the number of elements.
28126 unsigned NumElts = InVT.getVectorNumElements();
28127 unsigned HalfNumElts = NumElts / 2;
28128 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
28129 for (unsigned i = 0; i != HalfNumElts; ++i)
28130 ShufMask[i] = i + HalfNumElts;
28131
28132 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
28133 Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG);
28134
28135 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28136 Results.push_back(Res);
28137 }
28138 return;
28139 }
28140 case ISD::FP_TO_SINT:
28141 case ISD::FP_TO_UINT: {
28142 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
28143 EVT VT = N->getValueType(0);
28144 SDValue Src = N->getOperand(0);
28145 EVT SrcVT = Src.getValueType();
28146
28147 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
28148 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
28149        "Unexpected type action!");
28150
28151 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
28152 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
28153 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
28154 VT.getVectorNumElements());
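// Illustrative example (assuming a v4f32 -> v4i16 FP_TO_SINT): NewEltWidth
// is min(128 / 4, 32) == 32, so PromoteVT is v4i32. Below, the conversion is
// done as v4i32, an AssertSext tags the result as fitting in 16 bits, it is
// truncated back to v4i16, and then concatenated with one undef v4i16
// (NumConcats == 128 / 64 == 2) to form a legal v8i16.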
28155 SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
28156
28157 // Preserve what we know about the size of the original result. Except
28158 // when the result is v2i32 since we can't widen the assert.
28159 if (PromoteVT != MVT::v2i32)
28160 Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
28161 : ISD::AssertSext,
28162 dl, PromoteVT, Res,
28163 DAG.getValueType(VT.getVectorElementType()));
28164
28165 // Truncate back to the original width.
28166 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
28167
28168 // Now widen to 128 bits.
28169 unsigned NumConcats = 128 / VT.getSizeInBits();
28170 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
28171 VT.getVectorNumElements() * NumConcats);
28172 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
28173 ConcatOps[0] = Res;
28174 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
28175 Results.push_back(Res);
28176 return;
28177 }
28178
28179
28180 if (VT == MVT::v2i32) {
28181 assert((IsSigned || Subtarget.hasAVX512()) &&
28182        "Can only handle signed conversion without AVX512");
28183 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
28184 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
28185        "Unexpected type action!");
28186 if (Src.getValueType() == MVT::v2f64) {
28187 if (!IsSigned && !Subtarget.hasVLX()) {
28188 // If we have VLX we can emit a target specific FP_TO_UINT node,
28189 // otherwise we can defer to the generic legalizer which will widen
28190 // the input as well. This will be further widened during op
28191 // legalization to v8i32<-v8f64.
28192 return;
28193 }
28194 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
28195 SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
28196 Results.push_back(Res);
28197 return;
28198 }
28199
28200 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
28201 // so early out here.
28202 return;
28203 }
28204
28205 assert(!VT.isVector() && "Vectors should have been handled above!");
28206
28207 if (Subtarget.hasDQI() && VT == MVT::i64 &&
28208 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
28209 assert(!Subtarget.is64Bit() && "i64 should be legal");
28210 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
28211 // Using a 256-bit input here to guarantee 128-bit input for f32 case.
28212 // TODO: Use 128-bit vectors for f64 case?
28213 // TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI.
28214 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
28215 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts);
28216
28217 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
28218 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
28219 DAG.getConstantFP(0.0, dl, VecInVT), Src,
28220 ZeroIdx);
28221 Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res);
28222 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
28223 Results.push_back(Res);
28224 return;
28225 }
28226
28227 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned))
28228 Results.push_back(V);
28229 return;
28230 }
28231 case ISD::SINT_TO_FP: {
28232 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
28233 SDValue Src = N->getOperand(0);
28234 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
28235 return;
28236 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
28237 return;
28238 }
28239 case ISD::UINT_TO_FP: {
28240 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
28241 EVT VT = N->getValueType(0);
28242 if (VT != MVT::v2f32)
28243 return;
28244 SDValue Src = N->getOperand(0);
28245 EVT SrcVT = Src.getValueType();
28246 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
28247 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
28248 return;
28249 }
28250 if (SrcVT != MVT::v2i32)
28251 return;
28252 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
28253 SDValue VBias =
28254 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
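// Illustrative note (standard double-precision bias trick): the constant
// 0x4330000000000000 is the double 2^52. OR-ing the zero-extended 32-bit
// value into its low mantissa bits yields a double whose value is exactly
// 2^52 + x, so the FSUB below recovers x exactly as v2f64 before the final
// round down to v4f32.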
28255 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
28256 DAG.getBitcast(MVT::v2i64, VBias));
28257 Or = DAG.getBitcast(MVT::v2f64, Or);
28258 // TODO: Are there any fast-math-flags to propagate here?
28259 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
28260 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
28261 return;
28262 }
28263 case ISD::FP_ROUND: {
28264 if (!isTypeLegal(N->getOperand(0).getValueType()))
28265 return;
28266 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
28267 Results.push_back(V);
28268 return;
28269 }
28270 case ISD::FP_EXTEND: {
28271 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
28272 // No other ValueType for FP_EXTEND should reach this point.
28273 assert(N->getValueType(0) == MVT::v2f32 &&
28274        "Do not know how to legalize this Node");
28275 return;
28276 }
28277 case ISD::INTRINSIC_W_CHAIN: {
28278 unsigned IntNo = N->getConstantOperandVal(1);
28279 switch (IntNo) {
28280 default : llvm_unreachable("Do not know how to custom type "
28281                            "legalize this intrinsic operation!");
28282 case Intrinsic::x86_rdtsc:
28283 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
28284 Results);
28285 case Intrinsic::x86_rdtscp:
28286 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
28287 Results);
28288 case Intrinsic::x86_rdpmc:
28289 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
28290 Results);
28291 return;
28292 case Intrinsic::x86_xgetbv:
28293 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
28294 Results);
28295 return;
28296 }
28297 }
28298 case ISD::READCYCLECOUNTER: {
28299 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
28300 }
28301 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
28302 EVT T = N->getValueType(0);
28303 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
28304 bool Regs64bit = T == MVT::i128;
28305 assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
28306        "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
28307 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
28308 SDValue cpInL, cpInH;
28309 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
28310 DAG.getConstant(0, dl, HalfT));
28311 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
28312 DAG.getConstant(1, dl, HalfT));
28313 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
28314 Regs64bit ? X86::RAX : X86::EAX,
28315 cpInL, SDValue());
28316 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
28317 Regs64bit ? X86::RDX : X86::EDX,
28318 cpInH, cpInL.getValue(1));
28319 SDValue swapInL, swapInH;
28320 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
28321 DAG.getConstant(0, dl, HalfT));
28322 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
28323 DAG.getConstant(1, dl, HalfT));
28324 swapInH =
28325 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
28326 swapInH, cpInH.getValue(1));
28327 // If the current function needs the base pointer, RBX,
28328 // we shouldn't use cmpxchg directly.
28329 // Indeed, the lowering of that instruction will clobber
28330 // that register, and since RBX will be a reserved register
28331 // the register allocator will not make sure its value is
28332 // properly saved and restored around this live range.
28333 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
28334 SDValue Result;
28335 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
28336 Register BasePtr = TRI->getBaseRegister();
28337 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
28338 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
28339 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
28340 // ISel prefers the LCMPXCHG64 variant.
28341 // If that assert breaks, that means it is not the case anymore,
28342 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
28343 // not just EBX. This is a matter of accepting i64 input for that
28344 // pseudo, and restoring into the register of the right width
28345 // in the expand pseudo. Everything else should just work.
28346 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
28347        "Saving only half of the RBX");
28348 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
28349 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
28350 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
28351 Regs64bit ? X86::RBX : X86::EBX,
28352 HalfT, swapInH.getValue(1));
28353 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
28354 RBXSave,
28355 /*Glue*/ RBXSave.getValue(2)};
28356 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
28357 } else {
28358 unsigned Opcode =
28359 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
28360 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
28361 Regs64bit ? X86::RBX : X86::EBX, swapInL,
28362 swapInH.getValue(1));
28363 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
28364 swapInL.getValue(1)};
28365 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
28366 }
28367 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
28368 Regs64bit ? X86::RAX : X86::EAX,
28369 HalfT, Result.getValue(1));
28370 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
28371 Regs64bit ? X86::RDX : X86::EDX,
28372 HalfT, cpOutL.getValue(2));
28373 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
28374
28375 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
28376 MVT::i32, cpOutH.getValue(2));
28377 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
28378 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
28379
28380 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
28381 Results.push_back(Success);
28382 Results.push_back(EFLAGS.getValue(1));
28383 return;
28384 }
28385 case ISD::ATOMIC_LOAD: {
28386 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
28387 bool NoImplicitFloatOps =
28388 DAG.getMachineFunction().getFunction().hasFnAttribute(
28389 Attribute::NoImplicitFloat);
28390 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
28391 auto *Node = cast<AtomicSDNode>(N);
28392 if (Subtarget.hasSSE2()) {
28393 // Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the
28394 // lower 64-bits.
28395 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
28396 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
28397 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
28398 MVT::i64, Node->getMemOperand());
28399 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
28400 DAG.getIntPtrConstant(0, dl));
28401 Results.push_back(Res);
28402 Results.push_back(Ld.getValue(1));
28403 return;
28404 }
28405 if (Subtarget.hasX87()) {
28406 // First load this into an 80-bit X87 register. This will put the whole
28407 // integer into the significand.
28408 // FIXME: Do we need to glue? See FIXME comment in BuildFILD.
28409 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other, MVT::Glue);
28410 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
28411 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD_FLAG,
28412 dl, Tys, Ops, MVT::i64,
28413 Node->getMemOperand());
28414 SDValue Chain = Result.getValue(1);
28415 SDValue InFlag = Result.getValue(2);
28416
28417 // Now store the X87 register to a stack temporary and convert to i64.
28418 // This store is not atomic and doesn't need to be.
28419 // FIXME: We don't need a stack temporary if the result of the load
28420 // is already being stored. We could just directly store there.
28421 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
28422 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28423 MachinePointerInfo MPI =
28424 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28425 SDValue StoreOps[] = { Chain, Result, StackPtr, InFlag };
28426 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl,
28427 DAG.getVTList(MVT::Other), StoreOps,
28428 MVT::i64, MPI, 0 /*Align*/,
28429 MachineMemOperand::MOStore);
28430
28431 // Finally load the value back from the stack temporary and return it.
28432 // This load is not atomic and doesn't need to be.
28433 // This load will be further type legalized.
28434 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
28435 Results.push_back(Result);
28436 Results.push_back(Result.getValue(1));
28437 return;
28438 }
28439 }
28440 // TODO: Use MOVLPS when SSE1 is available?
28441 // Delegate to generic TypeLegalization. Situations we can really handle
28442 // should have already been dealt with by AtomicExpandPass.cpp.
28443 break;
28444 }
28445 case ISD::ATOMIC_SWAP:
28446 case ISD::ATOMIC_LOAD_ADD:
28447 case ISD::ATOMIC_LOAD_SUB:
28448 case ISD::ATOMIC_LOAD_AND:
28449 case ISD::ATOMIC_LOAD_OR:
28450 case ISD::ATOMIC_LOAD_XOR:
28451 case ISD::ATOMIC_LOAD_NAND:
28452 case ISD::ATOMIC_LOAD_MIN:
28453 case ISD::ATOMIC_LOAD_MAX:
28454 case ISD::ATOMIC_LOAD_UMIN:
28455 case ISD::ATOMIC_LOAD_UMAX:
28456 // Delegate to generic TypeLegalization. Situations we can really handle
28457 // should have already been dealt with by AtomicExpandPass.cpp.
28458 break;
28459
28460 case ISD::BITCAST: {
28461 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
28462 EVT DstVT = N->getValueType(0);
28463 EVT SrcVT = N->getOperand(0).getValueType();
28464
28465 // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target
28466 // we can split using the k-register rather than memory.
28467 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
28468 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
28469 SDValue Lo, Hi;
28470 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
28471 Lo = DAG.getBitcast(MVT::i32, Lo);
28472 Hi = DAG.getBitcast(MVT::i32, Hi);
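// Illustrative note: the v64i1 mask value is split into two v32i1 halves,
// each half is bitcast to an i32 in a GPR, and the BUILD_PAIR below glues
// them back together into the i64 result (low half in the low 32 bits).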
28473 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
28474 Results.push_back(Res);
28475 return;
28476 }
28477
28478 // Custom splitting for BWI types when AVX512F is available but BWI isn't.
28479 if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
28480 SrcVT.isVector() && isTypeLegal(SrcVT)) {
28481 SDValue Lo, Hi;
28482 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
28483 MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
28484 Lo = DAG.getBitcast(CastVT, Lo);
28485 Hi = DAG.getBitcast(CastVT, Hi);
28486 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
28487 Results.push_back(Res);
28488 return;
28489 }
28490
28491 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
28492 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
28493        "Unexpected type action!");
28494 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
28495 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0));
28496 Results.push_back(Res);
28497 return;
28498 }
28499
28500 return;
28501 }
28502 case ISD::MGATHER: {
28503 EVT VT = N->getValueType(0);
28504 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
28505 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
28506 auto *Gather = cast<MaskedGatherSDNode>(N);
28507 SDValue Index = Gather->getIndex();
28508 if (Index.getValueType() != MVT::v2i64)
28509 return;
28510 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
28511        "Unexpected type action!");
28512 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
28513 SDValue Mask = Gather->getMask();
28514 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
28515 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
28516 Gather->getPassThru(),
28517 DAG.getUNDEF(VT));
28518 if (!Subtarget.hasVLX()) {
28519 // We need to widen the mask, but the instruction will only use 2
28520 // of its elements. So we can use undef.
28521 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
28522 DAG.getUNDEF(MVT::v2i1));
28523 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
28524 }
28525 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
28526 Gather->getBasePtr(), Index, Gather->getScale() };
28527 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
28528 DAG.getVTList(WideVT, Mask.getValueType(), MVT::Other), Ops, dl,
28529 Gather->getMemoryVT(), Gather->getMemOperand());
28530 Results.push_back(Res);
28531 Results.push_back(Res.getValue(2));
28532 return;
28533 }
28534 return;
28535 }
28536 case ISD::LOAD: {
28537 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
28538 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
28539 // cast since type legalization will try to use an i64 load.
28540 MVT VT = N->getSimpleValueType(0);
28541 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
28542 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
28543        "Unexpected type action!");
28544 if (!ISD::isNON_EXTLoad(N))
28545 return;
28546 auto *Ld = cast<LoadSDNode>(N);
28547 if (Subtarget.hasSSE2()) {
28548 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
28549 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
28550 Ld->getPointerInfo(), Ld->getAlignment(),
28551 Ld->getMemOperand()->getFlags());
28552 SDValue Chain = Res.getValue(1);
28553 MVT VecVT = MVT::getVectorVT(LdVT, 2);
28554 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
28555 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
28556 Res = DAG.getBitcast(WideVT, Res);
28557 Results.push_back(Res);
28558 Results.push_back(Chain);
28559 return;
28560 }
28561 assert(Subtarget.hasSSE1() && "Expected SSE");
28562 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
28563 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
28564 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
28565 MVT::i64, Ld->getMemOperand());
28566 Results.push_back(Res);
28567 Results.push_back(Res.getValue(1));
28568 return;
28569 }
28570 }
28571}
28572
28573const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
28574 switch ((X86ISD::NodeType)Opcode) {
28575 case X86ISD::FIRST_NUMBER: break;
28576 case X86ISD::BSF: return "X86ISD::BSF";
28577 case X86ISD::BSR: return "X86ISD::BSR";
28578 case X86ISD::SHLD: return "X86ISD::SHLD";
28579 case X86ISD::SHRD: return "X86ISD::SHRD";
28580 case X86ISD::FAND: return "X86ISD::FAND";
28581 case X86ISD::FANDN: return "X86ISD::FANDN";
28582 case X86ISD::FOR: return "X86ISD::FOR";
28583 case X86ISD::FXOR: return "X86ISD::FXOR";
28584 case X86ISD::FILD: return "X86ISD::FILD";
28585 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
28586 case X86ISD::FIST: return "X86ISD::FIST";
28587 case X86ISD::FP_TO_INT_IN_MEM: return "X86ISD::FP_TO_INT_IN_MEM";
28588 case X86ISD::FLD: return "X86ISD::FLD";
28589 case X86ISD::FST: return "X86ISD::FST";
28590 case X86ISD::CALL: return "X86ISD::CALL";
28591 case X86ISD::BT: return "X86ISD::BT";
28592 case X86ISD::CMP: return "X86ISD::CMP";
28593 case X86ISD::COMI: return "X86ISD::COMI";
28594 case X86ISD::UCOMI: return "X86ISD::UCOMI";
28595 case X86ISD::CMPM: return "X86ISD::CMPM";
28596 case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE";
28597 case X86ISD::SETCC: return "X86ISD::SETCC";
28598 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
28599 case X86ISD::FSETCC: return "X86ISD::FSETCC";
28600 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
28601 case X86ISD::FSETCCM_SAE: return "X86ISD::FSETCCM_SAE";
28602 case X86ISD::CMOV: return "X86ISD::CMOV";
28603 case X86ISD::BRCOND: return "X86ISD::BRCOND";
28604 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
28605 case X86ISD::IRET: return "X86ISD::IRET";
28606 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
28607 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
28608 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
28609 case X86ISD::Wrapper: return "X86ISD::Wrapper";
28610 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
28611 case X86ISD::MOVQ2DQ: return "X86ISD::MOVQ2DQ";
28612 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
28613 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
28614 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
28615 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
28616 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
28617 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
28618 case X86ISD::PINSRB: return "X86ISD::PINSRB";
28619 case X86ISD::PINSRW: return "X86ISD::PINSRW";
28620 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
28621 case X86ISD::ANDNP: return "X86ISD::ANDNP";
28622 case X86ISD::BLENDI: return "X86ISD::BLENDI";
28623 case X86ISD::BLENDV: return "X86ISD::BLENDV";
28624 case X86ISD::HADD: return "X86ISD::HADD";
28625 case X86ISD::HSUB: return "X86ISD::HSUB";
28626 case X86ISD::FHADD: return "X86ISD::FHADD";
28627 case X86ISD::FHSUB: return "X86ISD::FHSUB";
28628 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
28629 case X86ISD::FMAX: return "X86ISD::FMAX";
28630 case X86ISD::FMAXS: return "X86ISD::FMAXS";
28631 case X86ISD::FMAX_SAE: return "X86ISD::FMAX_SAE";
28632 case X86ISD::FMAXS_SAE: return "X86ISD::FMAXS_SAE";
28633 case X86ISD::FMIN: return "X86ISD::FMIN";
28634 case X86ISD::FMINS: return "X86ISD::FMINS";
28635 case X86ISD::FMIN_SAE: return "X86ISD::FMIN_SAE";
28636 case X86ISD::FMINS_SAE: return "X86ISD::FMINS_SAE";
28637 case X86ISD::FMAXC: return "X86ISD::FMAXC";
28638 case X86ISD::FMINC: return "X86ISD::FMINC";
28639 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
28640 case X86ISD::FRCP: return "X86ISD::FRCP";
28641 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
28642 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
28643 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
28644 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
28645 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
28646 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
28647 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
28648 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
28649 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
28650 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
28651 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
28652 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
28653 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
28654 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
28655 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
28656 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
28657 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
28658 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
28659 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
28660 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
28661 case X86ISD::LADD: return "X86ISD::LADD";
28662 case X86ISD::LSUB: return "X86ISD::LSUB";
28663 case X86ISD::LOR: return "X86ISD::LOR";
28664 case X86ISD::LXOR: return "X86ISD::LXOR";
28665 case X86ISD::LAND: return "X86ISD::LAND";
28666 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
28667 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
28668 case X86ISD::VEXTRACT_STORE: return "X86ISD::VEXTRACT_STORE";
28669 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
28670 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
28671 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
28672 case X86ISD::VMTRUNC: return "X86ISD::VMTRUNC";
28673 case X86ISD::VMTRUNCS: return "X86ISD::VMTRUNCS";
28674 case X86ISD::VMTRUNCUS: return "X86ISD::VMTRUNCUS";
28675 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
28676 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
28677 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
28678 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
28679 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
28680 case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE";
28681 case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS";
28682 case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE";
28683 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
28684 case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND";
28685 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
28686 case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS";
28687 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
28688 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
28689 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
28690 case X86ISD::VSHL: return "X86ISD::VSHL";
28691 case X86ISD::VSRL: return "X86ISD::VSRL";
28692 case X86ISD::VSRA: return "X86ISD::VSRA";
28693 case X86ISD::VSHLI: return "X86ISD::VSHLI";
28694 case X86ISD::VSRLI: return "X86ISD::VSRLI";
28695 case X86ISD::VSRAI: return "X86ISD::VSRAI";
28696 case X86ISD::VSHLV: return "X86ISD::VSHLV";
28697 case X86ISD::VSRLV: return "X86ISD::VSRLV";
28698 case X86ISD::VSRAV: return "X86ISD::VSRAV";
28699 case X86ISD::VROTLI: return "X86ISD::VROTLI";
28700 case X86ISD::VROTRI: return "X86ISD::VROTRI";
28701 case X86ISD::VPPERM: return "X86ISD::VPPERM";
28702 case X86ISD::CMPP: return "X86ISD::CMPP";
28703 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
28704 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
28705 case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
28706 case X86ISD::ADD: return "X86ISD::ADD";
28707 case X86ISD::SUB: return "X86ISD::SUB";
28708 case X86ISD::ADC: return "X86ISD::ADC";
28709 case X86ISD::SBB: return "X86ISD::SBB";
28710 case X86ISD::SMUL: return "X86ISD::SMUL";
28711 case X86ISD::UMUL: return "X86ISD::UMUL";
28712 case X86ISD::OR: return "X86ISD::OR";
28713 case X86ISD::XOR: return "X86ISD::XOR";
28714 case X86ISD::AND: return "X86ISD::AND";
28715 case X86ISD::BEXTR: return "X86ISD::BEXTR";
28716 case X86ISD::BZHI: return "X86ISD::BZHI";
28717 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
28718 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
28719 case X86ISD::PTEST: return "X86ISD::PTEST";
28720 case X86ISD::TESTP: return "X86ISD::TESTP";
28721 case X86ISD::KORTEST: return "X86ISD::KORTEST";
28722 case X86ISD::KTEST: return "X86ISD::KTEST";
28723 case X86ISD::KADD: return "X86ISD::KADD";
28724 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
28725 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
28726 case X86ISD::PACKSS: return "X86ISD::PACKSS";
28727 case X86ISD::PACKUS: return "X86ISD::PACKUS";
28728 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
28729 case X86ISD::VALIGN: return "X86ISD::VALIGN";
28730 case X86ISD::VSHLD: return "X86ISD::VSHLD";
28731 case X86ISD::VSHRD: return "X86ISD::VSHRD";
28732 case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
28733 case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
28734 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
28735 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
28736 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
28737 case X86ISD::SHUFP: return "X86ISD::SHUFP";
28738 case X86ISD::SHUF128: return "X86ISD::SHUF128";
28739 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
28740 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
28741 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
28742 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
28743 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
28744 case X86ISD::MOVSD: return "X86ISD::MOVSD";
28745 case X86ISD::MOVSS: return "X86ISD::MOVSS";
28746 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
28747 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
28748 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
28749 case X86ISD::VBROADCAST_LOAD: return "X86ISD::VBROADCAST_LOAD";
28750 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
28751 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
28752 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
28753 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
28754 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
28755 case X86ISD::VPERMV: return "X86ISD::VPERMV";
28756 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
28757 case X86ISD::VPERMI: return "X86ISD::VPERMI";
28758 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
28759 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
28760 case X86ISD::VFIXUPIMM_SAE: return "X86ISD::VFIXUPIMM_SAE";
28761 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
28762 case X86ISD::VFIXUPIMMS_SAE: return "X86ISD::VFIXUPIMMS_SAE";
28763 case X86ISD::VRANGE: return "X86ISD::VRANGE";
28764 case X86ISD::VRANGE_SAE: return "X86ISD::VRANGE_SAE";
28765 case X86ISD::VRANGES: return "X86ISD::VRANGES";
28766 case X86ISD::VRANGES_SAE: return "X86ISD::VRANGES_SAE";
28767 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
28768 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
28769 case X86ISD::PSADBW: return "X86ISD::PSADBW";
28770 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
28771 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
28772 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
28773 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
28774 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
28775 case X86ISD::MFENCE: return "X86ISD::MFENCE";
28776 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
28777 case X86ISD::SAHF: return "X86ISD::SAHF";
28778 case X86ISD::RDRAND: return "X86ISD::RDRAND";
28779 case X86ISD::RDSEED: return "X86ISD::RDSEED";
28780 case X86ISD::RDPKRU: return "X86ISD::RDPKRU";
28781 case X86ISD::WRPKRU: return "X86ISD::WRPKRU";
28782 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
28783 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
28784 case X86ISD::VPSHA: return "X86ISD::VPSHA";
28785 case X86ISD::VPSHL: return "X86ISD::VPSHL";
28786 case X86ISD::VPCOM: return "X86ISD::VPCOM";
28787 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
28788 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
28789 case X86ISD::FMSUB: return "X86ISD::FMSUB";
28790 case X86ISD::FNMADD: return "X86ISD::FNMADD";
28791 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
28792 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
28793 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
28794 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
28795 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
28796 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
28797 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
28798 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
28799 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
28800 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
28801 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
28802 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
28803 case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE";
28804 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
28805 case X86ISD::VRNDSCALES_SAE: return "X86ISD::VRNDSCALES_SAE";
28806 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
28807 case X86ISD::VREDUCE_SAE: return "X86ISD::VREDUCE_SAE";
28808 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
28809 case X86ISD::VREDUCES_SAE: return "X86ISD::VREDUCES_SAE";
28810 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
28811 case X86ISD::VGETMANT_SAE: return "X86ISD::VGETMANT_SAE";
28812 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
28813 case X86ISD::VGETMANTS_SAE: return "X86ISD::VGETMANTS_SAE";
28814 case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
28815 case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
28816 case X86ISD::XTEST: return "X86ISD::XTEST";
28817 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
28818 case X86ISD::EXPAND: return "X86ISD::EXPAND";
28819 case X86ISD::SELECTS: return "X86ISD::SELECTS";
28820 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
28821 case X86ISD::RCP14: return "X86ISD::RCP14";
28822 case X86ISD::RCP14S: return "X86ISD::RCP14S";
28823 case X86ISD::RCP28: return "X86ISD::RCP28";
28824 case X86ISD::RCP28_SAE: return "X86ISD::RCP28_SAE";
28825 case X86ISD::RCP28S: return "X86ISD::RCP28S";
28826 case X86ISD::RCP28S_SAE: return "X86ISD::RCP28S_SAE";
28827 case X86ISD::EXP2: return "X86ISD::EXP2";
28828 case X86ISD::EXP2_SAE: return "X86ISD::EXP2_SAE";
28829 case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
28830 case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
28831 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
28832 case X86ISD::RSQRT28_SAE: return "X86ISD::RSQRT28_SAE";
28833 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
28834 case X86ISD::RSQRT28S_SAE: return "X86ISD::RSQRT28S_SAE";
28835 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
28836 case X86ISD::FADDS: return "X86ISD::FADDS";
28837 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
28838 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
28839 case X86ISD::FSUBS: return "X86ISD::FSUBS";
28840 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
28841 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
28842 case X86ISD::FMULS: return "X86ISD::FMULS";
28843 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
28844 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
28845 case X86ISD::FDIVS: return "X86ISD::FDIVS";
28846 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
28847 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
28848 case X86ISD::FSQRTS: return "X86ISD::FSQRTS";
28849 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
28850 case X86ISD::FGETEXP: return "X86ISD::FGETEXP";
28851 case X86ISD::FGETEXP_SAE: return "X86ISD::FGETEXP_SAE";
28852 case X86ISD::FGETEXPS: return "X86ISD::FGETEXPS";
28853 case X86ISD::FGETEXPS_SAE: return "X86ISD::FGETEXPS_SAE";
28854 case X86ISD::SCALEF: return "X86ISD::SCALEF";
28855 case X86ISD::SCALEF_RND: return "X86ISD::SCALEF_RND";
28856 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
28857 case X86ISD::SCALEFS_RND: return "X86ISD::SCALEFS_RND";
28858 case X86ISD::AVG: return "X86ISD::AVG";
28859 case X86ISD::MULHRS: return "X86ISD::MULHRS";
28860 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
28861 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
28862 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
28863 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
28864 case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI";
28865 case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI";
28866 case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE";
28867 case X86ISD::CVTTP2UI_SAE: return "X86ISD::CVTTP2UI_SAE";
28868 case X86ISD::CVTTS2SI: return "X86ISD::CVTTS2SI";
28869 case X86ISD::CVTTS2UI: return "X86ISD::CVTTS2UI";
28870 case X86ISD::CVTTS2SI_SAE: return "X86ISD::CVTTS2SI_SAE";
28871 case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE";
28872 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
28873 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
28874 case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P";
28875 case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P";
28876 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
28877 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
28878 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
28879 case X86ISD::SCALAR_SINT_TO_FP: return "X86ISD::SCALAR_SINT_TO_FP";
28880 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
28881 case X86ISD::SCALAR_UINT_TO_FP: return "X86ISD::SCALAR_UINT_TO_FP";
28882 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
28883 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
28884 case X86ISD::MCVTPS2PH: return "X86ISD::MCVTPS2PH";
28885 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
28886 case X86ISD::CVTPH2PS_SAE: return "X86ISD::CVTPH2PS_SAE";
28887 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
28888 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
28889 case X86ISD::MCVTP2SI: return "X86ISD::MCVTP2SI";
28890 case X86ISD::MCVTP2UI: return "X86ISD::MCVTP2UI";
28891 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
28892 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
28893 case X86ISD::CVTS2SI: return "X86ISD::CVTS2SI";
28894 case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI";
28895 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
28896 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
28897 case X86ISD::CVTNE2PS2BF16: return "X86ISD::CVTNE2PS2BF16";
28898 case X86ISD::CVTNEPS2BF16: return "X86ISD::CVTNEPS2BF16";
28899 case X86ISD::MCVTNEPS2BF16: return "X86ISD::MCVTNEPS2BF16";
28900 case X86ISD::DPBF16PS: return "X86ISD::DPBF16PS";
28901 case X86ISD::LWPINS: return "X86ISD::LWPINS";
28902 case X86ISD::MGATHER: return "X86ISD::MGATHER";
28903 case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
28904 case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
28905 case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
28906 case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
28907 case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
28908 case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
28909 case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
28910 case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
28911 case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
28912 case X86ISD::NT_CALL: return "X86ISD::NT_CALL";
28913 case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND";
28914 case X86ISD::UMWAIT: return "X86ISD::UMWAIT";
28915 case X86ISD::TPAUSE: return "X86ISD::TPAUSE";
28916  case X86ISD::ENQCMD:             return "X86ISD::ENQCMD";
28917  case X86ISD::ENQCMDS:            return "X86ISD::ENQCMDS";
28918 case X86ISD::VP2INTERSECT: return "X86ISD::VP2INTERSECT";
28919 }
28920 return nullptr;
28921}
28922
28923/// Return true if the addressing mode represented by AM is legal for this
28924/// target, for a load/store of the specified type.
28925bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
28926 const AddrMode &AM, Type *Ty,
28927 unsigned AS,
28928 Instruction *I) const {
28929 // X86 supports extremely general addressing modes.
28930 CodeModel::Model M = getTargetMachine().getCodeModel();
28931
28932 // X86 allows a sign-extended 32-bit immediate field as a displacement.
28933 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
28934 return false;
28935
28936 if (AM.BaseGV) {
28937 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
28938
28939 // If a reference to this global requires an extra load, we can't fold it.
28940 if (isGlobalStubReference(GVFlags))
28941 return false;
28942
28943 // If BaseGV requires a register for the PIC base, we cannot also have a
28944 // BaseReg specified.
28945 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
28946 return false;
28947
28948 // If lower 4G is not available, then we must use rip-relative addressing.
28949 if ((M != CodeModel::Small || isPositionIndependent()) &&
28950 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
28951 return false;
28952 }
28953
28954 switch (AM.Scale) {
28955 case 0:
28956 case 1:
28957 case 2:
28958 case 4:
28959 case 8:
28960 // These scales always work.
28961 break;
28962 case 3:
28963 case 5:
28964 case 9:
28965 // These scales are formed with basereg+scalereg. Only accept if there is
28966 // no basereg yet.
28967 if (AM.HasBaseReg)
28968 return false;
28969 break;
28970 default: // Other stuff never works.
28971 return false;
28972 }
28973
28974 return true;
28975}
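// (Added explanatory sketch, not part of the original file.) The scale values
// accepted above map directly onto what the x86 SIB byte / LEA can encode:
//
//   scale 1, 2, 4, 8:   movl (%rdi,%rsi,4), %eax    ; base + index*scale
//   scale 3, 5, 9:      leaq (%rax,%rax,2), %rcx    ; index*2/4/8 plus the same
//                                                   ; register as base, so the
//                                                   ; base slot must be free
//
// A scale such as 6 or 7 has no single-instruction form, which is why the
// default case rejects it.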
28976
28977bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
28978 unsigned Bits = Ty->getScalarSizeInBits();
28979
28980   // 8-bit shifts are always expensive, and the versions with a scalar amount
28981   // aren't noticeably cheaper than those without.
28982 if (Bits == 8)
28983 return false;
28984
28985 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
28986 if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
28987 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
28988 return false;
28989
28990 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
28991 // shifts just as cheap as scalar ones.
28992 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
28993 return false;
28994
28995 // AVX512BW has shifts such as vpsllvw.
28996 if (Subtarget.hasBWI() && Bits == 16)
28997 return false;
28998
28999 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
29000 // fully general vector.
29001 return true;
29002}
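// (Added illustration, hedged.) For example, a v8i32 shift on an AVX2 target
// returns false here because VPSLLVD makes a per-element variable shift as
// cheap as a uniform one, while a v8i16 shift without XOP or AVX-512BW
// returns true: there is no per-element 16-bit variable shift, but PSLLW can
// still take a single scalar count from an XMM register cheaply.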
29003
29004bool X86TargetLowering::isBinOp(unsigned Opcode) const {
29005 switch (Opcode) {
29006 // These are non-commutative binops.
29007 // TODO: Add more X86ISD opcodes once we have test coverage.
29008 case X86ISD::ANDNP:
29009 case X86ISD::PCMPGT:
29010 case X86ISD::FMAX:
29011 case X86ISD::FMIN:
29012 case X86ISD::FANDN:
29013 return true;
29014 }
29015
29016 return TargetLoweringBase::isBinOp(Opcode);
29017}
29018
29019bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
29020 switch (Opcode) {
29021 // TODO: Add more X86ISD opcodes once we have test coverage.
29022 case X86ISD::PCMPEQ:
29023 case X86ISD::PMULDQ:
29024 case X86ISD::PMULUDQ:
29025 case X86ISD::FMAXC:
29026 case X86ISD::FMINC:
29027 case X86ISD::FAND:
29028 case X86ISD::FOR:
29029 case X86ISD::FXOR:
29030 return true;
29031 }
29032
29033 return TargetLoweringBase::isCommutativeBinOp(Opcode);
29034}
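// (Added note, hedged.) The split between the two hooks above follows the
// algebra of the nodes: X86ISD::PCMPEQ commutes because lanewise a == b is
// symmetric, while X86ISD::PCMPGT only appears in isBinOp() since a > b and
// b > a are different predicates. FMAXC/FMINC are listed as commutative
// because they are the variants that ignore the x86 NaN/operand-order rules
// of FMAX/FMIN.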
29035
29036bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
29037 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
29038 return false;
29039 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
29040 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
29041 return NumBits1 > NumBits2;
29042}
29043
29044bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
29045 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
29046 return false;
29047
29048 if (!isTypeLegal(EVT::getEVT(Ty1)))
29049 return false;
29050
29051   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
29052
29053 // Assuming the caller doesn't have a zeroext or signext return parameter,
29054 // truncation all the way down to i1 is valid.
29055 return true;
29056}
29057
29058bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
29059 return isInt<32>(Imm);
29060}
29061
29062bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
29063 // Can also use sub to handle negated immediates.
29064 return isInt<32>(Imm);
29065}
29066
29067bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
29068 return isInt<32>(Imm);
29069}
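// (Added illustration.) All three hooks above share the same isInt<32> test
// because x86-64 ALU, compare and store forms only encode a sign-extended
// 32-bit immediate, e.g.
//   cmpq $-1, %rax              ; legal, fits in a sign-extended imm32
//   cmpq $0x123456789, %rax     ; no such encoding; needs movabsq + cmpq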
29070
29071bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
29072 if (!VT1.isInteger() || !VT2.isInteger())
29073 return false;
29074 unsigned NumBits1 = VT1.getSizeInBits();
29075 unsigned NumBits2 = VT2.getSizeInBits();
29076 return NumBits1 > NumBits2;
29077}
29078
29079bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
29080 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
29081 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
29082}
29083
29084bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
29085 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
29086 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
29087}
29088
29089bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
29090 EVT VT1 = Val.getValueType();
29091 if (isZExtFree(VT1, VT2))
29092 return true;
29093
29094 if (Val.getOpcode() != ISD::LOAD)
29095 return false;
29096
29097 if (!VT1.isSimple() || !VT1.isInteger() ||
29098 !VT2.isSimple() || !VT2.isInteger())
29099 return false;
29100
29101 switch (VT1.getSimpleVT().SimpleTy) {
29102 default: break;
29103 case MVT::i8:
29104 case MVT::i16:
29105 case MVT::i32:
29106 // X86 has 8, 16, and 32-bit zero-extending loads.
29107 return true;
29108 }
29109
29110 return false;
29111}
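// (Added illustration, hedged.) The "free" cases above correspond to loads and
// moves that already clear the upper bits, e.g.
//   movzbl (%rdi), %eax     ; i8  load zero-extends to 32/64 bits
//   movzwl (%rdi), %eax     ; i16 load zero-extends to 32/64 bits
//   movl   (%rdi), %eax     ; i32 load implicitly zeroes bits 63:32
// so no separate zero-extension instruction has to be emitted.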
29112
29113bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
29114 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
29115 return false;
29116
29117 EVT SrcVT = ExtVal.getOperand(0).getValueType();
29118
29119 // There is no extending load for vXi1.
29120 if (SrcVT.getScalarType() == MVT::i1)
29121 return false;
29122
29123 return true;
29124}
29125
29126bool
29127X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
29128 if (!Subtarget.hasAnyFMA())
29129 return false;
29130
29131 VT = VT.getScalarType();
29132
29133 if (!VT.isSimple())
29134 return false;
29135
29136 switch (VT.getSimpleVT().SimpleTy) {
29137 case MVT::f32:
29138 case MVT::f64:
29139 return true;
29140 default:
29141 break;
29142 }
29143
29144 return false;
29145}
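// (Added note, hedged.) With FMA3/FMA4 available, a scalar or vector float/
// double a*b+c can be emitted as a single vfmadd213ss/sd instead of a separate
// multiply and add, so the fused form is both faster and rounds only once;
// for other floating-point types there is no fused instruction, hence the
// false default above.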
29146
29147bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
29148 // i16 instructions are longer (0x66 prefix) and potentially slower.
29149 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
29150}
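// (Added note, hedged.) For example, narrowing "addl $42, %eax" to
// "addw $42, %ax" saves nothing: the 16-bit form needs a 0x66 operand-size
// prefix, and combined with an immediate it becomes a length-changing prefix
// that several Intel front ends pre-decode slowly, so the i32 -> i16
// narrowing is declined above.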
29151
29152/// Targets can use this to indicate that they only support *some*
29153/// VECTOR_SHUFFLE operations, those with specific masks.
29154/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
29155/// are assumed to be legal.
29156bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
29157 if (!VT.isSimple())
29158 return false;
29159
29160 // Not for i1 vectors
29161 if (VT.getSimpleVT().getScalarType() == MVT::i1)
29162 return false;
29163
29164 // Very little shuffling can be done for 64-bit vectors right now.
29165 if (VT.getSimpleVT().getSizeInBits() == 64)
29166 return false;
29167
29168 // We only care that the types being shuffled are legal. The lowering can
29169 // handle any possible shuffle mask that results.
29170 return isTypeLegal(VT.getSimpleVT());
29171}
29172
29173bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
29174 EVT VT) const {
29175 // Don't convert an 'and' into a shuffle that we don't directly support.
29176 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
29177 if (!Subtarget.hasAVX2())
29178 if (VT == MVT::v32i8 || VT == MVT::v16i16)
29179 return false;
29180
29181 // Just delegate to the generic legality, clear masks aren't special.
29182 return isShuffleMaskLegal(Mask, VT);
29183}
29184
29185bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
29186   // If the subtarget is using retpolines, we must not generate jump tables.
29187 if (Subtarget.useRetpolineIndirectBranches())
29188 return false;
29189
29190 // Otherwise, fallback on the generic logic.
29191 return TargetLowering::areJTsAllowed(Fn);
29192}
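// (Added note, hedged.) A jump table is lowered to an indirect branch, which
// under retpoline mitigation has to be funneled through a thunk; the thunked
// form is slow enough that a chain of conditional branches is preferred,
// hence the early return above.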
29193
29194//===----------------------------------------------------------------------===//
29195// X86 Scheduler Hooks
29196//===----------------------------------------------------------------------===//
29197
29198/// Utility function to emit xbegin specifying the start of an RTM region.
29199static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
29200 const TargetInstrInfo *TII) {
29201 DebugLoc DL = MI.getDebugLoc();
29202
29203 const BasicBlock *BB = MBB->getBasicBlock();
29204 MachineFunction::iterator I = ++MBB->getIterator();
29205
29206 // For the v = xbegin(), we generate
29207 //
29208 // thisMBB:
29209   //   xbegin fallMBB
29210 //
29211 // mainMBB:
29212 // s0 = -1
29213 //
29214 // fallBB:
29215 // eax = # XABORT_DEF
29216 // s1 = eax
29217 //
29218 // sinkMBB:
29219 // v = phi(s0/mainBB, s1/fallBB)
29220
29221 MachineBasicBlock *thisMBB = MBB;
29222 MachineFunction *MF = MBB->getParent();
29223 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
29224 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
29225 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
29226 MF->insert(I, mainMBB);
29227 MF->insert(I, fallMBB);
29228 MF->insert(I, sinkMBB);
29229
29230 // Transfer the remainder of BB and its successor edges to sinkMBB.
29231 sinkMBB->splice(sinkMBB->begin(), MBB,
29232 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
29233 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
29234
29235 MachineRegisterInfo &MRI = MF->getRegInfo();
29236 Register DstReg = MI.getOperand(0).getReg();
29237 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
29238 Register mainDstReg = MRI.createVirtualRegister(RC);
29239 Register fallDstReg = MRI.createVirtualRegister(RC);
29240
29241 // thisMBB:
29242 // xbegin fallMBB
29243 // # fallthrough to mainMBB
29244   //  # on abort, control resumes at fallMBB
29245 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
29246 thisMBB->addSuccessor(mainMBB);
29247 thisMBB->addSuccessor(fallMBB);
29248
29249 // mainMBB:
29250 // mainDstReg := -1
29251 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
29252 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
29253 mainMBB->addSuccessor(sinkMBB);
29254
29255 // fallMBB:
29256 // ; pseudo instruction to model hardware's definition from XABORT
29257 // EAX := XABORT_DEF
29258 // fallDstReg := EAX
29259 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
29260 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
29261 .addReg(X86::EAX);
29262 fallMBB->addSuccessor(sinkMBB);
29263
29264 // sinkMBB:
29265 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
29266 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
29267 .addReg(mainDstReg).addMBB(mainMBB)
29268 .addReg(fallDstReg).addMBB(fallMBB);
29269
29270 MI.eraseFromParent();
29271 return sinkMBB;
29272}
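// (Added usage sketch, assuming the usual RTM intrinsic mapping.) The block
// structure built above implements the semantics of
//   unsigned status = _xbegin();
//   if (status == _XBEGIN_STARTED) { /* transactional region */ _xend(); }
// The -1 materialized in mainMBB is exactly _XBEGIN_STARTED (~0u); on an
// abort the hardware resumes at the XBEGIN fallback address with the abort
// code in EAX, which fallMBB copies into fallDstReg for the PHI.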
29273
29274
29275
29276MachineBasicBlock *
29277X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
29278 MachineBasicBlock *MBB) const {
29279 // Emit va_arg instruction on X86-64.
29280
29281 // Operands to this pseudo-instruction:
29282 // 0 ) Output : destination address (reg)
29283 // 1-5) Input : va_list address (addr, i64mem)
29284 // 6 ) ArgSize : Size (in bytes) of vararg type
29285 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
29286 // 8 ) Align : Alignment of type
29287 // 9 ) EFLAGS (implicit-def)
29288
29289   assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
29290 static_assert(X86::AddrNumOperands == 5,
29291 "VAARG_64 assumes 5 address operands");
29292
29293 Register DestReg = MI.getOperand(0).getReg();
29294 MachineOperand &Base = MI.getOperand(1);
29295 MachineOperand &Scale = MI.getOperand(2);
29296 MachineOperand &Index = MI.getOperand(3);
29297 MachineOperand &Disp = MI.getOperand(4);
29298 MachineOperand &Segment = MI.getOperand(5);
29299 unsigned ArgSize = MI.getOperand(6).getImm();
29300 unsigned ArgMode = MI.getOperand(7).getImm();
29301 unsigned Align = MI.getOperand(8).getImm();
29302
29303 MachineFunction *MF = MBB->getParent();
29304
29305 // Memory Reference
29306   assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
29307
29308 MachineMemOperand *OldMMO = MI.memoperands().front();
29309
29310 // Clone the MMO into two separate MMOs for loading and storing
29311 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
29312 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
29313 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
29314 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
29315
29316 // Machine Information
29317 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
29318 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
29319 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
29320 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
29321 DebugLoc DL = MI.getDebugLoc();
29322
29323 // struct va_list {
29324 // i32 gp_offset
29325 // i32 fp_offset
29326 // i64 overflow_area (address)
29327 // i64 reg_save_area (address)
29328 // }
29329 // sizeof(va_list) = 24
29330 // alignment(va_list) = 8
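  // (Added worked example, hedged.) In a variadic callee, va_arg(ap, int) is
  // lowered with ArgMode == 1 and reads/advances gp_offset by 8, while
  // va_arg(ap, double) uses ArgMode == 2 and advances fp_offset by 16; both
  // fall back to overflow_area once the corresponding offset reaches the end
  // of reg_save_area, which is exactly the MaxOffset comparison emitted below.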
29331
29332 unsigned TotalNumIntRegs = 6;
29333 unsigned TotalNumXMMRegs = 8;
29334 bool UseGPOffset = (ArgMode == 1);
29335 bool UseFPOffset = (ArgMode == 2);
29336 unsigned MaxOffset = TotalNumIntRegs * 8 +
29337 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
29338
29339 /* Align ArgSize to a multiple of 8 */
29340 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
29341 bool NeedsAlign = (Align > 8);
29342
29343 MachineBasicBlock *thisMBB = MBB;
29344 MachineBasicBlock *overflowMBB;
29345 MachineBasicBlock *offsetMBB;
29346 MachineBasicBlock *endMBB;
29347
29348 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
29349 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
29350 unsigned OffsetReg = 0;
29351
29352 if (!UseGPOffset && !UseFPOffset) {
29353 // If we only pull from the overflow region, we don't create a branch.
29354 // We don't need to alter control flow.
29355 OffsetDestReg = 0; // unused
29356 OverflowDestReg = DestReg;
29357
29358 offsetMBB = nullptr;
29359 overflowMBB = thisMBB;
29360 endMBB = thisMBB;
29361 } else {
29362 // First emit code to check if gp_offset (or fp_offset) is below the bound.
29363 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
29364 // If not, pull from overflow_area. (branch to overflowMBB)
29365 //
29366 // thisMBB
29367 // | .
29368 // | .
29369 // offsetMBB overflowMBB
29370 // | .
29371 // | .
29372 // endMBB
29373
29374 // Registers for the PHI in endMBB
29375 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
29376 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
29377
29378 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
29379 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
29380 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
29381 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
29382
29383 MachineFunction::iterator MBBIter = ++MBB->getIterator();
29384
29385 // Insert the new basic blocks
29386 MF->insert(MBBIter, offsetMBB);
29387 MF->insert(MBBIter, overflowMBB);
29388 MF->insert(MBBIter, endMBB);
29389
29390 // Transfer the remainder of MBB and its successor edges to endMBB.
29391 endMBB->splice(endMBB->begin(), thisMBB,
29392 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
29393 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
29394
29395 // Make offsetMBB and overflowMBB successors of thisMBB
29396 thisMBB->addSuccessor(offsetMBB);
29397 thisMBB->addSuccessor(overflowMBB);
29398
29399 // endMBB is a successor of both offsetMBB and overflowMBB
29400 offsetMBB->addSuccessor(endMBB);
29401 overflowMBB->addSuccessor(endMBB);
29402
29403 // Load the offset value into a register
29404 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
29405 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
29406 .add(Base)
29407 .add(Scale)
29408 .add(Index)
29409 .addDisp(Disp, UseFPOffset ? 4 : 0)
29410 .add(Segment)
29411 .setMemRefs(LoadOnlyMMO);
29412
29413 // Check if there is enough room left to pull this argument.
29414 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
29415 .addReg(OffsetReg)
29416 .addImm(MaxOffset + 8 - ArgSizeA8);
29417
29418 // Branch to "overflowMBB" if offset >= max
29419 // Fall through to "offsetMBB" otherwise
29420 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
29421 .addMBB(overflowMBB).addImm(X86::COND_AE);
29422 }
29423
29424 // In offsetMBB, emit code to use the reg_save_area.
29425 if (offsetMBB) {
29426     assert(OffsetReg != 0);
29427
29428 // Read the reg_save_area address.
29429 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
29430 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
29431 .add(Base)
29432 .add(Scale)
29433 .add(Index)
29434 .addDisp(Disp, 16)
29435 .add(Segment)
29436 .setMemRefs(LoadOnlyMMO);
29437
29438 // Zero-extend the offset
29439 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
29440 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
29441 .addImm(0)
29442 .addReg(OffsetReg)
29443 .addImm(X86::sub_32bit);
29444
29445 // Add the offset to the reg_save_area to get the final address.
29446 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
29447 .addReg(OffsetReg64)
29448 .addReg(RegSaveReg);
29449
29450 // Compute the offset for the next argument
29451 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
29452 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
29453 .addReg(OffsetReg)
29454 .addImm(UseFPOffset ? 16 : 8);
29455
29456 // Store it back into the va_list.
29457 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
29458 .add(Base)
29459 .add(Scale)
29460 .add(Index)
29461 .addDisp(Disp, UseFPOffset ? 4 : 0)
29462 .add(Segment)
29463 .addReg(NextOffsetReg)
29464 .setMemRefs(StoreOnlyMMO);
29465
29466 // Jump to endMBB
29467 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
29468 .addMBB(endMBB);
29469 }
29470
29471 //
29472 // Emit code to use overflow area
29473 //
29474
29475 // Load the overflow_area address into a register.
29476 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
29477 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
29478 .add(Base)
29479 .add(Scale)
29480 .add(Index)
29481 .addDisp(Disp, 8)
29482 .add(Segment)
29483 .setMemRefs(LoadOnlyMMO);
29484
29485 // If we need to align it, do so. Otherwise, just copy the address
29486 // to OverflowDestReg.
29487 if (NeedsAlign) {
29488 // Align the overflow address
29489     assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
29490 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
29491
29492 // aligned_addr = (addr + (align-1)) & ~(align-1)
29493 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
29494 .addReg(OverflowAddrReg)
29495 .addImm(Align-1);
29496
29497 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
29498 .addReg(TmpReg)
29499 .addImm(~(uint64_t)(Align-1));
29500 } else {
29501 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
29502 .addReg(OverflowAddrReg);
29503 }
29504
29505 // Compute the next overflow address after this argument.
29506 // (the overflow address should be kept 8-byte aligned)
29507 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
29508 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
29509 .addReg(OverflowDestReg)
29510 .addImm(ArgSizeA8);
29511
29512 // Store the new overflow address.
29513 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
29514 .add(Base)
29515 .add(Scale)
29516 .add(Index)
29517 .addDisp(Disp, 8)
29518 .add(Segment)
29519 .addReg(NextAddrReg)
29520 .setMemRefs(StoreOnlyMMO);
29521
29522 // If we branched, emit the PHI to the front of endMBB.
29523 if (offsetMBB) {
29524 BuildMI(*endMBB, endMBB->begin(), DL,
29525 TII->get(X86::PHI), DestReg)
29526 .addReg(OffsetDestReg).addMBB(offsetMBB)
29527 .addReg(OverflowDestReg).addMBB(overflowMBB);
29528 }
29529
29530 // Erase the pseudo instruction
29531 MI.eraseFromParent();
29532
29533 return endMBB;
29534}
29535
29536MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
29537 MachineInstr &MI, MachineBasicBlock *MBB) const {
29538 // Emit code to save XMM registers to the stack. The ABI says that the
29539 // number of registers to save is given in %al, so it's theoretically
29540   // possible to do an indirect jump trick to avoid saving all of them;
29541   // however, this code takes a simpler approach and just executes all
29542 // of the stores if %al is non-zero. It's less code, and it's probably
29543 // easier on the hardware branch predictor, and stores aren't all that
29544 // expensive anyway.
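  // (Added context, hedged sketch of the SysV AMD64 convention assumed here.)
  // A caller of a variadic function reports the number of vector registers it
  // used in %al, e.g.
  //   movb $1, %al          ; one XMM register carries an argument
  //   callq take_varargs
  // so the TEST8rr/JCC_1 pair emitted below skips all eight 16-byte stores in
  // the common case where no FP varargs were passed and %al is zero.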
29545
29546 // Create the new basic blocks. One block contains all the XMM stores,
29547 // and one block is the final destination regardless of whether any
29548 // stores were performed.
29549 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
29550 MachineFunction *F = MBB->getParent();
29551 MachineFunction::iterator MBBIter = ++MBB->getIterator();
29552 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
29553 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
29554 F->insert(MBBIter, XMMSaveMBB);
29555 F->insert(MBBIter, EndMBB);
29556
29557 // Transfer the remainder of MBB and its successor edges to EndMBB.
29558 EndMBB->splice(EndMBB->begin(), MBB,
29559 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
29560 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
29561
29562 // The original block will now fall through to the XMM save block.
29563 MBB->addSuccessor(XMMSaveMBB);
29564 // The XMMSaveMBB will fall through to the end block.
29565 XMMSaveMBB->addSuccessor(EndMBB);
29566
29567 // Now add the instructions.
29568 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
29569 DebugLoc DL = MI.getDebugLoc();
29570
29571 Register CountReg = MI.getOperand(0).getReg();
29572 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
29573 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
29574
29575 if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
29576 // If %al is 0, branch around the XMM save block.
29577 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
29578 BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E);
29579 MBB->addSuccessor(EndMBB);
29580 }
29581
29582 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
29583 // that was just emitted, but clearly shouldn't be "saved".
29584   assert((MI.getNumOperands() <= 3 ||
29585           !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
29586           MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
29587          "Expected last argument to be EFLAGS");
29588 unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
29589 // In the XMM save block, save all the XMM argument registers.
29590 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
29591 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
29592 MachineMemOperand *MMO = F->getMachineMemOperand(
29593 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
29594 MachineMemOperand::MOStore,
29595 /*Size=*/16, /*Align=*/16);
29596 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
29597 .addFrameIndex(RegSaveFrameIndex)
29598 .addImm(/*Scale=*/1)
29599 .addReg(/*IndexReg=*/0)
29600 .addImm(/*Disp=*/Offset)
29601 .addReg(/*Segment=*/0)
29602 .addReg(MI.getOperand(i).getReg())
29603 .addMemOperand(MMO);
29604 }
29605
29606 MI.eraseFromParent(); // The pseudo instruction is gone now.
29607
29608 return EndMBB;
29609}
29610
29611// The EFLAGS operand of SelectItr might be missing a kill marker
29612// because there were multiple uses of EFLAGS, and ISel didn't know
29613// which to mark. Figure out whether SelectItr should have had a
29614// kill marker, and set it if it should. Returns the correct kill
29615// marker value.
29616static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
29617 MachineBasicBlock* BB,
29618 const TargetRegisterInfo* TRI) {
29619 // Scan forward through BB for a use/def of EFLAGS.
29620 MachineBasicBlock::iterator miI(std::next(SelectItr));
29621 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
29622 const MachineInstr& mi = *miI;
29623 if (mi.readsRegister(X86::EFLAGS))
29624 return false;
29625 if (mi.definesRegister(X86::EFLAGS))
29626 break; // Should have kill-flag - update below.
29627 }
29628
29629 // If we hit the end of the block, check whether EFLAGS is live into a
29630 // successor.
29631 if (miI == BB->end()) {
29632 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
29633 sEnd = BB->succ_end();
29634 sItr != sEnd; ++sItr) {
29635 MachineBasicBlock* succ = *sItr;
29636 if (succ->isLiveIn(X86::EFLAGS))
29637 return false;
29638 }
29639 }
29640
29641 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
29642 // out. SelectMI should have a kill flag on EFLAGS.
29643 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
29644 return true;
29645}
29646
29647// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
29648 // together with other CMOV pseudo-opcodes into a single basic block with a
29649 // conditional jump around it.
29650static bool isCMOVPseudo(MachineInstr &MI) {
29651 switch (MI.getOpcode()) {
29652 case X86::CMOV_FR32:
29653 case X86::CMOV_FR32X:
29654 case X86::CMOV_FR64:
29655 case X86::CMOV_FR64X:
29656 case X86::CMOV_GR8:
29657 case X86::CMOV_GR16:
29658 case X86::CMOV_GR32:
29659 case X86::CMOV_RFP32:
29660 case X86::CMOV_RFP64:
29661 case X86::CMOV_RFP80:
29662 case X86::CMOV_VR128:
29663 case X86::CMOV_VR128X:
29664 case X86::CMOV_VR256:
29665 case X86::CMOV_VR256X:
29666 case X86::CMOV_VR512:
29667 case X86::CMOV_VK2:
29668 case X86::CMOV_VK4:
29669 case X86::CMOV_VK8:
29670 case X86::CMOV_VK16:
29671 case X86::CMOV_VK32:
29672 case X86::CMOV_VK64:
29673 return true;
29674
29675 default:
29676 return false;
29677 }
29678}
29679
29680// Helper function, which inserts PHI functions into SinkMBB:
29681// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
29682// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
29683// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
29684// the last PHI function inserted.
29685static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
29686 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
29687 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
29688 MachineBasicBlock *SinkMBB) {
29689 MachineFunction *MF = TrueMBB->getParent();
29690 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
29691 DebugLoc DL = MIItBegin->getDebugLoc();
29692
29693 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
29694 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
29695
29696 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
29697
29698 // As we are creating the PHIs, we have to be careful if there is more than
29699 // one. Later CMOVs may reference the results of earlier CMOVs, but later
29700 // PHIs have to reference the individual true/false inputs from earlier PHIs.
29701 // That also means that PHI construction must work forward from earlier to
29702   // later, and that the code must maintain a mapping from each earlier PHI's
29703   // destination register to the registers that went into that PHI.
29704 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
29705 MachineInstrBuilder MIB;
29706
29707 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
29708 Register DestReg = MIIt->getOperand(0).getReg();
29709 Register Op1Reg = MIIt->getOperand(1).getReg();
29710 Register Op2Reg = MIIt->getOperand(2).getReg();
29711
29712 // If this CMOV we are generating is the opposite condition from
29713 // the jump we generated, then we have to swap the operands for the
29714 // PHI that is going to be generated.
29715 if (MIIt->getOperand(3).getImm() == OppCC)
29716 std::swap(Op1Reg, Op2Reg);
29717
29718 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
29719 Op1Reg = RegRewriteTable[Op1Reg].first;
29720
29721 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
29722 Op2Reg = RegRewriteTable[Op2Reg].second;
29723
29724 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
29725 .addReg(Op1Reg)
29726 .addMBB(FalseMBB)
29727 .addReg(Op2Reg)
29728 .addMBB(TrueMBB);
29729
29730 // Add this PHI to the rewrite table.
29731 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
29732 }
29733
29734 return MIB;
29735}
29736
29737 // Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
29738MachineBasicBlock *
29739X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
29740 MachineInstr &SecondCascadedCMOV,
29741 MachineBasicBlock *ThisMBB) const {
29742 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
29743 DebugLoc DL = FirstCMOV.getDebugLoc();
29744
29745 // We lower cascaded CMOVs such as
29746 //
29747 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
29748 //
29749 // to two successive branches.
29750 //
29751 // Without this, we would add a PHI between the two jumps, which ends up
29752 // creating a few copies all around. For instance, for
29753 //
29754 // (sitofp (zext (fcmp une)))
29755 //
29756 // we would generate:
29757 //
29758 // ucomiss %xmm1, %xmm0
29759 // movss <1.0f>, %xmm0
29760 // movaps %xmm0, %xmm1
29761 // jne .LBB5_2
29762 // xorps %xmm1, %xmm1
29763 // .LBB5_2:
29764 // jp .LBB5_4
29765 // movaps %xmm1, %xmm0
29766 // .LBB5_4:
29767 // retq
29768 //
29769 // because this custom-inserter would have generated:
29770 //
29771 // A
29772 // | \
29773 // | B
29774 // | /
29775 // C
29776 // | \
29777 // | D
29778 // | /
29779 // E
29780 //
29781 // A: X = ...; Y = ...
29782 // B: empty
29783 // C: Z = PHI [X, A], [Y, B]
29784 // D: empty
29785 // E: PHI [X, C], [Z, D]
29786 //
29787 // If we lower both CMOVs in a single step, we can instead generate:
29788 //
29789 // A
29790 // | \
29791 // | C
29792 // | /|
29793 // |/ |
29794 // | |
29795 // | D
29796 // | /
29797 // E
29798 //
29799 // A: X = ...; Y = ...
29800 // D: empty
29801 // E: PHI [X, A], [X, C], [Y, D]
29802 //
29803 // Which, in our sitofp/fcmp example, gives us something like:
29804 //
29805 // ucomiss %xmm1, %xmm0
29806 // movss <1.0f>, %xmm0
29807 // jne .LBB5_4
29808 // jp .LBB5_4
29809 // xorps %xmm0, %xmm0
29810 // .LBB5_4:
29811 // retq
29812 //
29813
29814 // We lower cascaded CMOV into two successive branches to the same block.
29815 // EFLAGS is used by both, so mark it as live in the second.
29816 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
29817 MachineFunction *F = ThisMBB->getParent();
29818 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
29819 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
29820 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
29821
29822 MachineFunction::iterator It = ++ThisMBB->getIterator();
29823 F->insert(It, FirstInsertedMBB);
29824 F->insert(It, SecondInsertedMBB);
29825 F->insert(It, SinkMBB);
29826
29827 // For a cascaded CMOV, we lower it to two successive branches to
29828 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
29829 // the FirstInsertedMBB.
29830 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
29831
29832 // If the EFLAGS register isn't dead in the terminator, then claim that it's
29833 // live into the sink and copy blocks.
29834 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
29835 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
29836 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
29837 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
29838 SinkMBB->addLiveIn(X86::EFLAGS);
29839 }
29840
29841 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
29842 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
29843 std::next(MachineBasicBlock::iterator(FirstCMOV)),
29844 ThisMBB->end());
29845 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
29846
29847 // Fallthrough block for ThisMBB.
29848 ThisMBB->addSuccessor(FirstInsertedMBB);
29849 // The true block target of the first branch is always SinkMBB.
29850 ThisMBB->addSuccessor(SinkMBB);
29851 // Fallthrough block for FirstInsertedMBB.
29852 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
29853 // The true block for the branch of FirstInsertedMBB.
29854 FirstInsertedMBB->addSuccessor(SinkMBB);
29855 // This is fallthrough.
29856 SecondInsertedMBB->addSuccessor(SinkMBB);
29857
29858 // Create the conditional branch instructions.
29859 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
29860 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
29861
29862 X86::CondCode SecondCC =
29863 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
29864 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
29865
29866 // SinkMBB:
29867 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
29868 Register DestReg = FirstCMOV.getOperand(0).getReg();
29869 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
29870 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
29871 MachineInstrBuilder MIB =
29872 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
29873 .addReg(Op1Reg)
29874 .addMBB(SecondInsertedMBB)
29875 .addReg(Op2Reg)
29876 .addMBB(ThisMBB);
29877
29878   // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
29879   // (the True operand of the SELECT_CC/CMOV nodes).
29880 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
29881 // Copy the PHI result to the register defined by the second CMOV.
29882 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
29883 TII->get(TargetOpcode::COPY),
29884 SecondCascadedCMOV.getOperand(0).getReg())
29885 .addReg(FirstCMOV.getOperand(0).getReg());
29886
29887 // Now remove the CMOVs.
29888 FirstCMOV.eraseFromParent();
29889 SecondCascadedCMOV.eraseFromParent();
29890
29891 return SinkMBB;
29892}
29893
29894MachineBasicBlock *
29895X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
29896 MachineBasicBlock *ThisMBB) const {
29897 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
29898 DebugLoc DL = MI.getDebugLoc();
29899
29900 // To "insert" a SELECT_CC instruction, we actually have to insert the
29901 // diamond control-flow pattern. The incoming instruction knows the
29902 // destination vreg to set, the condition code register to branch on, the
29903 // true/false values to select between and a branch opcode to use.
29904
29905 // ThisMBB:
29906 // ...
29907 // TrueVal = ...
29908 // cmpTY ccX, r1, r2
29909 // bCC copy1MBB
29910 // fallthrough --> FalseMBB
29911
29912 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
29913 // as described above, by inserting a BB, and then making a PHI at the join
29914 // point to select the true and false operands of the CMOV in the PHI.
29915 //
29916 // The code also handles two different cases of multiple CMOV opcodes
29917 // in a row.
29918 //
29919 // Case 1:
29920   // In this case, there are multiple CMOVs in a row, all of which are based on
29921 // the same condition setting (or the exact opposite condition setting).
29922 // In this case we can lower all the CMOVs using a single inserted BB, and
29923 // then make a number of PHIs at the join point to model the CMOVs. The only
29924 // trickiness here, is that in a case like:
29925 //
29926 // t2 = CMOV cond1 t1, f1
29927 // t3 = CMOV cond1 t2, f2
29928 //
29929 // when rewriting this into PHIs, we have to perform some renaming on the
29930 // temps since you cannot have a PHI operand refer to a PHI result earlier
29931 // in the same block. The "simple" but wrong lowering would be:
29932 //
29933 // t2 = PHI t1(BB1), f1(BB2)
29934 // t3 = PHI t2(BB1), f2(BB2)
29935 //
29936 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
29937 // renaming is to note that on the path through BB1, t2 is really just a
29938 // copy of t1, and do that renaming, properly generating:
29939 //
29940 // t2 = PHI t1(BB1), f1(BB2)
29941 // t3 = PHI t1(BB1), f2(BB2)
29942 //
29943 // Case 2:
29944 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
29945 // function - EmitLoweredCascadedSelect.
29946
29947 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
29948 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
29949 MachineInstr *LastCMOV = &MI;
29950 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
29951
29952 // Check for case 1, where there are multiple CMOVs with the same condition
29953 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
29954 // number of jumps the most.
29955
29956 if (isCMOVPseudo(MI)) {
29957 // See if we have a string of CMOVS with the same condition. Skip over
29958 // intervening debug insts.
29959 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
29960 (NextMIIt->getOperand(3).getImm() == CC ||
29961 NextMIIt->getOperand(3).getImm() == OppCC)) {
29962 LastCMOV = &*NextMIIt;
29963 ++NextMIIt;
29964 NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end());
29965 }
29966 }
29967
29968   // Now check for case 2, but only if we didn't already find case 1, as
29969   // indicated by LastCMOV == &MI.
29970 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
29971 NextMIIt->getOpcode() == MI.getOpcode() &&
29972 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
29973 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
29974 NextMIIt->getOperand(1).isKill()) {
29975 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
29976 }
29977
29978 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
29979 MachineFunction *F = ThisMBB->getParent();
29980 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
29981 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
29982
29983 MachineFunction::iterator It = ++ThisMBB->getIterator();
29984 F->insert(It, FalseMBB);
29985 F->insert(It, SinkMBB);
29986
29987 // If the EFLAGS register isn't dead in the terminator, then claim that it's
29988 // live into the sink and copy blocks.
29989 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
29990 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
29991 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
29992 FalseMBB->addLiveIn(X86::EFLAGS);
29993 SinkMBB->addLiveIn(X86::EFLAGS);
29994 }
29995
29996 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
29997 auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
29998 auto DbgIt = MachineBasicBlock::iterator(MI);
29999 while (DbgIt != DbgEnd) {
30000 auto Next = std::next(DbgIt);
30001 if (DbgIt->isDebugInstr())
30002 SinkMBB->push_back(DbgIt->removeFromParent());
30003 DbgIt = Next;
30004 }
30005
30006 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
30007 SinkMBB->splice(SinkMBB->end(), ThisMBB,
30008 std::next(MachineBasicBlock::iterator(LastCMOV)),
30009 ThisMBB->end());
30010 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
30011
30012 // Fallthrough block for ThisMBB.
30013 ThisMBB->addSuccessor(FalseMBB);
30014   // The true block target of the first (or only) branch is always SinkMBB.
30015 ThisMBB->addSuccessor(SinkMBB);
30016 // Fallthrough block for FalseMBB.
30017 FalseMBB->addSuccessor(SinkMBB);
30018
30019 // Create the conditional branch instruction.
30020 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
30021
30022 // SinkMBB:
30023 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
30024 // ...
30025 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
30026 MachineBasicBlock::iterator MIItEnd =
30027 std::next(MachineBasicBlock::iterator(LastCMOV));
30028 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
30029
30030 // Now remove the CMOV(s).
30031 ThisMBB->erase(MIItBegin, MIItEnd);
30032
30033 return SinkMBB;
30034}
30035
30036MachineBasicBlock *
30037X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
30038 MachineBasicBlock *BB) const {
30039 MachineFunction *MF = BB->getParent();
30040 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
30041 DebugLoc DL = MI.getDebugLoc();
30042 const BasicBlock *LLVM_BB = BB->getBasicBlock();
30043
30044   assert(MF->shouldSplitStack());
30045
30046 const bool Is64Bit = Subtarget.is64Bit();
30047 const bool IsLP64 = Subtarget.isTarget64BitLP64();
30048
30049 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
30050 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
30051
30052 // BB:
30053 // ... [Till the alloca]
30054 // If stacklet is not large enough, jump to mallocMBB
30055 //
30056 // bumpMBB:
30057 // Allocate by subtracting from RSP
30058 // Jump to continueMBB
30059 //
30060 // mallocMBB:
30061 // Allocate by call to runtime
30062 //
30063 // continueMBB:
30064 // ...
30065 // [rest of original BB]
30066 //
30067
30068 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
30069 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
30070 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
30071
30072 MachineRegisterInfo &MRI = MF->getRegInfo();
30073 const TargetRegisterClass *AddrRegClass =
30074 getRegClassFor(getPointerTy(MF->getDataLayout()));
30075
30076 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
30077 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
30078 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
30079 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
30080 sizeVReg = MI.getOperand(1).getReg(),
30081 physSPReg =
30082 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
30083
30084 MachineFunction::iterator MBBIter = ++BB->getIterator();
30085
30086 MF->insert(MBBIter, bumpMBB);
30087 MF->insert(MBBIter, mallocMBB);
30088 MF->insert(MBBIter, continueMBB);
30089
30090 continueMBB->splice(continueMBB->begin(), BB,
30091 std::next(MachineBasicBlock::iterator(MI)), BB->end());
30092 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
30093
30094 // Add code to the main basic block to check if the stack limit has been hit,
30095   // and if so, jump to mallocMBB; otherwise continue to bumpMBB.
30096 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
30097 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
30098 .addReg(tmpSPVReg).addReg(sizeVReg);
30099 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
30100 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
30101 .addReg(SPLimitVReg);
30102 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
30103
30104 // bumpMBB simply decreases the stack pointer, since we know the current
30105 // stacklet has enough space.
30106 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
30107 .addReg(SPLimitVReg);
30108 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
30109 .addReg(SPLimitVReg);
30110 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
30111
30112 // Calls into a routine in libgcc to allocate more space from the heap.
30113 const uint32_t *RegMask =
30114 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
30115 if (IsLP64) {
30116 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
30117 .addReg(sizeVReg);
30118 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
30119 .addExternalSymbol("__morestack_allocate_stack_space")
30120 .addRegMask(RegMask)
30121 .addReg(X86::RDI, RegState::Implicit)
30122 .addReg(X86::RAX, RegState::ImplicitDefine);
30123 } else if (Is64Bit) {
30124 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
30125 .addReg(sizeVReg);
30126 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
30127 .addExternalSymbol("__morestack_allocate_stack_space")
30128 .addRegMask(RegMask)
30129 .addReg(X86::EDI, RegState::Implicit)
30130 .addReg(X86::EAX, RegState::ImplicitDefine);
30131 } else {
30132 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
30133 .addImm(12);
30134 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
30135 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
30136 .addExternalSymbol("__morestack_allocate_stack_space")
30137 .addRegMask(RegMask)
30138 .addReg(X86::EAX, RegState::ImplicitDefine);
30139 }
30140
30141 if (!Is64Bit)
30142 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
30143 .addImm(16);
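// The 12-byte SUB plus the 4-byte PUSH above total 16 bytes, and the ADD here
// releases them together; presumably this keeps the outgoing stack 16-byte
// aligned around the __morestack_allocate_stack_space call.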
30144
30145 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
30146 .addReg(IsLP64 ? X86::RAX : X86::EAX);
30147 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
30148
30149 // Set up the CFG correctly.
30150 BB->addSuccessor(bumpMBB);
30151 BB->addSuccessor(mallocMBB);
30152 mallocMBB->addSuccessor(continueMBB);
30153 bumpMBB->addSuccessor(continueMBB);
30154
30155 // Take care of the PHI nodes.
30156 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
30157 MI.getOperand(0).getReg())
30158 .addReg(mallocPtrVReg)
30159 .addMBB(mallocMBB)
30160 .addReg(bumpSPPtrVReg)
30161 .addMBB(bumpMBB);
30162
30163 // Delete the original pseudo instruction.
30164 MI.eraseFromParent();
30165
30166 // And we're done.
30167 return continueMBB;
30168}
30169
30170MachineBasicBlock *
30171X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
30172 MachineBasicBlock *BB) const {
30173 MachineFunction *MF = BB->getParent();
30174 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
30175 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
30176 DebugLoc DL = MI.getDebugLoc();
30177
30178 assert(!isAsynchronousEHPersonality(
30179 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
30180 "SEH does not use catchret!");
30181
30182 // Only 32-bit EH needs to worry about manually restoring stack pointers.
30183 if (!Subtarget.is32Bit())
30184 return BB;
30185
30186 // C++ EH creates a new target block to hold the restore code, and wires up
30187 // the new block to the return destination with a normal JMP_4.
30188 MachineBasicBlock *RestoreMBB =
30189 MF->CreateMachineBasicBlock(BB->getBasicBlock());
30190 assert(BB->succ_size() == 1);
30191 MF->insert(std::next(BB->getIterator()), RestoreMBB);
30192 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
30193 BB->addSuccessor(RestoreMBB);
30194 MI.getOperand(0).setMBB(RestoreMBB);
30195
30196 auto RestoreMBBI = RestoreMBB->begin();
30197 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
30198 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
30199 return BB;
30200}
30201
30202MachineBasicBlock *
30203X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
30204 MachineBasicBlock *BB) const {
30205 MachineFunction *MF = BB->getParent();
30206 const Constant *PerFn = MF->getFunction().getPersonalityFn();
30207 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
30208 // Only 32-bit SEH requires special handling for catchpad.
30209 if (IsSEH && Subtarget.is32Bit()) {
30210 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
30211 DebugLoc DL = MI.getDebugLoc();
30212 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
30213 }
30214 MI.eraseFromParent();
30215 return BB;
30216}
30217
30218MachineBasicBlock *
30219X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
30220 MachineBasicBlock *BB) const {
30221 // So, here we replace TLSADDR with the sequence:
30222 // adjust_stackdown -> TLSADDR -> adjust_stackup.
30223 // We need this because TLSADDR is lowered into calls
30224 // inside MC; without the two markers, shrink-wrapping
30225 // may push the prologue/epilogue past them.
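// The resulting sequence in this block is, roughly:
//   CALLSEQ_START 0, 0, 0
//   TLS_addr32/64 ...      (expanded into the actual TLS call during MC lowering)
//   CALLSEQ_END 0, 0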
30226 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
30227 DebugLoc DL = MI.getDebugLoc();
30228 MachineFunction &MF = *BB->getParent();
30229
30230 // Emit CALLSEQ_START right before the instruction.
30231 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
30232 MachineInstrBuilder CallseqStart =
30233 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
30234 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
30235
30236 // Emit CALLSEQ_END right after the instruction.
30237 // We don't call erase from parent because we want to keep the
30238 // original instruction around.
30239 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
30240 MachineInstrBuilder CallseqEnd =
30241 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
30242 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
30243
30244 return BB;
30245}
30246
30247MachineBasicBlock *
30248X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
30249 MachineBasicBlock *BB) const {
30250 // This is pretty easy. We're taking the value that we received from
30251 // our load from the relocation, sticking it in either RDI (x86-64)
30252 // or EAX and doing an indirect call. The return value will then
30253 // be in the normal return register.
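// On 64-bit Darwin the expected result is roughly (with _var standing in for
// the thread-local variable's descriptor):
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)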
30254 MachineFunction *F = BB->getParent();
30255 const X86InstrInfo *TII = Subtarget.getInstrInfo();
30256 DebugLoc DL = MI.getDebugLoc();
30257
30258 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
30259 assert(MI.getOperand(3).isGlobal() && "This should be a global");
30260
30261 // Get a register mask for the lowered call.
30262 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
30263 // proper register mask.
30264 const uint32_t *RegMask =
30265 Subtarget.is64Bit() ?
30266 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
30267 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
30268 if (Subtarget.is64Bit()) {
30269 MachineInstrBuilder MIB =
30270 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
30271 .addReg(X86::RIP)
30272 .addImm(0)
30273 .addReg(0)
30274 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
30275 MI.getOperand(3).getTargetFlags())
30276 .addReg(0);
30277 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
30278 addDirectMem(MIB, X86::RDI);
30279 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
30280 } else if (!isPositionIndependent()) {
30281 MachineInstrBuilder MIB =
30282 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
30283 .addReg(0)
30284 .addImm(0)
30285 .addReg(0)
30286 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
30287 MI.getOperand(3).getTargetFlags())
30288 .addReg(0);
30289 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
30290 addDirectMem(MIB, X86::EAX);
30291 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
30292 } else {
30293 MachineInstrBuilder MIB =
30294 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
30295 .addReg(TII->getGlobalBaseReg(F))
30296 .addImm(0)
30297 .addReg(0)
30298 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
30299 MI.getOperand(3).getTargetFlags())
30300 .addReg(0);
30301 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
30302 addDirectMem(MIB, X86::EAX);
30303 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
30304 }
30305
30306 MI.eraseFromParent(); // The pseudo instruction is gone now.
30307 return BB;
30308}
30309
30310static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
30311 switch (RPOpc) {
30312 case X86::RETPOLINE_CALL32:
30313 return X86::CALLpcrel32;
30314 case X86::RETPOLINE_CALL64:
30315 return X86::CALL64pcrel32;
30316 case X86::RETPOLINE_TCRETURN32:
30317 return X86::TCRETURNdi;
30318 case X86::RETPOLINE_TCRETURN64:
30319 return X86::TCRETURNdi64;
30320 }
30321 llvm_unreachable("not retpoline opcode");
30322}
30323
30324static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
30325 unsigned Reg) {
30326 if (Subtarget.useRetpolineExternalThunk()) {
30327 // When using an external thunk for retpolines, we pick names that match the
30328 // names GCC happens to use as well. This helps simplify the implementation
30329 // of the thunks for kernels where they have no easy ability to create
30330 // aliases and are doing non-trivial configuration of the thunk's body. For
30331 // example, the Linux kernel will do boot-time hot patching of the thunk
30332 // bodies and cannot easily export aliases of these to loaded modules.
30333 //
30334 // Note that at any point in the future, we may need to change the semantics
30335 // of how we implement retpolines and at that time will likely change the
30336 // name of the called thunk. Essentially, there is no hard guarantee that
30337 // LLVM will generate calls to specific thunks, we merely make a best-effort
30338 // attempt to help out kernels and other systems where duplicating the
30339 // thunks is costly.
30340 switch (Reg) {
30341 case X86::EAX:
30342 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
30343 return "__x86_indirect_thunk_eax";
30344 case X86::ECX:
30345 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
30346 return "__x86_indirect_thunk_ecx";
30347 case X86::EDX:
30348 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
30349 return "__x86_indirect_thunk_edx";
30350 case X86::EDI:
30351 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
30352 return "__x86_indirect_thunk_edi";
30353 case X86::R11:
30354 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
30355 return "__x86_indirect_thunk_r11";
30356 }
30357 llvm_unreachable("unexpected reg for retpoline");
30358 }
30359
30360 // When targeting an internal COMDAT thunk use an LLVM-specific name.
30361 switch (Reg) {
30362 case X86::EAX:
30363 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
30364 return "__llvm_retpoline_eax";
30365 case X86::ECX:
30366 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
30367 return "__llvm_retpoline_ecx";
30368 case X86::EDX:
30369 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
30370 return "__llvm_retpoline_edx";
30371 case X86::EDI:
30372 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
30373 return "__llvm_retpoline_edi";
30374 case X86::R11:
30375 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
30376 return "__llvm_retpoline_r11";
30377 }
30378 llvm_unreachable("unexpected reg for retpoline");
30379}
30380
30381MachineBasicBlock *
30382X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
30383 MachineBasicBlock *BB) const {
30384 // Copy the virtual register into the R11 physical register and
30385 // call the retpoline thunk.
30386 DebugLoc DL = MI.getDebugLoc();
30387 const X86InstrInfo *TII = Subtarget.getInstrInfo();
30388 Register CalleeVReg = MI.getOperand(0).getReg();
30389 unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
30390
30391 // Find an available scratch register to hold the callee. On 64-bit, we can
30392 // just use R11, but we scan for uses anyway to ensure we don't generate
30393 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
30394 // already a register use operand to the call to hold the callee. If none
30395 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
30396 // register and ESI is the base pointer to realigned stack frames with VLAs.
30397 SmallVector<unsigned, 3> AvailableRegs;
30398 if (Subtarget.is64Bit())
30399 AvailableRegs.push_back(X86::R11);
30400 else
30401 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
30402
30403 // Zero out any registers that are already used.
30404 for (const auto &MO : MI.operands()) {
30405 if (MO.isReg() && MO.isUse())
30406 for (unsigned &Reg : AvailableRegs)
30407 if (Reg == MO.getReg())
30408 Reg = 0;
30409 }
30410
30411 // Choose the first remaining non-zero available register.
30412 unsigned AvailableReg = 0;
30413 for (unsigned MaybeReg : AvailableRegs) {
30414 if (MaybeReg) {
30415 AvailableReg = MaybeReg;
30416 break;
30417 }
30418 }
30419 if (!AvailableReg)
30420 report_fatal_error("calling convention incompatible with retpoline, no "
30421 "available registers");
30422
30423 const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
30424
30425 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
30426 .addReg(CalleeVReg);
30427 MI.getOperand(0).ChangeToES(Symbol);
30428 MI.setDesc(TII->get(Opc));
30429 MachineInstrBuilder(*BB->getParent(), &MI)
30430 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
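// For a 64-bit call this leaves, roughly:
//   $r11 = COPY %callee
//   CALL64pcrel32 @__x86_indirect_thunk_r11 (or @__llvm_retpoline_r11),
//                 implicit killed $r11
// i.e. the indirect call is funneled through the thunk with the target in R11.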
30431 return BB;
30432}
30433
30434/// SetJmp implies future control flow change upon calling the corresponding
30435/// LongJmp.
30436/// Instead of using the 'return' instruction, the long jump fixes the stack and
30437/// performs an indirect branch. To do so it uses the registers that were stored
30438/// in the jump buffer (when calling SetJmp).
30439 /// If the shadow stack is enabled, we need to fix it as well, because some
30440 /// return addresses will be skipped.
30441 /// This function saves the SSP so that it can later be fixed by
30442 /// emitLongJmpShadowStackFix.
30443/// \sa emitLongJmpShadowStackFix
30444/// \param [in] MI The temporary Machine Instruction for the builtin.
30445/// \param [in] MBB The Machine Basic Block that will be modified.
30446void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
30447 MachineBasicBlock *MBB) const {
30448 DebugLoc DL = MI.getDebugLoc();
30449 MachineFunction *MF = MBB->getParent();
30450 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
30451 MachineRegisterInfo &MRI = MF->getRegInfo();
30452 MachineInstrBuilder MIB;
30453
30454 // Memory Reference.
30455 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
30456 MI.memoperands_end());
30457
30458 // Initialize a register with zero.
30459 MVT PVT = getPointerTy(MF->getDataLayout());
30460 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
30461 Register ZReg = MRI.createVirtualRegister(PtrRC);
30462 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
30463 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
30464 .addDef(ZReg)
30465 .addReg(ZReg, RegState::Undef)
30466 .addReg(ZReg, RegState::Undef);
30467
30468 // Read the current SSP Register value to the zeroed register.
30469 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
30470 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
30471 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
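// Note: when shadow stacks are not enabled, RDSSP is effectively a NOP and
// leaves the zeroed register untouched, so a zero SSP value is stored below;
// the longjmp path checks for that zero before attempting any fix.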
30472
30473 // Write the SSP register value to offset 3 in input memory buffer.
30474 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
30475 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
30476 const int64_t SSPOffset = 3 * PVT.getStoreSize();
30477 const unsigned MemOpndSlot = 1;
30478 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
30479 if (i == X86::AddrDisp)
30480 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
30481 else
30482 MIB.add(MI.getOperand(MemOpndSlot + i));
30483 }
30484 MIB.addReg(SSPCopyReg);
30485 MIB.setMemRefs(MMOs);
30486}
30487
30488MachineBasicBlock *
30489X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
30490 MachineBasicBlock *MBB) const {
30491 DebugLoc DL = MI.getDebugLoc();
30492 MachineFunction *MF = MBB->getParent();
30493 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
30494 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
30495 MachineRegisterInfo &MRI = MF->getRegInfo();
30496
30497 const BasicBlock *BB = MBB->getBasicBlock();
30498 MachineFunction::iterator I = ++MBB->getIterator();
30499
30500 // Memory Reference
30501 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
30502 MI.memoperands_end());
30503
30504 unsigned DstReg;
30505 unsigned MemOpndSlot = 0;
30506
30507 unsigned CurOp = 0;
30508
30509 DstReg = MI.getOperand(CurOp++).getReg();
30510 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
30511 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
30512 (void)TRI;
30513 Register mainDstReg = MRI.createVirtualRegister(RC);
30514 Register restoreDstReg = MRI.createVirtualRegister(RC);
30515
30516 MemOpndSlot = CurOp;
30517
30518 MVT PVT = getPointerTy(MF->getDataLayout());
30519 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
30520 "Invalid Pointer Size!");
30521
30522 // For v = setjmp(buf), we generate
30523 //
30524 // thisMBB:
30525 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
30526 // SjLjSetup restoreMBB
30527 //
30528 // mainMBB:
30529 // v_main = 0
30530 //
30531 // sinkMBB:
30532 // v = phi(main, restore)
30533 //
30534 // restoreMBB:
30535 // if base pointer being used, load it from frame
30536 // v_restore = 1
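// The jump-buffer layout assumed by this lowering and by the longjmp /
// shadow-stack helpers below appears to be, in pointer-sized slots:
//   [0] frame pointer, [1] resume IP (LabelOffset), [2] stack pointer,
//   [3] shadow stack pointer (SSPOffset).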
30537
30538 MachineBasicBlock *thisMBB = MBB;
30539 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
30540 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
30541 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
30542 MF->insert(I, mainMBB);
30543 MF->insert(I, sinkMBB);
30544 MF->push_back(restoreMBB);
30545 restoreMBB->setHasAddressTaken();
30546
30547 MachineInstrBuilder MIB;
30548
30549 // Transfer the remainder of BB and its successor edges to sinkMBB.
30550 sinkMBB->splice(sinkMBB->begin(), MBB,
30551 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
30552 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
30553
30554 // thisMBB:
30555 unsigned PtrStoreOpc = 0;
30556 unsigned LabelReg = 0;
30557 const int64_t LabelOffset = 1 * PVT.getStoreSize();
30558 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
30559 !isPositionIndependent();
30560
30561 // Prepare IP either in reg or imm.
30562 if (!UseImmLabel) {
30563 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
30564 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
30565 LabelReg = MRI.createVirtualRegister(PtrRC);
30566 if (Subtarget.is64Bit()) {
30567 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
30568 .addReg(X86::RIP)
30569 .addImm(0)
30570 .addReg(0)
30571 .addMBB(restoreMBB)
30572 .addReg(0);
30573 } else {
30574 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
30575 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
30576 .addReg(XII->getGlobalBaseReg(MF))
30577 .addImm(0)
30578 .addReg(0)
30579 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
30580 .addReg(0);
30581 }
30582 } else
30583 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
30584 // Store IP
30585 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
30586 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
30587 if (i == X86::AddrDisp)
30588 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
30589 else
30590 MIB.add(MI.getOperand(MemOpndSlot + i));
30591 }
30592 if (!UseImmLabel)
30593 MIB.addReg(LabelReg);
30594 else
30595 MIB.addMBB(restoreMBB);
30596 MIB.setMemRefs(MMOs);
30597
30598 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
30599 emitSetJmpShadowStackFix(MI, thisMBB);
30600 }
30601
30602 // Setup
30603 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
30604 .addMBB(restoreMBB);
30605
30606 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
30607 MIB.addRegMask(RegInfo->getNoPreservedMask());
30608 thisMBB->addSuccessor(mainMBB);
30609 thisMBB->addSuccessor(restoreMBB);
30610
30611 // mainMBB:
30612 // EAX = 0
30613 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
30614 mainMBB->addSuccessor(sinkMBB);
30615
30616 // sinkMBB:
30617 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
30618 TII->get(X86::PHI), DstReg)
30619 .addReg(mainDstReg).addMBB(mainMBB)
30620 .addReg(restoreDstReg).addMBB(restoreMBB);
30621
30622 // restoreMBB:
30623 if (RegInfo->hasBasePointer(*MF)) {
30624 const bool Uses64BitFramePtr =
30625 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
30626 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
30627 X86FI->setRestoreBasePointer(MF);
30628 Register FramePtr = RegInfo->getFrameRegister(*MF);
30629 Register BasePtr = RegInfo->getBaseRegister();
30630 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
30631 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
30632 FramePtr, true, X86FI->getRestoreBasePointerOffset())
30633 .setMIFlag(MachineInstr::FrameSetup);
30634 }
30635 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
30636 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
30637 restoreMBB->addSuccessor(sinkMBB);
30638
30639 MI.eraseFromParent();
30640 return sinkMBB;
30641}
30642
30643/// Fix the shadow stack using the previously saved SSP pointer.
30644/// \sa emitSetJmpShadowStackFix
30645/// \param [in] MI The temporary Machine Instruction for the builtin.
30646/// \param [in] MBB The Machine Basic Block that will be modified.
30647/// \return The sink MBB that will perform the future indirect branch.
30648MachineBasicBlock *
30649X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
30650 MachineBasicBlock *MBB) const {
30651 DebugLoc DL = MI.getDebugLoc();
30652 MachineFunction *MF = MBB->getParent();
30653 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
30654 MachineRegisterInfo &MRI = MF->getRegInfo();
30655
30656 // Memory Reference
30657 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
30658 MI.memoperands_end());
30659
30660 MVT PVT = getPointerTy(MF->getDataLayout());
30661 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
30662
30663 // checkSspMBB:
30664 // xor vreg1, vreg1
30665 // rdssp vreg1
30666 // test vreg1, vreg1
30667 // je sinkMBB # Jump if Shadow Stack is not supported
30668 // fallMBB:
30669 // mov buf+24/12(%rip), vreg2
30670 // sub vreg1, vreg2
30671 // jbe sinkMBB # No need to fix the Shadow Stack
30672 // fixShadowMBB:
30673 // shr 3/2, vreg2
30674 // incssp vreg2 # fix the SSP according to the lower 8 bits
30675 // shr 8, vreg2
30676 // je sinkMBB
30677 // fixShadowLoopPrepareMBB:
30678 // shl vreg2
30679 // mov 128, vreg3
30680 // fixShadowLoopMBB:
30681 // incssp vreg3
30682 // dec vreg2
30683 // jne fixShadowLoopMBB # Iterate until you finish fixing
30684 // # the Shadow Stack
30685 // sinkMBB:
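// Worked example (64-bit): a saved-vs-current SSP delta of 0x1208 bytes is
// 0x241 eight-byte entries. The first incssp consumes the low 8 bits (0x41
// entries), the shr-by-8 leaves 0x2, the shl doubles it to 4, and four
// iterations of "incssp 128" advance the SSP by the remaining 0x200 entries.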
30686
30687 MachineFunction::iterator I = ++MBB->getIterator();
30688 const BasicBlock *BB = MBB->getBasicBlock();
30689
30690 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
30691 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
30692 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
30693 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
30694 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
30695 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
30696 MF->insert(I, checkSspMBB);
30697 MF->insert(I, fallMBB);
30698 MF->insert(I, fixShadowMBB);
30699 MF->insert(I, fixShadowLoopPrepareMBB);
30700 MF->insert(I, fixShadowLoopMBB);
30701 MF->insert(I, sinkMBB);
30702
30703 // Transfer the remainder of BB and its successor edges to sinkMBB.
30704 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
30705 MBB->end());
30706 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
30707
30708 MBB->addSuccessor(checkSspMBB);
30709
30710 // Initialize a register with zero.
30711 Register ZReg = MRI.createVirtualRegister(PtrRC);
30712 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
30713 BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
30714 .addDef(ZReg)
30715 .addReg(ZReg, RegState::Undef)
30716 .addReg(ZReg, RegState::Undef);
30717
30718 // Read the current SSP Register value to the zeroed register.
30719 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
30720 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
30721 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
30722
30723 // Check whether the result of the SSP register is zero and jump directly
30724 // to the sink.
30725 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
30726 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
30727 .addReg(SSPCopyReg)
30728 .addReg(SSPCopyReg);
30729 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
30730 checkSspMBB->addSuccessor(sinkMBB);
30731 checkSspMBB->addSuccessor(fallMBB);
30732
30733 // Reload the previously saved SSP register value.
30734 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
30735 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
30736 const int64_t SPPOffset = 3 * PVT.getStoreSize();
30737 MachineInstrBuilder MIB =
30738 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
30739 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
30740 const MachineOperand &MO = MI.getOperand(i);
30741 if (i == X86::AddrDisp)
30742 MIB.addDisp(MO, SPPOffset);
30743 else if (MO.isReg()) // Don't add the whole operand, we don't want to
30744 // preserve kill flags.
30745 MIB.addReg(MO.getReg());
30746 else
30747 MIB.add(MO);
30748 }
30749 MIB.setMemRefs(MMOs);
30750
30751 // Subtract the current SSP from the previous SSP.
30752 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
30753 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
30754 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
30755 .addReg(PrevSSPReg)
30756 .addReg(SSPCopyReg);
30757
30758 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
30759 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
30760 fallMBB->addSuccessor(sinkMBB);
30761 fallMBB->addSuccessor(fixShadowMBB);
30762
30763 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
30764 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
30765 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
30766 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
30767 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
30768 .addReg(SspSubReg)
30769 .addImm(Offset);
30770
30771 // Increase the SSP, looking only at the lower 8 bits of the delta.
30772 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
30773 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
30774
30775 // Reset the lower 8 bits.
30776 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
30777 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
30778 .addReg(SspFirstShrReg)
30779 .addImm(8);
30780
30781 // Jump if the result of the shift is zero.
30782 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
30783 fixShadowMBB->addSuccessor(sinkMBB);
30784 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
30785
30786 // Do a single shift left.
30787 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
30788 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
30789 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
30790 .addReg(SspSecondShrReg);
30791
30792 // Save the value 128 to a register (will be used next with incssp).
30793 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
30794 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
30795 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
30796 .addImm(128);
30797 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
30798
30799 // Since incssp only looks at the lower 8 bits, we might need to do several
30800 // iterations of incssp until we finish fixing the shadow stack.
30801 Register DecReg = MRI.createVirtualRegister(PtrRC);
30802 Register CounterReg = MRI.createVirtualRegister(PtrRC);
30803 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
30804 .addReg(SspAfterShlReg)
30805 .addMBB(fixShadowLoopPrepareMBB)
30806 .addReg(DecReg)
30807 .addMBB(fixShadowLoopMBB);
30808
30809 // Every iteration we increase the SSP by 128.
30810 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
30811
30812 // Every iteration we decrement the counter by 1.
30813 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
30814 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
30815
30816 // Jump if the counter is not zero yet.
30817 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
30818 fixShadowLoopMBB->addSuccessor(sinkMBB);
30819 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
30820
30821 return sinkMBB;
30822}
30823
30824MachineBasicBlock *
30825X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
30826 MachineBasicBlock *MBB) const {
30827 DebugLoc DL = MI.getDebugLoc();
30828 MachineFunction *MF = MBB->getParent();
30829 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
30830 MachineRegisterInfo &MRI = MF->getRegInfo();
30831
30832 // Memory Reference
30833 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
30834 MI.memoperands_end());
30835
30836 MVT PVT = getPointerTy(MF->getDataLayout());
30837 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
30838 "Invalid Pointer Size!");
30839
30840 const TargetRegisterClass *RC =
30841 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
30842 Register Tmp = MRI.createVirtualRegister(RC);
30843 // Since FP is only updated here but NOT referenced, it's treated as GPR.
30844 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
30845 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
30846 Register SP = RegInfo->getStackRegister();
30847
30848 MachineInstrBuilder MIB;
30849
30850 const int64_t LabelOffset = 1 * PVT.getStoreSize();
30851 const int64_t SPOffset = 2 * PVT.getStoreSize();
30852
30853 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
30854 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
30855
30856 MachineBasicBlock *thisMBB = MBB;
30857
30858 // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
30859 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
30860 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
30861 }
30862
30863 // Reload FP
30864 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
30865 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
30866 const MachineOperand &MO = MI.getOperand(i);
30867 if (MO.isReg()) // Don't add the whole operand, we don't want to
30868 // preserve kill flags.
30869 MIB.addReg(MO.getReg());
30870 else
30871 MIB.add(MO);
30872 }
30873 MIB.setMemRefs(MMOs);
30874
30875 // Reload IP
30876 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
30877 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
30878 const MachineOperand &MO = MI.getOperand(i);
30879 if (i == X86::AddrDisp)
30880 MIB.addDisp(MO, LabelOffset);
30881 else if (MO.isReg()) // Don't add the whole operand, we don't want to
30882 // preserve kill flags.
30883 MIB.addReg(MO.getReg());
30884 else
30885 MIB.add(MO);
30886 }
30887 MIB.setMemRefs(MMOs);
30888
30889 // Reload SP
30890 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
30891 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
30892 if (i == X86::AddrDisp)
30893 MIB.addDisp(MI.getOperand(i), SPOffset);
30894 else
30895 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
30896 // the last instruction of the expansion.
30897 }
30898 MIB.setMemRefs(MMOs);
30899
30900 // Jump
30901 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
30902
30903 MI.eraseFromParent();
30904 return thisMBB;
30905}
30906
30907void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
30908 MachineBasicBlock *MBB,
30909 MachineBasicBlock *DispatchBB,
30910 int FI) const {
30911 DebugLoc DL = MI.getDebugLoc();
30912 MachineFunction *MF = MBB->getParent();
30913 MachineRegisterInfo *MRI = &MF->getRegInfo();
30914 const X86InstrInfo *TII = Subtarget.getInstrInfo();
30915
30916 MVT PVT = getPointerTy(MF->getDataLayout());
30917 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
30918
30919 unsigned Op = 0;
30920 unsigned VR = 0;
30921
30922 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
30923 !isPositionIndependent();
30924
30925 if (UseImmLabel) {
30926 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
30927 } else {
30928 const TargetRegisterClass *TRC =
30929 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
30930 VR = MRI->createVirtualRegister(TRC);
30931 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
30932
30933 if (Subtarget.is64Bit())
30934 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
30935 .addReg(X86::RIP)
30936 .addImm(1)
30937 .addReg(0)
30938 .addMBB(DispatchBB)
30939 .addReg(0);
30940 else
30941 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
30942 .addReg(0) /* TII->getGlobalBaseReg(MF) */
30943 .addImm(1)
30944 .addReg(0)
30945 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
30946 .addReg(0);
30947 }
30948
30949 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
30950 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
30951 if (UseImmLabel)
30952 MIB.addMBB(DispatchBB);
30953 else
30954 MIB.addReg(VR);
30955}
30956
30957MachineBasicBlock *
30958X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
30959 MachineBasicBlock *BB) const {
30960 DebugLoc DL = MI.getDebugLoc();
30961 MachineFunction *MF = BB->getParent();
30962 MachineRegisterInfo *MRI = &MF->getRegInfo();
30963 const X86InstrInfo *TII = Subtarget.getInstrInfo();
30964 int FI = MF->getFrameInfo().getFunctionContextIndex();
30965
30966 // Get a mapping of the call site numbers to all of the landing pads they're
30967 // associated with.
30968 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
30969 unsigned MaxCSNum = 0;
30970 for (auto &MBB : *MF) {
30971 if (!MBB.isEHPad())
30972 continue;
30973
30974 MCSymbol *Sym = nullptr;
30975 for (const auto &MI : MBB) {
30976 if (MI.isDebugInstr())
30977 continue;
30978
30979 assert(MI.isEHLabel() && "expected EH_LABEL");
30980 Sym = MI.getOperand(0).getMCSymbol();
30981 break;
30982 }
30983
30984 if (!MF->hasCallSiteLandingPad(Sym))
30985 continue;
30986
30987 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
30988 CallSiteNumToLPad[CSI].push_back(&MBB);
30989 MaxCSNum = std::max(MaxCSNum, CSI);
30990 }
30991 }
30992
30993 // Get an ordered list of the machine basic blocks for the jump table.
30994 std::vector<MachineBasicBlock *> LPadList;
30995 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
30996 LPadList.reserve(CallSiteNumToLPad.size());
30997
30998 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
30999 for (auto &LP : CallSiteNumToLPad[CSI]) {
31000 LPadList.push_back(LP);
31001 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
31002 }
31003 }
31004
31005 assert(!LPadList.empty() &&
31006 "No landing pad destinations for the dispatch jump table!");
31007
31008 // Create the MBBs for the dispatch code.
31009
31010 // Shove the dispatch's address into the return slot in the function context.
31011 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
31012 DispatchBB->setIsEHPad(true);
31013
31014 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
31015 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
31016 DispatchBB->addSuccessor(TrapBB);
31017
31018 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
31019 DispatchBB->addSuccessor(DispContBB);
31020
31021 // Insert MBBs.
31022 MF->push_back(DispatchBB);
31023 MF->push_back(DispContBB);
31024 MF->push_back(TrapBB);
31025
31026 // Insert code into the entry block that creates and registers the function
31027 // context.
31028 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
31029
31030 // Create the jump table and associated information
31031 unsigned JTE = getJumpTableEncoding();
31032 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
31033 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
31034
31035 const X86RegisterInfo &RI = TII->getRegisterInfo();
31036 // Add a register mask with no preserved registers. This results in all
31037 // registers being marked as clobbered.
31038 if (RI.hasBasePointer(*MF)) {
31039 const bool FPIs64Bit =
31040 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
31041 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
31042 MFI->setRestoreBasePointer(MF);
31043
31044 Register FP = RI.getFrameRegister(*MF);
31045 Register BP = RI.getBaseRegister();
31046 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
31047 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
31048 MFI->getRestoreBasePointerOffset())
31049 .addRegMask(RI.getNoPreservedMask());
31050 } else {
31051 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
31052 .addRegMask(RI.getNoPreservedMask());
31053 }
31054
31055 // IReg is used as an index in a memory operand and therefore can't be SP
31056 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
31057 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
31058 Subtarget.is64Bit() ? 8 : 4);
31059 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
31060 .addReg(IReg)
31061 .addImm(LPadList.size());
31062 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
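// If the call-site index loaded from the function context is out of range for
// the landing-pad list, branch to TrapBB instead of indexing past the end of
// the jump table.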
31063
31064 if (Subtarget.is64Bit()) {
31065 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
31066 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
31067
31068 // leaq .LJTI0_0(%rip), BReg
31069 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
31070 .addReg(X86::RIP)
31071 .addImm(1)
31072 .addReg(0)
31073 .addJumpTableIndex(MJTI)
31074 .addReg(0);
31075 // movzx IReg64, IReg
31076 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
31077 .addImm(0)
31078 .addReg(IReg)
31079 .addImm(X86::sub_32bit);
31080
31081 switch (JTE) {
31082 case MachineJumpTableInfo::EK_BlockAddress:
31083 // jmpq *(BReg,IReg64,8)
31084 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
31085 .addReg(BReg)
31086 .addImm(8)
31087 .addReg(IReg64)
31088 .addImm(0)
31089 .addReg(0);
31090 break;
31091 case MachineJumpTableInfo::EK_LabelDifference32: {
31092 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
31093 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
31094 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
31095
31096 // movl (BReg,IReg64,4), OReg
31097 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
31098 .addReg(BReg)
31099 .addImm(4)
31100 .addReg(IReg64)
31101 .addImm(0)
31102 .addReg(0);
31103 // movsx OReg64, OReg
31104 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
31105 // addq BReg, OReg64, TReg
31106 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
31107 .addReg(OReg64)
31108 .addReg(BReg);
31109 // jmpq *TReg
31110 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
31111 break;
31112 }
31113 default:
31114 llvm_unreachable("Unexpected jump table encoding");
31115 }
31116 } else {
31117 // jmpl *.LJTI0_0(,IReg,4)
31118 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
31119 .addReg(0)
31120 .addImm(4)
31121 .addReg(IReg)
31122 .addJumpTableIndex(MJTI)
31123 .addReg(0);
31124 }
31125
31126 // Add the jump table entries as successors to the MBB.
31127 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
31128 for (auto &LP : LPadList)
31129 if (SeenMBBs.insert(LP).second)
31130 DispContBB->addSuccessor(LP);
31131
31132 // N.B. the order the invoke BBs are processed in doesn't matter here.
31133 SmallVector<MachineBasicBlock *, 64> MBBLPads;
31134 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
31135 for (MachineBasicBlock *MBB : InvokeBBs) {
31136 // Remove the landing pad successor from the invoke block and replace it
31137 // with the new dispatch block.
31138 // Keep a copy of Successors since it's modified inside the loop.
31139 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
31140 MBB->succ_rend());
31141 // FIXME: Avoid quadratic complexity.
31142 for (auto MBBS : Successors) {
31143 if (MBBS->isEHPad()) {
31144 MBB->removeSuccessor(MBBS);
31145 MBBLPads.push_back(MBBS);
31146 }
31147 }
31148
31149 MBB->addSuccessor(DispatchBB);
31150
31151 // Find the invoke call and mark all of the callee-saved registers as
31152 // 'implicit defined' so that they're spilled. This prevents later passes
31153 // from moving instructions to before the EH block, where they would never
31154 // be executed.
31155 for (auto &II : reverse(*MBB)) {
31156 if (!II.isCall())
31157 continue;
31158
31159 DenseMap<unsigned, bool> DefRegs;
31160 for (auto &MOp : II.operands())
31161 if (MOp.isReg())
31162 DefRegs[MOp.getReg()] = true;
31163
31164 MachineInstrBuilder MIB(*MF, &II);
31165 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
31166 unsigned Reg = SavedRegs[RegIdx];
31167 if (!DefRegs[Reg])
31168 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
31169 }
31170
31171 break;
31172 }
31173 }
31174
31175 // Mark all former landing pads as non-landing pads. The dispatch is the only
31176 // landing pad now.
31177 for (auto &LP : MBBLPads)
31178 LP->setIsEHPad(false);
31179
31180 // The instruction is gone now.
31181 MI.eraseFromParent();
31182 return BB;
31183}
31184
31185MachineBasicBlock *
31186X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
31187 MachineBasicBlock *BB) const {
31188 MachineFunction *MF = BB->getParent();
31189 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31190 DebugLoc DL = MI.getDebugLoc();
31191
31192 switch (MI.getOpcode()) {
31193 default: llvm_unreachable("Unexpected instr type to insert");
31194 case X86::TLS_addr32:
31195 case X86::TLS_addr64:
31196 case X86::TLS_base_addr32:
31197 case X86::TLS_base_addr64:
31198 return EmitLoweredTLSAddr(MI, BB);
31199 case X86::RETPOLINE_CALL32:
31200 case X86::RETPOLINE_CALL64:
31201 case X86::RETPOLINE_TCRETURN32:
31202 case X86::RETPOLINE_TCRETURN64:
31203 return EmitLoweredRetpoline(MI, BB);
31204 case X86::CATCHRET:
31205 return EmitLoweredCatchRet(MI, BB);
31206 case X86::CATCHPAD:
31207 return EmitLoweredCatchPad(MI, BB);
31208 case X86::SEG_ALLOCA_32:
31209 case X86::SEG_ALLOCA_64:
31210 return EmitLoweredSegAlloca(MI, BB);
31211 case X86::TLSCall_32:
31212 case X86::TLSCall_64:
31213 return EmitLoweredTLSCall(MI, BB);
31214 case X86::CMOV_FR32:
31215 case X86::CMOV_FR32X:
31216 case X86::CMOV_FR64:
31217 case X86::CMOV_FR64X:
31218 case X86::CMOV_GR8:
31219 case X86::CMOV_GR16:
31220 case X86::CMOV_GR32:
31221 case X86::CMOV_RFP32:
31222 case X86::CMOV_RFP64:
31223 case X86::CMOV_RFP80:
31224 case X86::CMOV_VR128:
31225 case X86::CMOV_VR128X:
31226 case X86::CMOV_VR256:
31227 case X86::CMOV_VR256X:
31228 case X86::CMOV_VR512:
31229 case X86::CMOV_VK2:
31230 case X86::CMOV_VK4:
31231 case X86::CMOV_VK8:
31232 case X86::CMOV_VK16:
31233 case X86::CMOV_VK32:
31234 case X86::CMOV_VK64:
31235 return EmitLoweredSelect(MI, BB);
31236
31237 case X86::RDFLAGS32:
31238 case X86::RDFLAGS64: {
31239 unsigned PushF =
31240 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
31241 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
31242 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
31243 // Permit reads of the EFLAGS and DF registers without them being defined.
31244 // This intrinsic exists to read external processor state in flags, such as
31245 // the trap flag, interrupt flag, and direction flag, none of which are
31246 // modeled by the backend.
31247 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
31248 "Unexpected register in operand!");
31249 Push->getOperand(2).setIsUndef();
31250 assert(Push->getOperand(3).getReg() == X86::DF &&
31251 "Unexpected register in operand!");
31252 Push->getOperand(3).setIsUndef();
31253 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
31254
31255 MI.eraseFromParent(); // The pseudo is gone now.
31256 return BB;
31257 }
31258
31259 case X86::WRFLAGS32:
31260 case X86::WRFLAGS64: {
31261 unsigned Push =
31262 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
31263 unsigned PopF =
31264 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
31265 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
31266 BuildMI(*BB, MI, DL, TII->get(PopF));
31267
31268 MI.eraseFromParent(); // The pseudo is gone now.
31269 return BB;
31270 }
31271
31272 case X86::FP32_TO_INT16_IN_MEM:
31273 case X86::FP32_TO_INT32_IN_MEM:
31274 case X86::FP32_TO_INT64_IN_MEM:
31275 case X86::FP64_TO_INT16_IN_MEM:
31276 case X86::FP64_TO_INT32_IN_MEM:
31277 case X86::FP64_TO_INT64_IN_MEM:
31278 case X86::FP80_TO_INT16_IN_MEM:
31279 case X86::FP80_TO_INT32_IN_MEM:
31280 case X86::FP80_TO_INT64_IN_MEM: {
31281 // Change the floating point control register to use "round towards zero"
31282 // mode when truncating to an integer value.
31283 int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
31284 addFrameReference(BuildMI(*BB, MI, DL,
31285 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
31286
31287 // Load the old value of the control word...
31288 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
31289 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
31290 OrigCWFrameIdx);
31291
31292 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
31293 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
31294 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
31295 .addReg(OldCW, RegState::Kill).addImm(0xC00);
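// 0xC00 covers bits 11:10 of the x87 control word (the rounding-control
// field); setting both bits selects round-toward-zero, matching C's
// truncating FP-to-integer conversion.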
31296
31297 // Extract to 16 bits.
31298 Register NewCW16 =
31299 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
31300 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
31301 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
31302
31303 // Prepare memory for FLDCW.
31304 int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
31305 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
31306 NewCWFrameIdx)
31307 .addReg(NewCW16, RegState::Kill);
31308
31309 // Reload the modified control word now...
31310 addFrameReference(BuildMI(*BB, MI, DL,
31311 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
31312
31313 // Get the X86 opcode to use.
31314 unsigned Opc;
31315 switch (MI.getOpcode()) {
31316 default: llvm_unreachable("illegal opcode!");
31317 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
31318 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
31319 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
31320 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
31321 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
31322 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
31323 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
31324 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
31325 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
31326 }
31327
31328 X86AddressMode AM = getAddressFromInstr(&MI, 0);
31329 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
31330 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
31331
31332 // Reload the original control word now.
31333 addFrameReference(BuildMI(*BB, MI, DL,
31334 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
31335
31336 MI.eraseFromParent(); // The pseudo instruction is gone now.
31337 return BB;
31338 }
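
For reference, a minimal standalone sketch (plain C++, not LLVM code) of the control-word edit this pseudo expansion performs: bits 10-11 of the x87 control word are the rounding-control field, and OR-ing in 0xC00 forces the 0b11 "round toward zero" encoding needed for truncation. The starting control-word value below is a hypothetical default.

#include <cstdint>
#include <cstdio>

int main() {
  uint16_t OldCW = 0x037F;          // hypothetical default x87 control word
  uint16_t NewCW = OldCW | 0x0C00;  // force RC field (bits 10-11) to 0b11
  std::printf("old=0x%04X new=0x%04X rc=%u\n", unsigned(OldCW), unsigned(NewCW),
              unsigned((NewCW >> 10) & 3));
  return 0;
}
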
31339
31340 // xbegin
31341 case X86::XBEGIN:
31342 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
31343
31344 case X86::VASTART_SAVE_XMM_REGS:
31345 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
31346
31347 case X86::VAARG_64:
31348 return EmitVAARG64WithCustomInserter(MI, BB);
31349
31350 case X86::EH_SjLj_SetJmp32:
31351 case X86::EH_SjLj_SetJmp64:
31352 return emitEHSjLjSetJmp(MI, BB);
31353
31354 case X86::EH_SjLj_LongJmp32:
31355 case X86::EH_SjLj_LongJmp64:
31356 return emitEHSjLjLongJmp(MI, BB);
31357
31358 case X86::Int_eh_sjlj_setup_dispatch:
31359 return EmitSjLjDispatchBlock(MI, BB);
31360
31361 case TargetOpcode::STATEPOINT:
31362 // As an implementation detail, STATEPOINT shares the STACKMAP format at
31363 // this point in the process. We diverge later.
31364 return emitPatchPoint(MI, BB);
31365
31366 case TargetOpcode::STACKMAP:
31367 case TargetOpcode::PATCHPOINT:
31368 return emitPatchPoint(MI, BB);
31369
31370 case TargetOpcode::PATCHABLE_EVENT_CALL:
31371 return emitXRayCustomEvent(MI, BB);
31372
31373 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
31374 return emitXRayTypedEvent(MI, BB);
31375
31376 case X86::LCMPXCHG8B: {
31377 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
31378 // In addition to the 4 E[ABCD] registers implied by the encoding, CMPXCHG8B
31379 // requires a memory operand. If it happens that the current architecture is
31380 // i686 and the current function needs a base pointer
31381 // - which is ESI for i686 - the register allocator would not be able to
31382 // allocate registers for an address of the form X(%reg, %reg, Y)
31383 // - there would never be enough unreserved registers during regalloc
31384 // (without the need for a base ptr the only option would be X(%edi, %esi, Y)).
31385 // We give the register allocator a hand by precomputing the address in
31386 // a new vreg using LEA.
31387
31388 // If it is not i686 or there is no base pointer - nothing to do here.
31389 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
31390 return BB;
31391
31392 // Even though this code does not necessarily need the base pointer to
31393 // be ESI, we check for that. The reason: if this assert fails, something
31394 // has changed in the compiler's base pointer handling, which most
31395 // probably has to be addressed somehow here.
31396 assert(TRI->getBaseRegister() == X86::ESI &&
31397        "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
31398        "base pointer in mind");
31399
31400 MachineRegisterInfo &MRI = MF->getRegInfo();
31401 MVT SPTy = getPointerTy(MF->getDataLayout());
31402 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
31403 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
31404
31405 X86AddressMode AM = getAddressFromInstr(&MI, 0);
31406 // Regalloc does not need any help when the memory operand of CMPXCHG8B
31407 // does not use index register.
31408 if (AM.IndexReg == X86::NoRegister)
31409 return BB;
31410
31411 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
31412 // four operand definitions that are E[ABCD] registers. We skip them and
31413 // then insert the LEA.
31414 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
31415 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
31416 RMBBI->definesRegister(X86::EBX) ||
31417 RMBBI->definesRegister(X86::ECX) ||
31418 RMBBI->definesRegister(X86::EDX))) {
31419 ++RMBBI;
31420 }
31421 MachineBasicBlock::iterator MBBI(RMBBI);
31422 addFullAddress(
31423 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
31424
31425 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
31426
31427 return BB;
31428 }
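
As a rough scalar illustration (not MachineInstr code; the helper name is made up) of what the LEA above buys the register allocator: the base + index*scale + disp computation is folded into a single value up front, so the CMPXCHG8B memory operand afterwards only needs one register.

#include <cstdint>

// Collapse an x86 address-mode style computation into one register-sized value.
uint32_t leaLikeAddress(uint32_t Base, uint32_t Index, uint32_t Scale,
                        int32_t Disp) {
  return Base + Index * Scale + uint32_t(Disp);
}
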
31429 case X86::LCMPXCHG16B:
31430 return BB;
31431 case X86::LCMPXCHG8B_SAVE_EBX:
31432 case X86::LCMPXCHG16B_SAVE_RBX: {
31433 unsigned BasePtr =
31434 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
31435 if (!BB->isLiveIn(BasePtr))
31436 BB->addLiveIn(BasePtr);
31437 return BB;
31438 }
31439 }
31440}
31441
31442//===----------------------------------------------------------------------===//
31443// X86 Optimization Hooks
31444//===----------------------------------------------------------------------===//
31445
31446bool
31447X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
31448 const APInt &Demanded,
31449 TargetLoweringOpt &TLO) const {
31450 // Only optimize Ands to prevent shrinking a constant that could be
31451 // matched by movzx.
31452 if (Op.getOpcode() != ISD::AND)
31453 return false;
31454
31455 EVT VT = Op.getValueType();
31456
31457 // Ignore vectors.
31458 if (VT.isVector())
31459 return false;
31460
31461 unsigned Size = VT.getSizeInBits();
31462
31463 // Make sure the RHS really is a constant.
31464 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
31465 if (!C)
31466 return false;
31467
31468 const APInt &Mask = C->getAPIntValue();
31469
31470 // Clear all non-demanded bits initially.
31471 APInt ShrunkMask = Mask & Demanded;
31472
31473 // Find the width of the shrunk mask.
31474 unsigned Width = ShrunkMask.getActiveBits();
31475
31476 // If the mask is all 0s there's nothing to do here.
31477 if (Width == 0)
31478 return false;
31479
31480 // Find the next power of 2 width, rounding up to a byte.
31481 Width = PowerOf2Ceil(std::max(Width, 8U));
31482 // Truncate the width to size to handle illegal types.
31483 Width = std::min(Width, Size);
31484
31485 // Calculate a possible zero extend mask for this constant.
31486 APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
31487
31488 // If we aren't changing the mask, just return true to keep it and prevent
31489 // the caller from optimizing.
31490 if (ZeroExtendMask == Mask)
31491 return true;
31492
31493 // Make sure the new mask can be represented by a combination of mask bits
31494 // and non-demanded bits.
31495 if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
31496 return false;
31497
31498 // Replace the constant with the zero extend mask.
31499 SDLoc DL(Op);
31500 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
31501 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
31502 return TLO.CombineTo(Op, NewOp);
31503}
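
A standalone sketch of the widening test above, assuming a 32-bit scalar and plain uint32_t in place of APInt (helper names here are made up): the AND mask is kept or replaced only if a movzx-style low-bits mask covers it once non-demanded bits are ignored.

#include <algorithm>
#include <cstdint>

static unsigned activeBits(uint32_t V) {          // position of the highest set bit
  unsigned N = 0;
  while (V) { ++N; V >>= 1; }
  return N;
}

// Returns true if Mask may be kept or replaced by a zero-extend style mask.
bool shrinkOrKeepAndMask(uint32_t Mask, uint32_t Demanded, uint32_t &NewMask) {
  uint32_t Shrunk = Mask & Demanded;              // clear non-demanded bits
  unsigned Width = activeBits(Shrunk);
  if (Width == 0)
    return false;                                 // mask is effectively zero
  Width = std::max(Width, 8u);
  unsigned Pow2 = 8;                              // round up to a power of two
  while (Pow2 < Width) Pow2 *= 2;
  Width = std::min(Pow2, 32u);                    // cap at the type width
  uint32_t ZeroExtendMask =
      Width >= 32 ? 0xFFFFFFFFu : ((1u << Width) - 1);
  if (ZeroExtendMask == Mask) {                   // already movzx-friendly
    NewMask = Mask;
    return true;
  }
  // The new mask may only add bits that are already set or not demanded.
  if ((ZeroExtendMask & ~(Mask | ~Demanded)) != 0)
    return false;
  NewMask = ZeroExtendMask;
  return true;
}
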
31504
31505void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
31506 KnownBits &Known,
31507 const APInt &DemandedElts,
31508 const SelectionDAG &DAG,
31509 unsigned Depth) const {
31510 unsigned BitWidth = Known.getBitWidth();
31511 unsigned Opc = Op.getOpcode();
31512 EVT VT = Op.getValueType();
31513 assert((Opc >= ISD::BUILTIN_OP_END ||
31514         Opc == ISD::INTRINSIC_WO_CHAIN ||
31515         Opc == ISD::INTRINSIC_W_CHAIN ||
31516         Opc == ISD::INTRINSIC_VOID) &&
31517        "Should use MaskedValueIsZero if you don't know whether Op"
31518        " is a target node!");
31519
31520 Known.resetAll();
31521 switch (Opc) {
31522 default: break;
31523 case X86ISD::SETCC:
31524 Known.Zero.setBitsFrom(1);
31525 break;
31526 case X86ISD::MOVMSK: {
31527 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
31528 Known.Zero.setBitsFrom(NumLoBits);
31529 break;
31530 }
31531 case X86ISD::PEXTRB:
31532 case X86ISD::PEXTRW: {
31533 SDValue Src = Op.getOperand(0);
31534 EVT SrcVT = Src.getValueType();
31535 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
31536 Op.getConstantOperandVal(1));
31537 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
31538 Known = Known.zextOrTrunc(BitWidth, false);
31539 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
31540 break;
31541 }
31542 case X86ISD::VSRAI:
31543 case X86ISD::VSHLI:
31544 case X86ISD::VSRLI: {
31545 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
31546 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
31547 Known.setAllZero();
31548 break;
31549 }
31550
31551 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
31552 unsigned ShAmt = ShiftImm->getZExtValue();
31553 if (Opc == X86ISD::VSHLI) {
31554 Known.Zero <<= ShAmt;
31555 Known.One <<= ShAmt;
31556 // Low bits are known zero.
31557 Known.Zero.setLowBits(ShAmt);
31558 } else if (Opc == X86ISD::VSRLI) {
31559 Known.Zero.lshrInPlace(ShAmt);
31560 Known.One.lshrInPlace(ShAmt);
31561 // High bits are known zero.
31562 Known.Zero.setHighBits(ShAmt);
31563 } else {
31564 Known.Zero.ashrInPlace(ShAmt);
31565 Known.One.ashrInPlace(ShAmt);
31566 }
31567 }
31568 break;
31569 }
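
A plain-integer sketch of the VSHLI rule above (a hypothetical 16-bit lane with assumed starting knowledge): both the known-zero and known-one masks shift left together, and the vacated low bits become known zero.

#include <cstdint>
#include <cstdio>

int main() {
  uint16_t KnownZero = 0xFF00, KnownOne = 0x000F; // assumed starting knowledge
  unsigned ShAmt = 4;
  KnownZero = uint16_t(KnownZero << ShAmt);
  KnownOne  = uint16_t(KnownOne << ShAmt);
  KnownZero |= uint16_t((1u << ShAmt) - 1);       // low ShAmt bits now known zero
  std::printf("Zero=0x%04X One=0x%04X\n", unsigned(KnownZero), unsigned(KnownOne));
  return 0;
}
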
31570 case X86ISD::PACKUS: {
31571 // PACKUS is just a truncation if the upper half is zero.
31572 APInt DemandedLHS, DemandedRHS;
31573 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
31574
31575 Known.One = APInt::getAllOnesValue(BitWidth * 2);
31576 Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
31577
31578 KnownBits Known2;
31579 if (!!DemandedLHS) {
31580 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
31581 Known.One &= Known2.One;
31582 Known.Zero &= Known2.Zero;
31583 }
31584 if (!!DemandedRHS) {
31585 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
31586 Known.One &= Known2.One;
31587 Known.Zero &= Known2.Zero;
31588 }
31589
31590 if (Known.countMinLeadingZeros() < BitWidth)
31591 Known.resetAll();
31592 Known = Known.trunc(BitWidth);
31593 break;
31594 }
31595 case X86ISD::ANDNP: {
31596 KnownBits Known2;
31597 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
31598 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
31599
31600 // ANDNP = (~X & Y);
31601 Known.One &= Known2.Zero;
31602 Known.Zero |= Known2.One;
31603 break;
31604 }
31605 case X86ISD::FOR: {
31606 KnownBits Known2;
31607 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
31608 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
31609
31610 // Output known-0 bits are only known if clear in both the LHS & RHS.
31611 Known.Zero &= Known2.Zero;
31612 // Output known-1 bits are known to be set if set in either the LHS | RHS.
31613 Known.One |= Known2.One;
31614 break;
31615 }
31616 case X86ISD::PSADBW: {
31617 assert(VT.getScalarType() == MVT::i64 &&
31618        Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
31619        "Unexpected PSADBW types");
31620
31621 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
31622 Known.Zero.setBitsFrom(16);
31623 break;
31624 }
31625 case X86ISD::CMOV: {
31626 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
31627 // If we don't know any bits, early out.
31628 if (Known.isUnknown())
31629 break;
31630 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
31631
31632 // Only known if known in both the LHS and RHS.
31633 Known.One &= Known2.One;
31634 Known.Zero &= Known2.Zero;
31635 break;
31636 }
31637 }
31638
31639 // Handle target shuffles.
31640 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
31641 if (isTargetShuffle(Opc)) {
31642 bool IsUnary;
31643 SmallVector<int, 64> Mask;
31644 SmallVector<SDValue, 2> Ops;
31645 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
31646 IsUnary)) {
31647 unsigned NumOps = Ops.size();
31648 unsigned NumElts = VT.getVectorNumElements();
31649 if (Mask.size() == NumElts) {
31650 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
31651 Known.Zero.setAllBits(); Known.One.setAllBits();
31652 for (unsigned i = 0; i != NumElts; ++i) {
31653 if (!DemandedElts[i])
31654 continue;
31655 int M = Mask[i];
31656 if (M == SM_SentinelUndef) {
31657 // For UNDEF elements, we don't know anything about the common state
31658 // of the shuffle result.
31659 Known.resetAll();
31660 break;
31661 } else if (M == SM_SentinelZero) {
31662 Known.One.clearAllBits();
31663 continue;
31664 }
31665 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
31666        "Shuffle index out of range");
31667
31668 unsigned OpIdx = (unsigned)M / NumElts;
31669 unsigned EltIdx = (unsigned)M % NumElts;
31670 if (Ops[OpIdx].getValueType() != VT) {
31671 // TODO - handle target shuffle ops with different value types.
31672 Known.resetAll();
31673 break;
31674 }
31675 DemandedOps[OpIdx].setBit(EltIdx);
31676 }
31677 // Known bits are the values that are shared by every demanded element.
31678 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
31679 if (!DemandedOps[i])
31680 continue;
31681 KnownBits Known2 =
31682 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
31683 Known.One &= Known2.One;
31684 Known.Zero &= Known2.Zero;
31685 }
31686 }
31687 }
31688 }
31689}
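
A plain-integer sketch of the "shared by every demanded element" step above (the values are hypothetical): bits stay known only if they are known the same way for every operand the shuffle actually reads.

#include <cstdint>
#include <cstdio>

struct KnownBits32 {
  uint32_t One;
  uint32_t Zero;
};

KnownBits32 intersectKnown(KnownBits32 A, KnownBits32 B) {
  return {A.One & B.One, A.Zero & B.Zero}; // keep only agreement
}

int main() {
  KnownBits32 Op0{0x0000000F, 0xFFFF0000}; // assumed knowledge about operand 0
  KnownBits32 Op1{0x0000000D, 0xFFFFFF00}; // assumed knowledge about operand 1
  KnownBits32 Res = intersectKnown(Op0, Op1);
  std::printf("One=0x%08X Zero=0x%08X\n", Res.One, Res.Zero);
  return 0;
}
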
31690
31691unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
31692 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
31693 unsigned Depth) const {
31694 EVT VT = Op.getValueType();
31695 unsigned VTBits = VT.getScalarSizeInBits();
31696 unsigned Opcode = Op.getOpcode();
31697 switch (Opcode) {
31698 case X86ISD::SETCC_CARRY:
31699 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
31700 return VTBits;
31701
31702 case X86ISD::VTRUNC: {
31703 // TODO: Add DemandedElts support.
31704 SDValue Src = Op.getOperand(0);
31705 unsigned NumSrcBits = Src.getScalarValueSizeInBits();
31706 assert(VTBits < NumSrcBits && "Illegal truncation input type");
31707 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
31708 if (Tmp > (NumSrcBits - VTBits))
31709 return Tmp - (NumSrcBits - VTBits);
31710 return 1;
31711 }
31712
31713 case X86ISD::PACKSS: {
31714 // PACKSS is just a truncation if the sign bits extend to the packed size.
31715 APInt DemandedLHS, DemandedRHS;
31716 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
31717 DemandedRHS);
31718
31719 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
31720 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
31721 if (!!DemandedLHS)
31722 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
31723 if (!!DemandedRHS)
31724 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
31725 unsigned Tmp = std::min(Tmp0, Tmp1);
31726 if (Tmp > (SrcBits - VTBits))
31727 return Tmp - (SrcBits - VTBits);
31728 return 1;
31729 }
31730
31731 case X86ISD::VSHLI: {
31732 SDValue Src = Op.getOperand(0);
31733 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
31734 if (ShiftVal.uge(VTBits))
31735 return VTBits; // Shifted all bits out --> zero.
31736 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
31737 if (ShiftVal.uge(Tmp))
31738 return 1; // Shifted all sign bits out --> unknown.
31739 return Tmp - ShiftVal.getZExtValue();
31740 }
31741
31742 case X86ISD::VSRAI: {
31743 SDValue Src = Op.getOperand(0);
31744 APInt ShiftVal = Op.getConstantOperandAPInt(1);
31745 if (ShiftVal.uge(VTBits - 1))
31746 return VTBits; // Sign splat.
31747 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
31748 ShiftVal += Tmp;
31749 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
31750 }
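
A scalar sketch of the VSRAI rule above, assuming a single hypothetical 16-bit lane: an arithmetic right shift by N adds N duplicated sign bits, saturating at the lane width, and a shift of VTBits-1 or more is a pure sign splat.

#include <algorithm>
#include <cstdint>

unsigned numSignBits16(int16_t V) {
  unsigned N = 1;                                  // the sign bit itself
  while (N < 16 && (((V >> (15 - N)) & 1) == ((V >> 15) & 1)))
    ++N;                                           // count leading sign copies
  return N;
}

unsigned signBitsAfterVSRAI16(int16_t V, unsigned ShAmt) {
  if (ShAmt >= 15)
    return 16;                                     // sign splat
  return std::min(16u, numSignBits16(V) + ShAmt);
}
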
31751
31752 case X86ISD::PCMPGT:
31753 case X86ISD::PCMPEQ:
31754 case X86ISD::CMPP:
31755 case X86ISD::VPCOM:
31756 case X86ISD::VPCOMU:
31757 // Vector compares return zero/all-bits result values.
31758 return VTBits;
31759
31760 case X86ISD::ANDNP: {
31761 unsigned Tmp0 =
31762 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
31763 if (Tmp0 == 1) return 1; // Early out.
31764 unsigned Tmp1 =
31765 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
31766 return std::min(Tmp0, Tmp1);
31767 }
31768
31769 case X86ISD::CMOV: {
31770 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
31771 if (Tmp0 == 1) return 1; // Early out.
31772 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
31773 return std::min(Tmp0, Tmp1);
31774 }
31775 }
31776
31777 // Handle target shuffles.
31778 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
31779 if (isTargetShuffle(Opcode)) {
31780 bool IsUnary;
31781 SmallVector<int, 64> Mask;
31782 SmallVector<SDValue, 2> Ops;
31783 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
31784 IsUnary)) {
31785 unsigned NumOps = Ops.size();
31786 unsigned NumElts = VT.getVectorNumElements();
31787 if (Mask.size() == NumElts) {
31788 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
31789 for (unsigned i = 0; i != NumElts; ++i) {
31790 if (!DemandedElts[i])
31791 continue;
31792 int M = Mask[i];
31793 if (M == SM_SentinelUndef) {
31794 // For UNDEF elements, we don't know anything about the common state
31795 // of the shuffle result.
31796 return 1;
31797 } else if (M == SM_SentinelZero) {
31798 // Zero = all sign bits.
31799 continue;
31800 }
31801 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
31802        "Shuffle index out of range");
31803
31804 unsigned OpIdx = (unsigned)M / NumElts;
31805 unsigned EltIdx = (unsigned)M % NumElts;
31806 if (Ops[OpIdx].getValueType() != VT) {
31807 // TODO - handle target shuffle ops with different value types.
31808 return 1;
31809 }
31810 DemandedOps[OpIdx].setBit(EltIdx);
31811 }
31812 unsigned Tmp0 = VTBits;
31813 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
31814 if (!DemandedOps[i])
31815 continue;
31816 unsigned Tmp1 =
31817 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
31818 Tmp0 = std::min(Tmp0, Tmp1);
31819 }
31820 return Tmp0;
31821 }
31822 }
31823 }
31824
31825 // Fallback case.
31826 return 1;
31827}
31828
31829SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
31830 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
31831 return N->getOperand(0);
31832 return N;
31833}
31834
31835// Attempt to match a combined shuffle mask against supported unary shuffle
31836// instructions.
31837// TODO: Investigate sharing more of this with shuffle lowering.
31838static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
31839 bool AllowFloatDomain, bool AllowIntDomain,
31840 SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
31841 const X86Subtarget &Subtarget, unsigned &Shuffle,
31842 MVT &SrcVT, MVT &DstVT) {
31843 unsigned NumMaskElts = Mask.size();
31844 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
31845
31846 // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
31847 if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
31848 isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
31849 Shuffle = X86ISD::VZEXT_MOVL;
31850 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
31851 return true;
31852 }
31853
31854 // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
31855 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
31856 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
31857 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
31858 unsigned MaxScale = 64 / MaskEltSize;
31859 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
31860 bool MatchAny = true;
31861 bool MatchZero = true;
31862 unsigned NumDstElts = NumMaskElts / Scale;
31863 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
31864 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
31865 MatchAny = MatchZero = false;
31866 break;
31867 }
31868 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
31869 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
31870 }
31871 if (MatchAny || MatchZero) {
31872 assert(MatchZero && "Failed to match zext but matched aext?");
31873 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
31874 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
31875 MVT::getIntegerVT(MaskEltSize);
31876 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
31877
31878 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
31879 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
31880
31881 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
31882 if (SrcVT.getVectorNumElements() != NumDstElts)
31883 Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
31884
31885 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
31886 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
31887 return true;
31888 }
31889 }
31890 }
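
A standalone sketch of the extend-matching loop above, using -1 for undef and -2 as a stand-in for the zero sentinel (both are placeholders for SM_SentinelUndef/SM_SentinelZero): element i*Scale must be i, and the Scale-1 padding slots must be undef/zero for a zero-extend.

#include <cstddef>

bool matchesZeroExtend(const int *Mask, size_t NumMaskElts, unsigned Scale) {
  for (size_t i = 0; i * Scale < NumMaskElts; ++i) {
    int M = Mask[i * Scale];
    if (M != -1 && M != int(i))          // source element must line up
      return false;
    for (unsigned j = 1; j != Scale; ++j) {
      int Pad = Mask[i * Scale + j];
      if (Pad != -1 && Pad != -2)        // padding must be undef or zero
        return false;
    }
  }
  return true;
}
// e.g. {0, -2, 1, -2, 2, -2, 3, -2} with Scale == 2 matches a zero-extend.
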
31891
31892 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
31893 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
31894 isUndefOrEqual(Mask[0], 0) &&
31895 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
31896 Shuffle = X86ISD::VZEXT_MOVL;
31897 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
31898 return true;
31899 }
31900
31901 // Check if we have SSE3, which lets us use MOVDDUP etc. These
31902 // instructions are no slower than UNPCKLPD but have the option to
31903 // fold the input operand into even an unaligned memory load.
31904 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
31905 if (isTargetShuffleEquivalent(Mask, {0, 0})) {
31906 Shuffle = X86ISD::MOVDDUP;
31907 SrcVT = DstVT = MVT::v2f64;
31908 return true;
31909 }
31910 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
31911 Shuffle = X86ISD::MOVSLDUP;
31912 SrcVT = DstVT = MVT::v4f32;
31913 return true;
31914 }
31915 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
31916 Shuffle = X86ISD::MOVSHDUP;
31917 SrcVT = DstVT = MVT::v4f32;
31918 return true;
31919 }
31920 }
31921
31922 if (MaskVT.is256BitVector() && AllowFloatDomain) {
31923 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
31924 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
31925 Shuffle = X86ISD::MOVDDUP;
31926 SrcVT = DstVT = MVT::v4f64;
31927 return true;
31928 }
31929 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
31930 Shuffle = X86ISD::MOVSLDUP;
31931 SrcVT = DstVT = MVT::v8f32;
31932 return true;
31933 }
31934 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
31935 Shuffle = X86ISD::MOVSHDUP;
31936 SrcVT = DstVT = MVT::v8f32;
31937 return true;
31938 }
31939 }
31940
31941 if (MaskVT.is512BitVector() && AllowFloatDomain) {
31942 assert(Subtarget.hasAVX512() &&
31943        "AVX512 required for 512-bit vector shuffles");
31944 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
31945 Shuffle = X86ISD::MOVDDUP;
31946 SrcVT = DstVT = MVT::v8f64;
31947 return true;
31948 }
31949 if (isTargetShuffleEquivalent(
31950 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
31951 Shuffle = X86ISD::MOVSLDUP;
31952 SrcVT = DstVT = MVT::v16f32;
31953 return true;
31954 }
31955 if (isTargetShuffleEquivalent(
31956 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
31957 Shuffle = X86ISD::MOVSHDUP;
31958 SrcVT = DstVT = MVT::v16f32;
31959 return true;
31960 }
31961 }
31962
31963 return false;
31964}
31965
31966// Attempt to match a combined shuffle mask against supported unary immediate
31967// permute instructions.
31968// TODO: Investigate sharing more of this with shuffle lowering.
31969static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
31970 const APInt &Zeroable,
31971 bool AllowFloatDomain, bool AllowIntDomain,
31972 const X86Subtarget &Subtarget,
31973 unsigned &Shuffle, MVT &ShuffleVT,
31974 unsigned &PermuteImm) {
31975 unsigned NumMaskElts = Mask.size();
31976 unsigned InputSizeInBits = MaskVT.getSizeInBits();
31977 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
31978 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
31979
31980 bool ContainsZeros =
31981 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
31982
31983 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
31984 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
31985 // Check for lane crossing permutes.
31986 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
31987 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
31988 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
31989 Shuffle = X86ISD::VPERMI;
31990 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
31991 PermuteImm = getV4X86ShuffleImm(Mask);
31992 return true;
31993 }
31994 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
31995 SmallVector<int, 4> RepeatedMask;
31996 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
31997 Shuffle = X86ISD::VPERMI;
31998 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
31999 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
32000 return true;
32001 }
32002 }
32003 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
32004 // VPERMILPD can permute with a non-repeating shuffle.
32005 Shuffle = X86ISD::VPERMILPI;
32006 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
32007 PermuteImm = 0;
32008 for (int i = 0, e = Mask.size(); i != e; ++i) {
32009 int M = Mask[i];
32010 if (M == SM_SentinelUndef)
32011 continue;
32012 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
32013 PermuteImm |= (M & 1) << i;
32014 }
32015 return true;
32016 }
32017 }
32018
32019 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
32020 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
32021 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
32022 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
32023 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
32024 SmallVector<int, 4> RepeatedMask;
32025 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
32026 // Narrow the repeated mask to create 32-bit element permutes.
32027 SmallVector<int, 4> WordMask = RepeatedMask;
32028 if (MaskScalarSizeInBits == 64)
32029 scaleShuffleMask<int>(2, RepeatedMask, WordMask);
32030
32031 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
32032 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
32033 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
32034 PermuteImm = getV4X86ShuffleImm(WordMask);
32035 return true;
32036 }
32037 }
32038
32039 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
32040 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
32041 SmallVector<int, 4> RepeatedMask;
32042 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
32043 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
32044 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
32045
32046 // PSHUFLW: permute lower 4 elements only.
32047 if (isUndefOrInRange(LoMask, 0, 4) &&
32048 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
32049 Shuffle = X86ISD::PSHUFLW;
32050 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
32051 PermuteImm = getV4X86ShuffleImm(LoMask);
32052 return true;
32053 }
32054
32055 // PSHUFHW: permute upper 4 elements only.
32056 if (isUndefOrInRange(HiMask, 4, 8) &&
32057 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
32058 // Offset the HiMask so that we can create the shuffle immediate.
32059 int OffsetHiMask[4];
32060 for (int i = 0; i != 4; ++i)
32061 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
32062
32063 Shuffle = X86ISD::PSHUFHW;
32064 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
32065 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
32066 return true;
32067 }
32068 }
32069 }
32070
32071 // Attempt to match against byte/bit shifts.
32072 // FIXME: Add 512-bit support.
32073 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
32074 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
32075 int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
32076 Mask, 0, Zeroable, Subtarget);
32077 if (0 < ShiftAmt) {
32078 PermuteImm = (unsigned)ShiftAmt;
32079 return true;
32080 }
32081 }
32082
32083 return false;
32084}
32085
32086// Attempt to match a combined unary shuffle mask against supported binary
32087// shuffle instructions.
32088// TODO: Investigate sharing more of this with shuffle lowering.
32089static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
32090 bool AllowFloatDomain, bool AllowIntDomain,
32091 SDValue &V1, SDValue &V2, const SDLoc &DL,
32092 SelectionDAG &DAG, const X86Subtarget &Subtarget,
32093 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
32094 bool IsUnary) {
32095 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
32096
32097 if (MaskVT.is128BitVector()) {
32098 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
32099 V2 = V1;
32100 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
32101 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
32102 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
32103 return true;
32104 }
32105 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
32106 V2 = V1;
32107 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
32108 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
32109 return true;
32110 }
32111 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
32112 (AllowFloatDomain || !Subtarget.hasSSE41())) {
32113 std::swap(V1, V2);
32114 Shuffle = X86ISD::MOVSD;
32115 SrcVT = DstVT = MVT::v2f64;
32116 return true;
32117 }
32118 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
32119 (AllowFloatDomain || !Subtarget.hasSSE41())) {
32120 Shuffle = X86ISD::MOVSS;
32121 SrcVT = DstVT = MVT::v4f32;
32122 return true;
32123 }
32124 }
32125
32126 // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
32127 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
32128 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
32129 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
32130 if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
32131 Subtarget)) {
32132 DstVT = MaskVT;
32133 return true;
32134 }
32135 }
32136
32137 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
32138 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
32139 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
32140 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
32141 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
32142 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
32143 if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
32144 DAG, Subtarget)) {
32145 SrcVT = DstVT = MaskVT;
32146 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
32147 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
32148 return true;
32149 }
32150 }
32151
32152 return false;
32153}
32154
32155static bool matchBinaryPermuteShuffle(
32156 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
32157 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
32158 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
32159 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
32160 unsigned NumMaskElts = Mask.size();
32161 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
32162
32163 // Attempt to match against PALIGNR byte rotate.
32164 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
32165 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
32166 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
32167 if (0 < ByteRotation) {
32168 Shuffle = X86ISD::PALIGNR;
32169 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
32170 PermuteImm = ByteRotation;
32171 return true;
32172 }
32173 }
32174
32175 // Attempt to combine to X86ISD::BLENDI.
32176 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
32177 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
32178 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
32179 uint64_t BlendMask = 0;
32180 bool ForceV1Zero = false, ForceV2Zero = false;
32181 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
32182 if (matchVectorShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
32183 ForceV2Zero, BlendMask)) {
32184 if (MaskVT == MVT::v16i16) {
32185 // We can only use v16i16 PBLENDW if the lanes are repeated.
32186 SmallVector<int, 8> RepeatedMask;
32187 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
32188 RepeatedMask)) {
32189 assert(RepeatedMask.size() == 8 &&
32190        "Repeated mask size doesn't match!");
32191 PermuteImm = 0;
32192 for (int i = 0; i < 8; ++i)
32193 if (RepeatedMask[i] >= 8)
32194 PermuteImm |= 1 << i;
32195 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
32196 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
32197 Shuffle = X86ISD::BLENDI;
32198 ShuffleVT = MaskVT;
32199 return true;
32200 }
32201 } else {
32202 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
32203 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
32204 PermuteImm = (unsigned)BlendMask;
32205 Shuffle = X86ISD::BLENDI;
32206 ShuffleVT = MaskVT;
32207 return true;
32208 }
32209 }
32210 }
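
A standalone sketch of the PBLENDW immediate built in the v16i16 branch above (the 8-lane repeated mask is hypothetical): each immediate bit selects lane i from the second source whenever the mask index points into V2, i.e. is >= 8.

#include <cstdint>

uint8_t blendImmFromRepeatedMask(const int RepeatedMask[8]) {
  uint8_t Imm = 0;
  for (int i = 0; i != 8; ++i)
    if (RepeatedMask[i] >= 8)          // lane comes from the second operand
      Imm |= uint8_t(1u << i);
  return Imm;
}
// e.g. {0, 9, 2, 11, 4, 13, 6, 15} -> 0b10101010 == 0xAA.
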
32211
32212 // Attempt to combine to INSERTPS, but only if it has elements that need to
32213 // be set to zero.
32214 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
32215 MaskVT.is128BitVector() &&
32216 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }) &&
32217 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
32218 Shuffle = X86ISD::INSERTPS;
32219 ShuffleVT = MVT::v4f32;
32220 return true;
32221 }
32222
32223 // Attempt to combine to SHUFPD.
32224 if (AllowFloatDomain && EltSizeInBits == 64 &&
32225 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
32226 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
32227 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
32228 bool ForceV1Zero = false, ForceV2Zero = false;
32229 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
32230 PermuteImm, Mask, Zeroable)) {
32231 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
32232 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
32233 Shuffle = X86ISD::SHUFP;
32234 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
32235 return true;
32236 }
32237 }
32238
32239 // Attempt to combine to SHUFPS.
32240 if (AllowFloatDomain && EltSizeInBits == 32 &&
32241 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
32242 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
32243 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
32244 SmallVector<int, 4> RepeatedMask;
32245 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
32246 // Match each half of the repeated mask to determine if it is just
32247 // referencing one of the vectors, is zeroable, or is entirely undef.
32248 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
32249 int M0 = RepeatedMask[Offset];
32250 int M1 = RepeatedMask[Offset + 1];
32251
32252 if (isUndefInRange(RepeatedMask, Offset, 2)) {
32253 return DAG.getUNDEF(MaskVT);
32254 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
32255 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
32256 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
32257 return getZeroVector(MaskVT, Subtarget, DAG, DL);
32258 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
32259 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
32260 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
32261 return V1;
32262 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
32263 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
32264 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
32265 return V2;
32266 }
32267
32268 return SDValue();
32269 };
32270
32271 int ShufMask[4] = {-1, -1, -1, -1};
32272 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
32273 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
32274
32275 if (Lo && Hi) {
32276 V1 = Lo;
32277 V2 = Hi;
32278 Shuffle = X86ISD::SHUFP;
32279 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
32280 PermuteImm = getV4X86ShuffleImm(ShufMask);
32281 return true;
32282 }
32283 }
32284 }
32285
32286 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
32287 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
32288 MaskVT.is128BitVector() &&
32289 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
32290 Shuffle = X86ISD::INSERTPS;
32291 ShuffleVT = MVT::v4f32;
32292 return true;
32293 }
32294
32295 return false;
32296}
32297
32298static SDValue combineX86ShuffleChainWithExtract(
32299 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
32300 bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
32301 const X86Subtarget &Subtarget);
32302
32303/// Combine an arbitrary chain of shuffles into a single instruction if
32304/// possible.
32305///
32306/// This is the leaf of the recursive combine below. When we have found some
32307/// chain of single-use x86 shuffle instructions and accumulated the combined
32308/// shuffle mask represented by them, this will try to pattern match that mask
32309/// into either a single instruction if there is a special purpose instruction
32310/// for this operation, or into a PSHUFB instruction which is a fully general
32311/// instruction but should only be used to replace chains over a certain depth.
32312static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
32313 ArrayRef<int> BaseMask, int Depth,
32314 bool HasVariableMask,
32315 bool AllowVariableMask, SelectionDAG &DAG,
32316 const X86Subtarget &Subtarget) {
32317 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
32318 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
32319        "Unexpected number of shuffle inputs!");
32320
32321 // Find the inputs that enter the chain. Note that multiple uses are OK
32322 // here, we're not going to remove the operands we find.
32323 bool UnaryShuffle = (Inputs.size() == 1);
32324 SDValue V1 = peekThroughBitcasts(Inputs[0]);
32325 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
32326 : peekThroughBitcasts(Inputs[1]));
32327
32328 MVT VT1 = V1.getSimpleValueType();
32329 MVT VT2 = V2.getSimpleValueType();
32330 MVT RootVT = Root.getSimpleValueType();
32331 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
32332        VT2.getSizeInBits() == RootVT.getSizeInBits() &&
32333        "Vector size mismatch");
32334
32335 SDLoc DL(Root);
32336 SDValue Res;
32337
32338 unsigned NumBaseMaskElts = BaseMask.size();
32339 if (NumBaseMaskElts == 1) {
32340 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
32341 return DAG.getBitcast(RootVT, V1);
32342 }
32343
32344 unsigned RootSizeInBits = RootVT.getSizeInBits();
32345 unsigned NumRootElts = RootVT.getVectorNumElements();
32346 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
32347 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
32348 (RootVT.isFloatingPoint() && Depth >= 1) ||
32349 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
32350
32351 // Don't combine if we are an AVX512/EVEX target and the mask element size
32352 // is different from the root element size - this would prevent writemasks
32353 // from being reused.
32354 // TODO - this currently prevents all lane shuffles from occurring.
32355 // TODO - check for writemasks usage instead of always preventing combining.
32356 // TODO - attempt to narrow Mask back to writemask size.
32357 bool IsEVEXShuffle =
32358 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
32359
32360 // Attempt to match a subvector broadcast.
32361 // shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)
32362 if (UnaryShuffle &&
32363 (BaseMaskEltSizeInBits == 128 || BaseMaskEltSizeInBits == 256)) {
32364 SmallVector<int, 64> BroadcastMask(NumBaseMaskElts, 0);
32365 if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) {
32366 SDValue Src = Inputs[0];
32367 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
32368 Src.getOperand(0).isUndef() &&
32369 Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits &&
32370 MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) {
32371 return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL,
32372 Src.getValueType(),
32373 Src.getOperand(1)));
32374 }
32375 }
32376 }
32377
32378 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
32379
32380 // Handle 128-bit lane shuffles of 256-bit vectors.
32381 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
32382 // we need to use the zeroing feature.
32383 // TODO - this should support binary shuffles.
32384 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
32385 !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
32386 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
32387 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
32388 return SDValue(); // Nothing to do!
32389 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
32390 unsigned PermMask = 0;
32391 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
32392 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
32393
32394 Res = DAG.getBitcast(ShuffleVT, V1);
32395 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
32396 DAG.getUNDEF(ShuffleVT),
32397 DAG.getTargetConstant(PermMask, DL, MVT::i8));
32398 return DAG.getBitcast(RootVT, Res);
32399 }
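
A sketch of the unary VPERM2X128 immediate assembled above (lane selectors are hypothetical): the low nibble picks the 128-bit source lane for result lane 0, the high nibble for result lane 1, and 0x8 in a nibble zeroes that result lane.

#include <cstdint>

uint8_t perm2x128ImmUnary(int Lane0, int Lane1) {
  uint8_t Imm = 0;
  Imm |= uint8_t((Lane0 < 0 ? 0x8 : (Lane0 & 1)) << 0);
  Imm |= uint8_t((Lane1 < 0 ? 0x8 : (Lane1 & 1)) << 4);
  return Imm;
}
// e.g. Lane0 = 1, Lane1 = -1 (zero it) gives 0x81.
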
32400
32401 // For masks that have been widened to 128-bit elements or more,
32402 // narrow back down to 64-bit elements.
32403 SmallVector<int, 64> Mask;
32404 if (BaseMaskEltSizeInBits > 64) {
32405 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
32406 int MaskScale = BaseMaskEltSizeInBits / 64;
32407 scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
32408 } else {
32409 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
32410 }
32411
32412 unsigned NumMaskElts = Mask.size();
32413 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
32414
32415 // Determine the effective mask value type.
32416 FloatDomain &= (32 <= MaskEltSizeInBits);
32417 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
32418 : MVT::getIntegerVT(MaskEltSizeInBits);
32419 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
32420
32421 // Only allow legal mask types.
32422 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
32423 return SDValue();
32424
32425 // Attempt to match the mask against known shuffle patterns.
32426 MVT ShuffleSrcVT, ShuffleVT;
32427 unsigned Shuffle, PermuteImm;
32428
32429 // Which shuffle domains are permitted?
32430 // Permit domain crossing at higher combine depths.
32431 // TODO: Should we indicate which domain is preferred if both are allowed?
32432 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
32433 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
32434 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
32435
32436 // Determine zeroable mask elements.
32437 APInt KnownUndef, KnownZero;
32438 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
32439 APInt Zeroable = KnownUndef | KnownZero;
32440
32441 if (UnaryShuffle) {
32442 // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
32443 // directly if we don't shuffle the lower element and we shuffle the upper
32444 // (zero) elements within themselves.
32445 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
32446 (cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() %
32447 MaskEltSizeInBits) == 0) {
32448 unsigned Scale =
32449 cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() /
32450 MaskEltSizeInBits;
32451 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
32452 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
32453 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
32454 return DAG.getBitcast(RootVT, V1);
32455 }
32456 }
32457
32458 // Attempt to match against broadcast-from-vector.
32459 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
32460 if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits))
32461 && (!IsEVEXShuffle || NumRootElts == NumMaskElts)) {
32462 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
32463 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
32464 if (V1.getValueType() == MaskVT &&
32465 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
32466 MayFoldLoad(V1.getOperand(0))) {
32467 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
32468 return SDValue(); // Nothing to do!
32469 Res = V1.getOperand(0);
32470 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
32471 return DAG.getBitcast(RootVT, Res);
32472 }
32473 if (Subtarget.hasAVX2()) {
32474 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
32475 return SDValue(); // Nothing to do!
32476 Res = DAG.getBitcast(MaskVT, V1);
32477 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
32478 return DAG.getBitcast(RootVT, Res);
32479 }
32480 }
32481 }
32482
32483 SDValue NewV1 = V1; // Save operand in case early exit happens.
32484 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
32485 DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
32486 ShuffleVT) &&
32487 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
32488 if (Depth == 0 && Root.getOpcode() == Shuffle)
32489 return SDValue(); // Nothing to do!
32490 Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
32491 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
32492 return DAG.getBitcast(RootVT, Res);
32493 }
32494
32495 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
32496 AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
32497 PermuteImm) &&
32498 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
32499 if (Depth == 0 && Root.getOpcode() == Shuffle)
32500 return SDValue(); // Nothing to do!
32501 Res = DAG.getBitcast(ShuffleVT, V1);
32502 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
32503 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
32504 return DAG.getBitcast(RootVT, Res);
32505 }
32506 }
32507
32508 SDValue NewV1 = V1; // Save operands in case early exit happens.
32509 SDValue NewV2 = V2;
32510 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
32511 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
32512 ShuffleVT, UnaryShuffle) &&
32513 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
32514 if (Depth == 0 && Root.getOpcode() == Shuffle)
32515 return SDValue(); // Nothing to do!
32516 NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
32517 NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
32518 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
32519 return DAG.getBitcast(RootVT, Res);
32520 }
32521
32522 NewV1 = V1; // Save operands in case early exit happens.
32523 NewV2 = V2;
32524 if (matchBinaryPermuteShuffle(
32525 MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
32526 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
32527 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
32528 if (Depth == 0 && Root.getOpcode() == Shuffle)
32529 return SDValue(); // Nothing to do!
32530 NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
32531 NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
32532 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
32533 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
32534 return DAG.getBitcast(RootVT, Res);
32535 }
32536
32537 // Typically from here on, we need an integer version of MaskVT.
32538 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
32539 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
32540
32541 // Annoyingly, SSE4A instructions don't map into the above match helpers.
32542 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
32543 uint64_t BitLen, BitIdx;
32544 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
32545 Zeroable)) {
32546 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
32547 return SDValue(); // Nothing to do!
32548 V1 = DAG.getBitcast(IntMaskVT, V1);
32549 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
32550 DAG.getTargetConstant(BitLen, DL, MVT::i8),
32551 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
32552 return DAG.getBitcast(RootVT, Res);
32553 }
32554
32555 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
32556 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
32557 return SDValue(); // Nothing to do!
32558 V1 = DAG.getBitcast(IntMaskVT, V1);
32559 V2 = DAG.getBitcast(IntMaskVT, V2);
32560 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
32561 DAG.getTargetConstant(BitLen, DL, MVT::i8),
32562 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
32563 return DAG.getBitcast(RootVT, Res);
32564 }
32565 }
32566
32567 // Don't try to re-form single instruction chains under any circumstances now
32568 // that we've done encoding canonicalization for them.
32569 if (Depth < 1)
32570 return SDValue();
32571
32572 // Depth threshold above which we can efficiently use variable mask shuffles.
32573 int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
32574 AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
32575
32576 bool MaskContainsZeros =
32577 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
32578
32579 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
32580 // If we have a single input lane-crossing shuffle then lower to VPERMV.
32581 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
32582 ((Subtarget.hasAVX2() &&
32583 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
32584 (Subtarget.hasAVX512() &&
32585 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
32586 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
32587 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
32588 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
32589 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
32590 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
32591 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
32592 Res = DAG.getBitcast(MaskVT, V1);
32593 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
32594 return DAG.getBitcast(RootVT, Res);
32595 }
32596
32597 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
32598 // vector as the second source.
32599 if (UnaryShuffle && AllowVariableMask &&
32600 ((Subtarget.hasAVX512() &&
32601 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
32602 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
32603 (Subtarget.hasVLX() &&
32604 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
32605 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
32606 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
32607 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
32608 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
32609 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
32610 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
32611 for (unsigned i = 0; i != NumMaskElts; ++i)
32612 if (Mask[i] == SM_SentinelZero)
32613 Mask[i] = NumMaskElts + i;
32614
32615 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
32616 Res = DAG.getBitcast(MaskVT, V1);
32617 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
32618 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
32619 return DAG.getBitcast(RootVT, Res);
32620 }
32621
32622 // If that failed and either input is extracted then try to combine as a
32623 // shuffle with the larger type.
32624 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
32625 Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
32626 DAG, Subtarget))
32627 return WideShuffle;
32628
32629 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
32630 if (AllowVariableMask && !MaskContainsZeros &&
32631 ((Subtarget.hasAVX512() &&
32632 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
32633 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
32634 (Subtarget.hasVLX() &&
32635 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
32636 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
32637 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
32638 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
32639 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
32640 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
32641 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
32642 V1 = DAG.getBitcast(MaskVT, V1);
32643 V2 = DAG.getBitcast(MaskVT, V2);
32644 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
32645 return DAG.getBitcast(RootVT, Res);
32646 }
32647 return SDValue();
32648 }
32649
32650 // See if we can combine a single input shuffle with zeros to a bit-mask,
32651 // which is much simpler than any shuffle.
32652 if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
32653 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
32654 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
32655 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
32656 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
32657 APInt UndefElts(NumMaskElts, 0);
32658 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
32659 for (unsigned i = 0; i != NumMaskElts; ++i) {
32660 int M = Mask[i];
32661 if (M == SM_SentinelUndef) {
32662 UndefElts.setBit(i);
32663 continue;
32664 }
32665 if (M == SM_SentinelZero)
32666 continue;
32667 EltBits[i] = AllOnes;
32668 }
32669 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
32670 Res = DAG.getBitcast(MaskVT, V1);
32671 unsigned AndOpcode =
32672 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
32673 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
32674 return DAG.getBitcast(RootVT, Res);
32675 }
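Note: a standalone sketch of the bit-mask construction above; the sentinel values (-1 undef, -2 zero) and the helper name are stand-ins for SM_SentinelUndef/SM_SentinelZero and getConstVector.

// Sketch: turn an "identity or zero" shuffle mask into a per-element AND mask.
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint64_t> buildAndMask(const std::vector<int> &Mask) {
  std::vector<uint64_t> Bits(Mask.size(), 0);
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] == -1)
      continue;        // undef: value irrelevant, leave 0 in this sketch
    if (Mask[i] == -2)
      continue;        // zeroed lane: AND with 0
    Bits[i] = ~0ULL;   // kept lane: AND with all-ones
  }
  return Bits;
}

int main() {
  // Mask <0, zero, 2, undef> -> AND with <~0, 0, ~0, 0>.
  std::vector<uint64_t> M = buildAndMask({0, -2, 2, -1});
  assert(M[0] == ~0ULL && M[1] == 0 && M[2] == ~0ULL && M[3] == 0);
  return 0;
}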
32676
32677 // If we have a single input shuffle with different shuffle patterns in the
32678 // 128-bit lanes, use the variable mask to VPERMILPS.
32679 // TODO Combine other mask types at higher depths.
32680 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
32681 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
32682 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
32683 SmallVector<SDValue, 16> VPermIdx;
32684 for (int M : Mask) {
32685 SDValue Idx =
32686 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
32687 VPermIdx.push_back(Idx);
32688 }
32689 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
32690 Res = DAG.getBitcast(MaskVT, V1);
32691 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
32692 return DAG.getBitcast(RootVT, Res);
32693 }
32694
32695 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
32696 // to VPERMIL2PD/VPERMIL2PS.
32697 if (AllowVariableMask && Subtarget.hasXOP() &&
32698 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
32699 MaskVT == MVT::v8f32)) {
32700 // VPERMIL2 Operation.
32701 // Bits[3] - Match Bit.
32702 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
32703 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
32704 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
32705 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
32706 SmallVector<int, 8> VPerm2Idx;
32707 unsigned M2ZImm = 0;
32708 for (int M : Mask) {
32709 if (M == SM_SentinelUndef) {
32710 VPerm2Idx.push_back(-1);
32711 continue;
32712 }
32713 if (M == SM_SentinelZero) {
32714 M2ZImm = 2;
32715 VPerm2Idx.push_back(8);
32716 continue;
32717 }
32718 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
32719 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
32720 VPerm2Idx.push_back(Index);
32721 }
32722 V1 = DAG.getBitcast(MaskVT, V1);
32723 V2 = DAG.getBitcast(MaskVT, V2);
32724 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
32725 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
32726 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
32727 return DAG.getBitcast(RootVT, Res);
32728 }
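Note: a standalone sketch of how each VPERMIL2 selector and the M2Z immediate are derived from the combined mask, under the assumptions spelled out in the comments; struct and helper names are hypothetical.

// Sketch of the per-element VPERMIL2 selector computation used above.
// Assumptions: indices >= NumMaskElts come from the second source, and for
// 64-bit elements the selector is shifted left by one.
#include <cassert>
#include <vector>

struct VPerm2Result {
  std::vector<int> Idx;
  unsigned M2ZImm = 0;
};

static VPerm2Result buildVPerm2(const std::vector<int> &Mask,
                                int NumEltsPerLane, bool Is64Bit) {
  VPerm2Result R;
  const int NumMaskElts = (int)Mask.size();
  for (int M : Mask) {
    if (M == -1) { R.Idx.push_back(-1); continue; }              // undef
    if (M == -2) { R.M2ZImm = 2; R.Idx.push_back(8); continue; } // zero
    int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
    R.Idx.push_back(Is64Bit ? (Index << 1) : Index);
  }
  return R;
}

int main() {
  // v4f64 mask <0, 5, zero, 3>: element 1 reads from the second source.
  VPerm2Result R = buildVPerm2({0, 5, -2, 3}, /*NumEltsPerLane=*/2, /*Is64Bit=*/true);
  assert(R.M2ZImm == 2 && R.Idx[2] == 8);
  assert(R.Idx[0] == 0 && R.Idx[1] == 6 && R.Idx[3] == 2);
  return 0;
}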
32729
32730 // If we have 3 or more shuffle instructions or a chain involving a variable
32731 // mask, we can replace them with a single PSHUFB instruction profitably.
32732 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
32733 // instructions, but in practice PSHUFB tends to be *very* fast so we're
32734 // more aggressive.
32735 if (UnaryShuffle && AllowVariableMask &&
32736 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
32737 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
32738 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
32739 SmallVector<SDValue, 16> PSHUFBMask;
32740 int NumBytes = RootVT.getSizeInBits() / 8;
32741 int Ratio = NumBytes / NumMaskElts;
32742 for (int i = 0; i < NumBytes; ++i) {
32743 int M = Mask[i / Ratio];
32744 if (M == SM_SentinelUndef) {
32745 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
32746 continue;
32747 }
32748 if (M == SM_SentinelZero) {
32749 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
32750 continue;
32751 }
32752 M = Ratio * M + i % Ratio;
32753 assert((M / 16) == (i / 16) && "Lane crossing detected");
32754 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
32755 }
32756 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
32757 Res = DAG.getBitcast(ByteVT, V1);
32758 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
32759 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
32760 return DAG.getBitcast(RootVT, Res);
32761 }
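Note: a standalone sketch of the byte-level PSHUFB mask expansion above; the helper name is hypothetical and -1/-2 stand in for the undef/zero sentinels.

// Sketch: widen each mask element to Ratio bytes; zeroed lanes become 255
// (bit 7 set), which PSHUFB treats as "write zero".
#include <cassert>
#include <vector>

static std::vector<int> buildPshufbBytes(const std::vector<int> &Mask,
                                         int NumBytes) {
  const int Ratio = NumBytes / (int)Mask.size();
  std::vector<int> Bytes;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Ratio];
    if (M == -1) { Bytes.push_back(-1); continue; }  // undef byte
    if (M == -2) { Bytes.push_back(255); continue; } // zero byte
    Bytes.push_back(Ratio * M + i % Ratio);
  }
  return Bytes;
}

int main() {
  // A v4i32 mask <2, zero, 0, undef> on a 16-byte vector.
  std::vector<int> B = buildPshufbBytes({2, -2, 0, -1}, 16);
  assert(B[0] == 8 && B[3] == 11);    // element 2 -> bytes 8..11
  assert(B[4] == 255 && B[7] == 255); // zeroed element
  assert(B[8] == 0 && B[11] == 3);    // element 0 -> bytes 0..3
  assert(B[12] == -1);                // undef element
  return 0;
}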
32762
32763 // With XOP, if we have a 128-bit binary input shuffle we can always combine
32764 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
32765 // slower than PSHUFB on targets that support both.
32766 if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
32767 // VPPERM Mask Operation
32768 // Bits[4:0] - Byte Index (0 - 31)
32769 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
32770 SmallVector<SDValue, 16> VPPERMMask;
32771 int NumBytes = 16;
32772 int Ratio = NumBytes / NumMaskElts;
32773 for (int i = 0; i < NumBytes; ++i) {
32774 int M = Mask[i / Ratio];
32775 if (M == SM_SentinelUndef) {
32776 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
32777 continue;
32778 }
32779 if (M == SM_SentinelZero) {
32780 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
32781 continue;
32782 }
32783 M = Ratio * M + i % Ratio;
32784 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
32785 }
32786 MVT ByteVT = MVT::v16i8;
32787 V1 = DAG.getBitcast(ByteVT, V1);
32788 V2 = DAG.getBitcast(ByteVT, V2);
32789 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
32790 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
32791 return DAG.getBitcast(RootVT, Res);
32792 }
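Note: a tiny standalone sketch decoding a VPPERM control byte per the bit layout in the comment above; illustrative only, names hypothetical.

// Bits [4:0]: byte index into the 32-byte concatenation of both sources.
// Bits [7:5]: permute operation (0 = copy source byte, 4 = zero).
#include <cassert>

struct VPPermCtl { unsigned ByteIdx; unsigned Op; };

static VPPermCtl decodeVPPermByte(unsigned Ctl) {
  return { Ctl & 0x1F, (Ctl >> 5) & 0x7 };
}

int main() {
  VPPermCtl Z = decodeVPPermByte(128); // the "zero this byte" value used above
  assert(Z.Op == 4);
  VPPermCtl S = decodeVPPermByte(17);  // plain copy of byte 17 (second source)
  assert(S.Op == 0 && S.ByteIdx == 17);
  return 0;
}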
32793
32794 // If that failed and either input is extracted then try to combine as a
32795 // shuffle with the larger type.
32796 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
32797 Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
32798 DAG, Subtarget))
32799 return WideShuffle;
32800
32801 // If we have a dual input shuffle then lower to VPERMV3.
32802 if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
32803 ((Subtarget.hasAVX512() &&
32804 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
32805 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
32806 (Subtarget.hasVLX() &&
32807 (MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 ||
32808 MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 ||
32809 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
32810 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
32811 (Subtarget.hasBWI() && Subtarget.hasVLX() &&
32812 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) ||
32813 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
32814 (Subtarget.hasVBMI() && Subtarget.hasVLX() &&
32815 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) {
32816 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
32817 V1 = DAG.getBitcast(MaskVT, V1);
32818 V2 = DAG.getBitcast(MaskVT, V2);
32819 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
32820 return DAG.getBitcast(RootVT, Res);
32821 }
32822
32823 // Failed to find any combines.
32824 return SDValue();
32825}
32826
32827// Combine an arbitrary chain of shuffles + extract_subvectors into a single
32828// instruction if possible.
32829//
32830// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
32831// type size to attempt to combine:
32832// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
32833// -->
32834// extract_subvector(shuffle(x,y,m2),0)
32835static SDValue combineX86ShuffleChainWithExtract(
32836 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
32837 bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
32838 const X86Subtarget &Subtarget) {
32839 unsigned NumMaskElts = BaseMask.size();
32840 unsigned NumInputs = Inputs.size();
32841 if (NumInputs == 0)
32842 return SDValue();
32843
32844 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
32845 SmallVector<unsigned, 4> Offsets(NumInputs, 0);
32846
32847 // Peek through subvectors.
32848 // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
32849 unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits();
32850 for (unsigned i = 0; i != NumInputs; ++i) {
32851 SDValue &Src = WideInputs[i];
32852 unsigned &Offset = Offsets[i];
32853 Src = peekThroughBitcasts(Src);
32854 EVT BaseVT = Src.getValueType();
32855 while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
32856 isa<ConstantSDNode>(Src.getOperand(1))) {
32857 Offset += Src.getConstantOperandVal(1);
32858 Src = Src.getOperand(0);
32859 }
32860 WideSizeInBits = std::max(WideSizeInBits, Src.getValueSizeInBits());
32861 assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
32862        "Unexpected subvector extraction");
32863 Offset /= BaseVT.getVectorNumElements();
32864 Offset *= NumMaskElts;
32865 }
32866
32867 // Bail if we're always extracting from the lowest subvectors,
32868 // combineX86ShuffleChain should match this for the current width.
32869 if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
32870 return SDValue();
32871
32872 EVT RootVT = Root.getValueType();
32873 unsigned RootSizeInBits = RootVT.getSizeInBits();
32874 unsigned Scale = WideSizeInBits / RootSizeInBits;
32875 assert((WideSizeInBits % RootSizeInBits) == 0 &&
32876        "Unexpected subvector extraction");
32877
32878 // If the src vector types aren't the same, see if we can extend
32879 // them to match each other.
32880 // TODO: Support different scalar types?
32881 EVT WideSVT = WideInputs[0].getValueType().getScalarType();
32882 if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
32883 return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
32884 Op.getValueType().getScalarType() != WideSVT;
32885 }))
32886 return SDValue();
32887
32888 for (SDValue &NewInput : WideInputs) {
32889 assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
32890        "Shuffle vector size mismatch");
32891 if (WideSizeInBits > NewInput.getValueSizeInBits())
32892 NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
32893 SDLoc(NewInput), WideSizeInBits);
32894 assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
32895        "Unexpected subvector extraction");
32896 }
32897
32898 // Create new mask for larger type.
32899 for (unsigned i = 1; i != NumInputs; ++i)
32900 Offsets[i] += i * Scale * NumMaskElts;
32901
32902 SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
32903 for (int &M : WideMask) {
32904 if (M < 0)
32905 continue;
32906 M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
32907 }
32908 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
32909
32910 // Remove unused/repeated shuffle source ops.
32911 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
32912 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
32913
32914 if (WideInputs.size() > 2)
32915 return SDValue();
32916
32917 // Increase depth for every upper subvector we've peeked through.
32918 Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
32919
32920 // Attempt to combine wider chain.
32921 // TODO: Can we use a better Root?
32922 SDValue WideRoot = WideInputs[0];
32923 if (SDValue WideShuffle = combineX86ShuffleChain(
32924 WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
32925 AllowVariableMask, DAG, Subtarget)) {
32926 WideShuffle =
32927 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
32928 return DAG.getBitcast(RootVT, WideShuffle);
32929 }
32930 return SDValue();
32931}
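Note: a standalone sketch of the mask-widening arithmetic this wrapper performs (remap each index through per-input element offsets, then pad with undef); the real code additionally peels EXTRACT_SUBVECTOR chains and widens the operands themselves. Names here are hypothetical.

// Sketch of the WideMask construction above.
#include <cassert>
#include <vector>

static std::vector<int> widenMask(std::vector<int> Mask,
                                  const std::vector<unsigned> &Offsets,
                                  unsigned Scale) {
  const int NumMaskElts = (int)Mask.size();
  for (int &M : Mask) {
    if (M < 0)
      continue; // undef/zero sentinels are kept
    M = (M % NumMaskElts) + (int)Offsets[M / NumMaskElts];
  }
  Mask.insert(Mask.end(), (Scale - 1) * NumMaskElts, -1); // pad with undef
  return Mask;
}

int main() {
  // Two 128-bit inputs extracted from the upper halves of two 256-bit vectors:
  // each starts at wide element 4, and input 1 additionally lives at wide
  // position 1 * Scale * NumMaskElts = 8, so Offsets = {4, 12}.
  std::vector<int> Wide = widenMask({0, 5, 2, 7}, {4, 12}, /*Scale=*/2);
  assert((Wide == std::vector<int>{4, 13, 6, 15, -1, -1, -1, -1}));
  return 0;
}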
32932
32933// Attempt to constant fold all of the constant source ops.
32934 // Returns the folded constant if the entire shuffle is folded to a constant.
32935// TODO: Extend this to merge multiple constant Ops and update the mask.
32936static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
32937 ArrayRef<int> Mask, SDValue Root,
32938 bool HasVariableMask,
32939 SelectionDAG &DAG,
32940 const X86Subtarget &Subtarget) {
32941 MVT VT = Root.getSimpleValueType();
32942
32943 unsigned SizeInBits = VT.getSizeInBits();
32944 unsigned NumMaskElts = Mask.size();
32945 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
32946 unsigned NumOps = Ops.size();
32947
32948 // Extract constant bits from each source op.
32949 bool OneUseConstantOp = false;
32950 SmallVector<APInt, 16> UndefEltsOps(NumOps);
32951 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
32952 for (unsigned i = 0; i != NumOps; ++i) {
32953 SDValue SrcOp = Ops[i];
32954 OneUseConstantOp |= SrcOp.hasOneUse();
32955 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
32956 RawBitsOps[i]))
32957 return SDValue();
32958 }
32959
32960 // Only fold if at least one of the constants is only used once or if
32961 // the combined shuffle has included a variable mask shuffle; this
32962 // avoids constant pool bloat.
32963 if (!OneUseConstantOp && !HasVariableMask)
32964 return SDValue();
32965
32966 // Shuffle the constant bits according to the mask.
32967 APInt UndefElts(NumMaskElts, 0);
32968 APInt ZeroElts(NumMaskElts, 0);
32969 APInt ConstantElts(NumMaskElts, 0);
32970 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
32971 APInt::getNullValue(MaskSizeInBits));
32972 for (unsigned i = 0; i != NumMaskElts; ++i) {
32973 int M = Mask[i];
32974 if (M == SM_SentinelUndef) {
32975 UndefElts.setBit(i);
32976 continue;
32977 } else if (M == SM_SentinelZero) {
32978 ZeroElts.setBit(i);
32979 continue;
32980 }
32981 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
32982
32983 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
32984 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
32985
32986 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
32987 if (SrcUndefElts[SrcMaskIdx]) {
32988 UndefElts.setBit(i);
32989 continue;
32990 }
32991
32992 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
32993 APInt &Bits = SrcEltBits[SrcMaskIdx];
32994 if (!Bits) {
32995 ZeroElts.setBit(i);
32996 continue;
32997 }
32998
32999 ConstantElts.setBit(i);
33000 ConstantBitData[i] = Bits;
33001 }
33002 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
33003
33004 // Create the constant data.
33005 MVT MaskSVT;
33006 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
33007 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
33008 else
33009 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
33010
33011 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
33012
33013 SDLoc DL(Root);
33014 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
33015 return DAG.getBitcast(VT, CstOp);
33016}
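Note: a standalone sketch of the folding step above: once the raw constant bits of every source op are known, the shuffled constant is a simple gather through the mask. The sentinels (-1 undef, -2 zero) and names are stand-ins; undef handling is simplified.

// Sketch of shuffling raw constant element bits through a mask.
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint64_t>
foldConstantShuffle(const std::vector<std::vector<uint64_t>> &OpBits,
                    const std::vector<int> &Mask) {
  const int NumElts = (int)Mask.size();
  std::vector<uint64_t> Out(NumElts, 0);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef/zero -> leave as 0 in this sketch
    Out[i] = OpBits[M / NumElts][M % NumElts];
  }
  return Out;
}

int main() {
  // Two constant v4i32 ops; mask picks <Op0[3], Op1[0], zero, Op0[1]>.
  std::vector<std::vector<uint64_t>> Ops = {{10, 11, 12, 13}, {20, 21, 22, 23}};
  std::vector<uint64_t> R = foldConstantShuffle(Ops, {3, 4, -2, 1});
  assert((R == std::vector<uint64_t>{13, 20, 0, 11}));
  return 0;
}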
33017
33018/// Fully generic combining of x86 shuffle instructions.
33019///
33020/// This should be the last combine run over the x86 shuffle instructions. Once
33021/// they have been fully optimized, this will recursively consider all chains
33022/// of single-use shuffle instructions, build a generic model of the cumulative
33023/// shuffle operation, and check for simpler instructions which implement this
33024/// operation. We use this primarily for two purposes:
33025///
33026/// 1) Collapse generic shuffles to specialized single instructions when
33027/// equivalent. In most cases, this is just an encoding size win, but
33028/// sometimes we will collapse multiple generic shuffles into a single
33029/// special-purpose shuffle.
33030/// 2) Look for sequences of shuffle instructions with 3 or more total
33031/// instructions, and replace them with the slightly more expensive SSSE3
33032/// PSHUFB instruction if available. We do this as the last combining step
33033/// to ensure we avoid using PSHUFB if we can implement the shuffle with
33034/// a suitable short sequence of other instructions. The PSHUFB will either
33035/// use a register or have to read from memory and so is slightly (but only
33036/// slightly) more expensive than the other shuffle instructions.
33037///
33038/// Because this is inherently a quadratic operation (for each shuffle in
33039/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
33040/// This should never be an issue in practice as the shuffle lowering doesn't
33041/// produce sequences of more than 8 instructions.
33042///
33043/// FIXME: We will currently miss some cases where the redundant shuffling
33044/// would simplify under the threshold for PSHUFB formation because of
33045/// combine-ordering. To fix this, we should do the redundant instruction
33046/// combining in this recursive walk.
33047static SDValue combineX86ShufflesRecursively(
33048 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
33049 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
33050 bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
33051 const X86Subtarget &Subtarget) {
33052 assert(RootMask.size() > 0 &&
33053        (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
33054        "Illegal shuffle root mask");
 1. Assuming the condition is true
 2. Assuming the condition is false
 3. Assuming the condition is true
 4. Assuming 'SrcOpIndex' is equal to 0
 5. '?' condition is true
33055
33056 // Bound the depth of our recursive combine because this is ultimately
33057 // quadratic in nature.
33058 const unsigned MaxRecursionDepth = 8;
33059 if (Depth >= MaxRecursionDepth)
 6. Assuming 'Depth' is < 'MaxRecursionDepth'
 7. Taking false branch
33060 return SDValue();
33061
33062 // Directly rip through bitcasts to find the underlying operand.
33063 SDValue Op = SrcOps[SrcOpIndex];
33064 Op = peekThroughOneUseBitcasts(Op);
33065
33066 MVT VT = Op.getSimpleValueType();
33067 if (!VT.isVector())
 8. Calling 'MVT::isVector'
 12. Returning from 'MVT::isVector'
 13. Taking false branch
33068 return SDValue(); // Bail if we hit a non-vector.
33069
33070 assert(Root.getSimpleValueType().isVector() &&
33071        "Shuffles operate on vector types!");
 14. '?' condition is true
33072 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
33073        "Can only combine shuffles of the same vector register size.");
 15. Assuming the condition is true
 16. '?' condition is true
33074
33075 // Extract target shuffle mask and resolve sentinels and inputs.
33076 // TODO - determine Op's demanded elts from RootMask.
33077 SmallVector<int, 64> OpMask;
33078 SmallVector<SDValue, 2> OpInputs;
33079 APInt OpUndef, OpZero;
33080 APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
33081 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
33082 if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
 17. Calling 'getTargetShuffleInputs'
 33. Returning from 'getTargetShuffleInputs'
 34. Taking false branch
33083 OpZero, DAG, Depth, false))
33084 return SDValue();
33085
33086 SmallVector<int, 64> Mask;
33087 SmallVector<SDValue, 16> Ops;
33088
33089 // We don't need to merge masks if the root is empty.
33090 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
 35. Assuming 'Depth' is not equal to 0
33091   if (EmptyRoot) {
 35.1. 'EmptyRoot' is false
 36. Taking false branch
33092 // Only resolve zeros if it will remove an input, otherwise we might end
33093 // up in an infinite loop.
33094 bool ResolveKnownZeros = true;
33095 if (!OpZero.isNullValue()) {
33096 APInt UsedInputs = APInt::getNullValue(OpInputs.size());
33097 for (int i = 0, e = OpMask.size(); i != e; ++i) {
33098 int M = OpMask[i];
33099 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
33100 continue;
33101 UsedInputs.setBit(M / OpMask.size());
33102 if (UsedInputs.isAllOnesValue()) {
33103 ResolveKnownZeros = false;
33104 break;
33105 }
33106 }
33107 }
33108 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
33109 ResolveKnownZeros);
33110
33111 Mask = OpMask;
33112 Ops.append(OpInputs.begin(), OpInputs.end());
33113 } else {
33114 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
33115
33116 // Add the inputs to the Ops list, avoiding duplicates.
33117 Ops.append(SrcOps.begin(), SrcOps.end());
33118
33119 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
33120 // Attempt to find an existing match.
33121 SDValue InputBC = peekThroughBitcasts(Input);
33122 for (int i = 0, e = Ops.size(); i < e; ++i)
33123 if (InputBC == peekThroughBitcasts(Ops[i]))
33124 return i;
33125 // Match failed - should we replace an existing Op?
33126 if (InsertionPoint >= 0) {
33127 Ops[InsertionPoint] = Input;
33128 return InsertionPoint;
33129 }
33130 // Add to the end of the Ops list.
33131 Ops.push_back(Input);
33132 return Ops.size() - 1;
33133 };
33134
33135 SmallVector<int, 2> OpInputIdx;
33136 for (SDValue OpInput : OpInputs)
 37. Assuming '__begin2' is equal to '__end2'
33137 OpInputIdx.push_back(
33138 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
33139
33140 assert(((RootMask.size() > OpMask.size() &&
33141          RootMask.size() % OpMask.size() == 0) ||
33142         (OpMask.size() > RootMask.size() &&
33143          OpMask.size() % RootMask.size() == 0) ||
33144         OpMask.size() == RootMask.size()) &&
33145        "The smaller number of elements must divide the larger.");
 38. Assuming the condition is true
 39. '?' condition is true
33146
33147 // This function can be performance-critical, so we rely on the power-of-2
33148 // knowledge that we have about the mask sizes to replace div/rem ops with
33149 // bit-masks and shifts.
33150 assert(isPowerOf2_32(RootMask.size()) &&
33151        "Non-power-of-2 shuffle mask sizes");
 40. '?' condition is true
33152 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
 41. '?' condition is true
33153 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
 42. Calling 'countTrailingZeros<unsigned long>'
 49. Returning from 'countTrailingZeros<unsigned long>'
 50. 'RootMaskSizeLog2' initialized to 64
33154 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
33155
33156 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
33157 unsigned RootRatio =
33158 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
 51. The result of the right shift is undefined due to shifting by '64', which is greater or equal to the width of type 'size_t'
33159 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
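Note on the warning above: llvm::countTrailingZeros with its default ZB_Width behaviour returns the full bit width (64 for size_t on this x86_64 target) when given zero, so the analyzer propagates RootMaskSizeLog2 == 64 into the shift at line 33158. The assert at line 33052 rules out an empty RootMask in practice (though not in NDEBUG builds as far as the analyzer is concerned). The following standalone sketch reproduces the hazard and shows one defensive formulation; it is illustrative only and not the upstream fix.

// Standalone illustration of the shift-by-bit-width hazard.
#include <cassert>
#include <cstddef>

// Mimics countTrailingZeros<size_t> with ZB_Width semantics (assumption).
static unsigned ctzWidth(std::size_t V) {
  if (V == 0)
    return unsigned(sizeof(std::size_t) * 8); // 64 on the analysed target
  unsigned N = 0;
  while ((V & 1) == 0) { V >>= 1; ++N; }
  return N;
}

// Ratio of two power-of-two mask sizes, written so no shift count can reach
// the bit width even if one size were (erroneously) zero.
static std::size_t safeRatio(std::size_t OpMaskSize, std::size_t RootMaskSize) {
  if (RootMaskSize == 0 || OpMaskSize <= RootMaskSize)
    return 1;
  return OpMaskSize / RootMaskSize; // division instead of >> log2
}

int main() {
  assert(ctzWidth(0) == sizeof(std::size_t) * 8); // the value the analyzer used
  assert(safeRatio(16, 4) == 4);
  assert(safeRatio(4, 16) == 1);
  assert(safeRatio(8, 0) == 1); // no UB on the degenerate input
  return 0;
}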
33160 assert((RootRatio == 1 || OpRatio == 1) &&
33161        "Must not have a ratio for both incoming and op masks!");
33162
33163 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
33164 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
33165 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
33166 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
33167 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
33168
33169 Mask.resize(MaskWidth, SM_SentinelUndef);
33170
33171 // Merge this shuffle operation's mask into our accumulated mask. Note that
33172 // this shuffle's mask will be the first applied to the input, followed by
33173 // the root mask to get us all the way to the root value arrangement. The
33174 // reason for this order is that we are recursing up the operation chain.
33175 for (unsigned i = 0; i < MaskWidth; ++i) {
33176 unsigned RootIdx = i >> RootRatioLog2;
33177 if (RootMask[RootIdx] < 0) {
33178 // This is a zero or undef lane, we're done.
33179 Mask[i] = RootMask[RootIdx];
33180 continue;
33181 }
33182
33183 unsigned RootMaskedIdx =
33184 RootRatio == 1
33185 ? RootMask[RootIdx]
33186 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
33187
33188 // Just insert the scaled root mask value if it references an input other
33189 // than the SrcOp we're currently inserting.
33190 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
33191 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
33192 Mask[i] = RootMaskedIdx;
33193 continue;
33194 }
33195
33196 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
33197 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
33198 if (OpMask[OpIdx] < 0) {
33199 // The incoming lanes are zero or undef, it doesn't matter which ones we
33200 // are using.
33201 Mask[i] = OpMask[OpIdx];
33202 continue;
33203 }
33204
33205 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
33206 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
33207 : (OpMask[OpIdx] << OpRatioLog2) +
33208 (RootMaskedIdx & (OpRatio - 1));
33209
33210 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
33211 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
33212 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
33213 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
33214
33215 Mask[i] = OpMaskedIdx;
33216 }
33217 }
33218
33219 // Remove unused/repeated shuffle source ops.
33220 resolveTargetShuffleInputsAndMask(Ops, Mask);
33221
33222 // Handle the all undef/zero cases early.
33223 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
33224 return DAG.getUNDEF(Root.getValueType());
33225
33226 // TODO - should we handle the mixed zero/undef case as well? Just returning
33227 // a zero mask will lose information on undef elements possibly reducing
33228 // future combine possibilities.
33229 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
33230 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
33231 SDLoc(Root));
33232
33233 assert(!Ops.empty() && "Shuffle with no inputs detected");
33234 HasVariableMask |= IsOpVariableMask;
33235
33236 // Update the list of shuffle nodes that have been combined so far.
33237 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
33238 SrcNodes.end());
33239 CombinedNodes.push_back(Op.getNode());
33240
33241 // See if we can recurse into each shuffle source op (if it's a target
33242 // shuffle). The source op should only be generally combined if it either has
33243 // a single use (i.e. the current Op) or all its users have already been combined;
33244 // if not, we can still combine but should prevent generation of variable
33245 // shuffles to avoid constant pool bloat.
33246 // Don't recurse if we already have more source ops than we can combine in
33247 // the remaining recursion depth.
33248 if (Ops.size() < (MaxRecursionDepth - Depth)) {
33249 for (int i = 0, e = Ops.size(); i < e; ++i) {
33250 // For empty roots, we need to resolve zeroable elements before combining
33251 // them with other shuffles.
33252 SmallVector<int, 64> ResolvedMask = Mask;
33253 if (EmptyRoot)
33254 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
33255 bool AllowVar = false;
33256 if (Ops[i].getNode()->hasOneUse() ||
33257 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
33258 AllowVar = AllowVariableMask;
33259 if (SDValue Res = combineX86ShufflesRecursively(
33260 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1,
33261 HasVariableMask, AllowVar, DAG, Subtarget))
33262 return Res;
33263 }
33264 }
33265
33266 // Attempt to constant fold all of the constant source ops.
33267 if (SDValue Cst = combineX86ShufflesConstants(
33268 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
33269 return Cst;
33270
33271 // We can only combine unary and binary shuffle mask cases.
33272 if (Ops.size() <= 2) {
33273 // Minor canonicalization of the accumulated shuffle mask to make it easier
33274 // to match below. All this does is detect masks with sequential pairs of
33275 // elements, and shrink them to the half-width mask. It does this in a loop
33276 // so it will reduce the size of the mask to the minimal width mask which
33277 // performs an equivalent shuffle.
33278 SmallVector<int, 64> WidenedMask;
33279 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
33280 Mask = std::move(WidenedMask);
33281 }
33282
33283 // Canonicalization of binary shuffle masks to improve pattern matching by
33284 // commuting the inputs.
33285 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
33286 ShuffleVectorSDNode::commuteMask(Mask);
33287 std::swap(Ops[0], Ops[1]);
33288 }
33289
33290 // Finally, try to combine into a single shuffle instruction.
33291 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
33292 AllowVariableMask, DAG, Subtarget);
33293 }
33294
33295 // If that failed and any input is extracted then try to combine as a
33296 // shuffle with the larger type.
33297 return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
33298 HasVariableMask, AllowVariableMask,
33299 DAG, Subtarget);
33300}
33301
33302/// Helper entry wrapper to combineX86ShufflesRecursively.
33303static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
33304 const X86Subtarget &Subtarget) {
33305 return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
33306 /*HasVarMask*/ false,
33307 /*AllowVarMask*/ true, DAG, Subtarget);
33308}
33309
33310/// Get the PSHUF-style mask from PSHUF node.
33311///
33312 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
33313/// PSHUF-style masks that can be reused with such instructions.
33314static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
33315 MVT VT = N.getSimpleValueType();
33316 SmallVector<int, 4> Mask;
33317 SmallVector<SDValue, 2> Ops;
33318 bool IsUnary;
33319 bool HaveMask =
33320 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
33321 (void)HaveMask;
33322 assert(HaveMask);
33323
33324 // If we have more than 128-bits, only the low 128-bits of shuffle mask
33325 // matter. Check that the upper masks are repeats and remove them.
33326 if (VT.getSizeInBits() > 128) {
33327 int LaneElts = 128 / VT.getScalarSizeInBits();
33328#ifndef NDEBUG
33329 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
33330 for (int j = 0; j < LaneElts; ++j)
33331 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
33332        "Mask doesn't repeat in high 128-bit lanes!");
33333#endif
33334 Mask.resize(LaneElts);
33335 }
33336
33337 switch (N.getOpcode()) {
33338 case X86ISD::PSHUFD:
33339 return Mask;
33340 case X86ISD::PSHUFLW:
33341 Mask.resize(4);
33342 return Mask;
33343 case X86ISD::PSHUFHW:
33344 Mask.erase(Mask.begin(), Mask.begin() + 4);
33345 for (int &M : Mask)
33346 M -= 4;
33347 return Mask;
33348 default:
33349 llvm_unreachable("No valid shuffle instruction found!")::llvm::llvm_unreachable_internal("No valid shuffle instruction found!"
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 33349)
;
33350 }
33351}
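Note: a standalone sketch of the imm8 encoding behind the v4 PSHUF-style masks this helper returns (two bits per element), assuming the standard PSHUFD/PSHUFLW/PSHUFHW layout; helper names are hypothetical.

// Sketch: decode/encode a PSHUF-style imm8 as a 4-element mask.
#include <array>
#include <cassert>

static std::array<int, 4> decodePshufImm(unsigned Imm) {
  return {int(Imm & 3), int((Imm >> 2) & 3), int((Imm >> 4) & 3),
          int((Imm >> 6) & 3)};
}

static unsigned encodePshufImm(const std::array<int, 4> &Mask) {
  return unsigned(Mask[0]) | unsigned(Mask[1]) << 2 | unsigned(Mask[2]) << 4 |
         unsigned(Mask[3]) << 6;
}

int main() {
  // 0x1B = 0b00011011 -> <3, 2, 1, 0>, the "reverse" shuffle.
  assert((decodePshufImm(0x1B) == std::array<int, 4>{3, 2, 1, 0}));
  assert(encodePshufImm({3, 2, 1, 0}) == 0x1B);
  return 0;
}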
33352
33353/// Search for a combinable shuffle across a chain ending in pshufd.
33354///
33355/// We walk up the chain and look for a combinable shuffle, skipping over
33356/// shuffles that we could hoist this shuffle's transformation past without
33357/// altering anything.
33358static SDValue
33359combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
33360 SelectionDAG &DAG) {
33361 assert(N.getOpcode() == X86ISD::PSHUFD &&
33362 "Called with something other than an x86 128-bit half shuffle!");
33363 SDLoc DL(N);
33364
33365 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
33366 // of the shuffles in the chain so that we can form a fresh chain to replace
33367 // this one.
33368 SmallVector<SDValue, 8> Chain;
33369 SDValue V = N.getOperand(0);
33370 for (; V.hasOneUse(); V = V.getOperand(0)) {
33371 switch (V.getOpcode()) {
33372 default:
33373 return SDValue(); // Nothing combined!
33374
33375 case ISD::BITCAST:
33376 // Skip bitcasts as we always know the type for the target specific
33377 // instructions.
33378 continue;
33379
33380 case X86ISD::PSHUFD:
33381 // Found another dword shuffle.
33382 break;
33383
33384 case X86ISD::PSHUFLW:
33385 // Check that the low words (being shuffled) are the identity in the
33386 // dword shuffle, and the high words are self-contained.
33387 if (Mask[0] != 0 || Mask[1] != 1 ||
33388 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
33389 return SDValue();
33390
33391 Chain.push_back(V);
33392 continue;
33393
33394 case X86ISD::PSHUFHW:
33395 // Check that the high words (being shuffled) are the identity in the
33396 // dword shuffle, and the low words are self-contained.
33397 if (Mask[2] != 2 || Mask[3] != 3 ||
33398 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
33399 return SDValue();
33400
33401 Chain.push_back(V);
33402 continue;
33403
33404 case X86ISD::UNPCKL:
33405 case X86ISD::UNPCKH:
33406 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
33407 // shuffle into a preceding word shuffle.
33408 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
33409 V.getSimpleValueType().getVectorElementType() != MVT::i16)
33410 return SDValue();
33411
33412 // Search for a half-shuffle which we can combine with.
33413 unsigned CombineOp =
33414 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
33415 if (V.getOperand(0) != V.getOperand(1) ||
33416 !V->isOnlyUserOf(V.getOperand(0).getNode()))
33417 return SDValue();
33418 Chain.push_back(V);
33419 V = V.getOperand(0);
33420 do {
33421 switch (V.getOpcode()) {
33422 default:
33423 return SDValue(); // Nothing to combine.
33424
33425 case X86ISD::PSHUFLW:
33426 case X86ISD::PSHUFHW:
33427 if (V.getOpcode() == CombineOp)
33428 break;
33429
33430 Chain.push_back(V);
33431
33432 LLVM_FALLTHROUGH;
33433 case ISD::BITCAST:
33434 V = V.getOperand(0);
33435 continue;
33436 }
33437 break;
33438 } while (V.hasOneUse());
33439 break;
33440 }
33441 // Break out of the loop if we break out of the switch.
33442 break;
33443 }
33444
33445 if (!V.hasOneUse())
33446 // We fell out of the loop without finding a viable combining instruction.
33447 return SDValue();
33448
33449 // Merge this node's mask and our incoming mask.
33450 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
33451 for (int &M : Mask)
33452 M = VMask[M];
33453 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
33454 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
33455
33456 // Rebuild the chain around this new shuffle.
33457 while (!Chain.empty()) {
33458 SDValue W = Chain.pop_back_val();
33459
33460 if (V.getValueType() != W.getOperand(0).getValueType())
33461 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
33462
33463 switch (W.getOpcode()) {
33464 default:
33465 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
33466
33467 case X86ISD::UNPCKL:
33468 case X86ISD::UNPCKH:
33469 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
33470 break;
33471
33472 case X86ISD::PSHUFD:
33473 case X86ISD::PSHUFLW:
33474 case X86ISD::PSHUFHW:
33475 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
33476 break;
33477 }
33478 }
33479 if (V.getValueType() != N.getValueType())
33480 V = DAG.getBitcast(N.getValueType(), V);
33481
33482 // Return the new chain to replace N.
33483 return V;
33484}
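// Sketch of the mask-merge step above with made-up masks: if N is a PSHUFD
// with Mask = {1, 0, 2, 3} and the combinable node V is a PSHUFD with
// VMask = {2, 3, 0, 1}, then M = VMask[M] composes the permutes into
// {3, 2, 0, 1}; a single PSHUFD with that immediate replaces the pair before
// the skipped PSHUFLW/PSHUFHW/UNPCK chain is rebuilt on top of it.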
33485
33486/// Try to combine x86 target specific shuffles.
33487static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
33488 TargetLowering::DAGCombinerInfo &DCI,
33489 const X86Subtarget &Subtarget) {
33490 SDLoc DL(N);
33491 MVT VT = N.getSimpleValueType();
33492 SmallVector<int, 4> Mask;
33493 unsigned Opcode = N.getOpcode();
33494
33495 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
33496 // single instruction.
33497 if (VT.getScalarSizeInBits() == 64 &&
33498 (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
33499 Opcode == X86ISD::UNPCKL)) {
33500 auto BC0 = peekThroughBitcasts(N.getOperand(0));
33501 auto BC1 = peekThroughBitcasts(N.getOperand(1));
33502 EVT VT0 = BC0.getValueType();
33503 EVT VT1 = BC1.getValueType();
33504 unsigned Opcode0 = BC0.getOpcode();
33505 unsigned Opcode1 = BC1.getOpcode();
33506 if (Opcode0 == Opcode1 && VT0 == VT1 &&
33507 (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
33508 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
33509 Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
33510 SDValue Lo, Hi;
33511 if (Opcode == X86ISD::MOVSD) {
33512 Lo = BC1.getOperand(0);
33513 Hi = BC0.getOperand(1);
33514 } else {
33515 Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
33516 Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
33517 }
33518 SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
33519 return DAG.getBitcast(VT, Horiz);
33520 }
33521 }
33522
33523 switch (Opcode) {
33524 case X86ISD::VBROADCAST: {
33525 SDValue Src = N.getOperand(0);
33526 SDValue BC = peekThroughBitcasts(Src);
33527 EVT SrcVT = Src.getValueType();
33528 EVT BCVT = BC.getValueType();
33529
33530 // If broadcasting from another shuffle, attempt to simplify it.
33531 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
33532 if (isTargetShuffle(BC.getOpcode()) &&
33533 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
33534 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
33535 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
33536 SM_SentinelUndef);
33537 for (unsigned i = 0; i != Scale; ++i)
33538 DemandedMask[i] = i;
33539 if (SDValue Res = combineX86ShufflesRecursively(
33540 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
33541 /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
33542 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
33543 DAG.getBitcast(SrcVT, Res));
33544 }
33545
33546 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
33547 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
33548 if (Src.getOpcode() == ISD::BITCAST &&
33549 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) {
33550 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
33551 VT.getVectorNumElements());
33552 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
33553 }
33554
33555 // Reduce broadcast source vector to lowest 128-bits.
33556 if (SrcVT.getSizeInBits() > 128)
33557 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
33558 extract128BitVector(Src, 0, DAG, DL));
33559
33560 // broadcast(scalar_to_vector(x)) -> broadcast(x).
33561 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
33562 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
33563
33564 // Share broadcast with the longest vector and extract low subvector (free).
33565 for (SDNode *User : Src->uses())
33566 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
33567 User->getValueSizeInBits(0) > VT.getSizeInBits()) {
33568 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
33569 VT.getSizeInBits());
33570 }
33571
33572 // vbroadcast(scalarload X) -> vbroadcast_load X
33573 // For float loads, extract other uses of the scalar from the broadcast.
33574 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
33575 ISD::isNormalLoad(Src.getNode())) {
33576 LoadSDNode *LN = cast<LoadSDNode>(Src);
33577 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
33578 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
33579 SDValue BcastLd =
33580 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
33581 LN->getMemoryVT(), LN->getMemOperand());
33582 // If the load value is used only by N, replace it via CombineTo N.
33583 bool NoReplaceExtract = Src.hasOneUse();
33584 DCI.CombineTo(N.getNode(), BcastLd);
33585 if (NoReplaceExtract) {
33586 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
33587 DCI.recursivelyDeleteUnusedNodes(LN);
33588 } else {
33589 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
33590 DAG.getIntPtrConstant(0, DL));
33591 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
33592 }
33593 return N; // Return N so it doesn't get rechecked!
33594 }
33595
33596 return SDValue();
33597 }
33598 case X86ISD::BLENDI: {
33599 SDValue N0 = N.getOperand(0);
33600 SDValue N1 = N.getOperand(1);
33601
33602 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
33603 // TODO: Handle MVT::v16i16 repeated blend mask.
33604 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
33605 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
33606 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
33607 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
33608 SrcVT.getScalarSizeInBits() >= 32) {
33609 unsigned BlendMask = N.getConstantOperandVal(2);
33610 unsigned Size = VT.getVectorNumElements();
33611 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
33612 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
33613 return DAG.getBitcast(
33614 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
33615 N1.getOperand(0),
33616 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
33617 }
33618 }
33619 return SDValue();
33620 }
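// Rough illustration of the narrowing above, assuming
// scaleVectorShuffleBlendMask simply repeats each blend bit Scale times: a
// v4i64 BLENDI with mask 0b0101 over bitcasts of v8i32 operands would become
// a v8i32 BLENDI with mask 0b00110011, each 64-bit lane choice expressed as
// two 32-bit lane choices.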
33621 case X86ISD::VPERMI: {
33622 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
33623 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
33624 SDValue N0 = N.getOperand(0);
33625 SDValue N1 = N.getOperand(1);
33626 unsigned EltSizeInBits = VT.getScalarSizeInBits();
33627 if (N0.getOpcode() == ISD::BITCAST &&
33628 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
33629 SDValue Src = N0.getOperand(0);
33630 EVT SrcVT = Src.getValueType();
33631 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
33632 return DAG.getBitcast(VT, Res);
33633 }
33634 return SDValue();
33635 }
33636 case X86ISD::PSHUFD:
33637 case X86ISD::PSHUFLW:
33638 case X86ISD::PSHUFHW:
33639 Mask = getPSHUFShuffleMask(N);
33640 assert(Mask.size() == 4);
33641 break;
33642 case X86ISD::MOVSD:
33643 case X86ISD::MOVSS: {
33644 SDValue N0 = N.getOperand(0);
33645 SDValue N1 = N.getOperand(1);
33646
33647 // Canonicalize scalar FPOps:
33648 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
33649 // If commutable, allow OP(N1[0], N0[0]).
33650 unsigned Opcode1 = N1.getOpcode();
33651 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
33652 Opcode1 == ISD::FDIV) {
33653 SDValue N10 = N1.getOperand(0);
33654 SDValue N11 = N1.getOperand(1);
33655 if (N10 == N0 ||
33656 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
33657 if (N10 != N0)
33658 std::swap(N10, N11);
33659 MVT SVT = VT.getVectorElementType();
33660 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
33661 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
33662 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
33663 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
33664 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
33665 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
33666 }
33667 }
33668
33669 return SDValue();
33670 }
33671 case X86ISD::INSERTPS: {
33672 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
33673 SDValue Op0 = N.getOperand(0);
33674 SDValue Op1 = N.getOperand(1);
33675 SDValue Op2 = N.getOperand(2);
33676 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
33677 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
33678 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
33679 unsigned ZeroMask = InsertPSMask & 0xF;
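// For reference, a sketch of the INSERTPS immediate as decoded above: bits
// [7:6] pick the source element of Op1, bits [5:4] pick the destination
// element in Op0, and bits [3:0] are the zero mask. e.g. InsertPSMask = 0x9D
// (0b10011101) gives SrcIdx = 2, DstIdx = 1, ZeroMask = 0b1101.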
33680
33681 // If we zero out all elements from Op0 then we don't need to reference it.
33682 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
33683 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
33684 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
33685
33686 // If we zero out the element from Op1 then we don't need to reference it.
33687 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
33688 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
33689 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
33690
33691 // Attempt to merge insertps Op1 with an inner target shuffle node.
33692 SmallVector<int, 8> TargetMask1;
33693 SmallVector<SDValue, 2> Ops1;
33694 APInt KnownUndef1, KnownZero1;
33695 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
33696 KnownZero1)) {
33697 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
33698 // Zero/UNDEF insertion - zero out element and remove dependency.
33699 InsertPSMask |= (1u << DstIdx);
33700 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
33701 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
33702 }
33703 // Update insertps mask srcidx and reference the source input directly.
33704 int M = TargetMask1[SrcIdx];
33705 assert(0 <= M && M < 8 && "Shuffle index out of range");
33706 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
33707 Op1 = Ops1[M < 4 ? 0 : 1];
33708 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
33709 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
33710 }
33711
33712 // Attempt to merge insertps Op0 with an inner target shuffle node.
33713 SmallVector<int, 8> TargetMask0;
33714 SmallVector<SDValue, 2> Ops0;
33715 APInt KnownUndef0, KnownZero0;
33716 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
33717 KnownZero0)) {
33718 bool Updated = false;
33719 bool UseInput00 = false;
33720 bool UseInput01 = false;
33721 for (int i = 0; i != 4; ++i) {
33722 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
33723 // No change if element is already zero or the inserted element.
33724 continue;
33725 } else if (KnownUndef0[i] || KnownZero0[i]) {
33726 // If the target mask is undef/zero then we must zero the element.
33727 InsertPSMask |= (1u << i);
33728 Updated = true;
33729 continue;
33730 }
33731
33732 // The input vector element must be inline.
33733 int M = TargetMask0[i];
33734 if (M != i && M != (i + 4))
33735 return SDValue();
33736
33737 // Determine which inputs of the target shuffle we're using.
33738 UseInput00 |= (0 <= M && M < 4);
33739 UseInput01 |= (4 <= M);
33740 }
33741
33742 // If we're not using both inputs of the target shuffle then use the
33743 // referenced input directly.
33744 if (UseInput00 && !UseInput01) {
33745 Updated = true;
33746 Op0 = Ops0[0];
33747 } else if (!UseInput00 && UseInput01) {
33748 Updated = true;
33749 Op0 = Ops0[1];
33750 }
33751
33752 if (Updated)
33753 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
33754 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
33755 }
33756
33757 // If we're inserting an element from a vbroadcast load, fold the
33758 // load into the X86insertps instruction. We need to convert the scalar
33759 // load to a vector and clear the source lane of the INSERTPS control.
33760 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
33761 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
33762 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
33763 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
33764 MemIntr->getBasePtr(),
33765 MemIntr->getMemOperand());
33766 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
33767 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
33768 Load),
33769 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
33770 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
33771 return Insert;
33772 }
33773 }
33774
33775 return SDValue();
33776 }
33777 default:
33778 return SDValue();
33779 }
33780
33781 // Nuke no-op shuffles that show up after combining.
33782 if (isNoopShuffleMask(Mask))
33783 return N.getOperand(0);
33784
33785 // Look for simplifications involving one or two shuffle instructions.
33786 SDValue V = N.getOperand(0);
33787 switch (N.getOpcode()) {
33788 default:
33789 break;
33790 case X86ISD::PSHUFLW:
33791 case X86ISD::PSHUFHW:
33792 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
33793
33794 // See if this reduces to a PSHUFD which is no more expensive and can
33795 // combine with more operations. Note that it has to at least flip the
33796 // dwords as otherwise it would have been removed as a no-op.
33797 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
33798 int DMask[] = {0, 1, 2, 3};
33799 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
33800 DMask[DOffset + 0] = DOffset + 1;
33801 DMask[DOffset + 1] = DOffset + 0;
33802 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
33803 V = DAG.getBitcast(DVT, V);
33804 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
33805 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
33806 return DAG.getBitcast(VT, V);
33807 }
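// For instance (sketch): a PSHUFLW with word mask {2, 3, 0, 1} swaps the two
// dwords inside the low 64 bits, which is exactly a PSHUFD with dword mask
// {1, 0, 2, 3} on the v4i32 bitcast -- the DMask built above for the PSHUFLW
// case.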
33808
33809 // Look for shuffle patterns which can be implemented as a single unpack.
33810 // FIXME: This doesn't handle the location of the PSHUFD generically, and
33811 // only works when we have a PSHUFD followed by two half-shuffles.
33812 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
33813 (V.getOpcode() == X86ISD::PSHUFLW ||
33814 V.getOpcode() == X86ISD::PSHUFHW) &&
33815 V.getOpcode() != N.getOpcode() &&
33816 V.hasOneUse()) {
33817 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
33818 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
33819 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
33820 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
33821 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
33822 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
33823 int WordMask[8];
33824 for (int i = 0; i < 4; ++i) {
33825 WordMask[i + NOffset] = Mask[i] + NOffset;
33826 WordMask[i + VOffset] = VMask[i] + VOffset;
33827 }
33828 // Map the word mask through the DWord mask.
33829 int MappedMask[8];
33830 for (int i = 0; i < 8; ++i)
33831 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
33832 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
33833 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
33834 // We can replace all three shuffles with an unpack.
33835 V = DAG.getBitcast(VT, D.getOperand(0));
33836 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
33837 : X86ISD::UNPCKH,
33838 DL, VT, V, V);
33839 }
33840 }
33841 }
33842
33843 break;
33844
33845 case X86ISD::PSHUFD:
33846 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
33847 return NewN;
33848
33849 break;
33850 }
33851
33852 return SDValue();
33853}
33854
33855/// Checks if the shuffle mask takes subsequent elements
33856/// alternately from two vectors.
33857/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
33858static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
33859
33860 int ParitySrc[2] = {-1, -1};
33861 unsigned Size = Mask.size();
33862 for (unsigned i = 0; i != Size; ++i) {
33863 int M = Mask[i];
33864 if (M < 0)
33865 continue;
33866
33867 // Make sure we are using the matching element from the input.
33868 if ((M % Size) != i)
33869 return false;
33870
33871 // Make sure we use the same input for all elements of the same parity.
33872 int Src = M / Size;
33873 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
33874 return false;
33875 ParitySrc[i % 2] = Src;
33876 }
33877
33878 // Make sure each input is used.
33879 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
33880 return false;
33881
33882 Op0Even = ParitySrc[0] == 0;
33883 return true;
33884}
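// Worked example: for Mask = <0, 5, 2, 7> with Size = 4, every position i
// reads lane i of some input (M % Size == i), the even positions all read
// input 0 and the odd positions all read input 1, so this returns true with
// Op0Even = true. A mask such as <0, 5, 6, 3> is rejected because its even
// positions would have to read from both inputs.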
33885
33886/// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
33887/// operation. If true is returned then the operands of the ADDSUB(SUBADD)
33888/// operation are written to the parameters \p Opnd0 and \p Opnd1.
33889///
33890/// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector shuffle
33891/// nodes so they are easier to match generically. We also insert dummy vector
33892/// shuffle nodes for the operands which explicitly discard the lanes that are
33893/// unused by this operation, to flow the fact that they're unused through the
33894/// rest of the combiner.
33895static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
33896 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
33897 bool &IsSubAdd) {
33898
33899 EVT VT = N->getValueType(0);
33900 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33901 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
33902 !VT.getSimpleVT().isFloatingPoint())
33903 return false;
33904
33905 // We only handle target-independent shuffles.
33906 // FIXME: It would be easy and harmless to use the target shuffle mask
33907 // extraction tool to support more.
33908 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
33909 return false;
33910
33911 SDValue V1 = N->getOperand(0);
33912 SDValue V2 = N->getOperand(1);
33913
33914 // Make sure we have an FADD and an FSUB.
33915 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
33916 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
33917 V1.getOpcode() == V2.getOpcode())
33918 return false;
33919
33920 // If there are other uses of these operations we can't fold them.
33921 if (!V1->hasOneUse() || !V2->hasOneUse())
33922 return false;
33923
33924 // Ensure that both operations have the same operands. Note that we can
33925 // commute the FADD operands.
33926 SDValue LHS, RHS;
33927 if (V1.getOpcode() == ISD::FSUB) {
33928 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
33929 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
33930 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
33931 return false;
33932 } else {
33933 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
33934 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
33935 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
33936 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
33937 return false;
33938 }
33939
33940 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
33941 bool Op0Even;
33942 if (!isAddSubOrSubAddMask(Mask, Op0Even))
33943 return false;
33944
33945 // It's a subadd if the vector in the even parity is an FADD.
33946 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
33947 : V2->getOpcode() == ISD::FADD;
33948
33949 Opnd0 = LHS;
33950 Opnd1 = RHS;
33951 return true;
33952}
33953
33954/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
33955static SDValue combineShuffleToFMAddSub(SDNode *N,
33956 const X86Subtarget &Subtarget,
33957 SelectionDAG &DAG) {
33958 // We only handle target-independent shuffles.
33959 // FIXME: It would be easy and harmless to use the target shuffle mask
33960 // extraction tool to support more.
33961 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
33962 return SDValue();
33963
33964 MVT VT = N->getSimpleValueType(0);
33965 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33966 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
33967 return SDValue();
33968
33969 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
33970 SDValue Op0 = N->getOperand(0);
33971 SDValue Op1 = N->getOperand(1);
33972 SDValue FMAdd = Op0, FMSub = Op1;
33973 if (FMSub.getOpcode() != X86ISD::FMSUB)
33974 std::swap(FMAdd, FMSub);
33975
33976 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
33977 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
33978 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
33979 FMAdd.getOperand(2) != FMSub.getOperand(2))
33980 return SDValue();
33981
33982 // Check for correct shuffle mask.
33983 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
33984 bool Op0Even;
33985 if (!isAddSubOrSubAddMask(Mask, Op0Even))
33986 return SDValue();
33987
33988 // FMAddSub takes zeroth operand from FMSub node.
33989 SDLoc DL(N);
33990 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
33991 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
33992 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
33993 FMAdd.getOperand(2));
33994}
33995
33996/// Try to combine a shuffle into a target-specific add-sub or
33997/// mul-add-sub node.
33998static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
33999 const X86Subtarget &Subtarget,
34000 SelectionDAG &DAG) {
34001 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
34002 return V;
34003
34004 SDValue Opnd0, Opnd1;
34005 bool IsSubAdd;
34006 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
34007 return SDValue();
34008
34009 MVT VT = N->getSimpleValueType(0);
34010 SDLoc DL(N);
34011
34012 // Try to generate X86ISD::FMADDSUB node here.
34013 SDValue Opnd2;
34014 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
34015 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
34016 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
34017 }
34018
34019 if (IsSubAdd)
34020 return SDValue();
34021
34022 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
34023 // the ADDSUB idiom has been successfully recognized. There are no known
34024 // X86 targets with 512-bit ADDSUB instructions!
34025 if (VT.is512BitVector())
34026 return SDValue();
34027
34028 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
34029}
34030
34031// We are looking for a shuffle where both sources are concatenated with undef
34032// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
34033// if we can express this as a single-source shuffle, that's preferable.
34034static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
34035 const X86Subtarget &Subtarget) {
34036 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
34037 return SDValue();
34038
34039 EVT VT = N->getValueType(0);
34040
34041 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
34042 if (!VT.is128BitVector() && !VT.is256BitVector())
34043 return SDValue();
34044
34045 if (VT.getVectorElementType() != MVT::i32 &&
34046 VT.getVectorElementType() != MVT::i64 &&
34047 VT.getVectorElementType() != MVT::f32 &&
34048 VT.getVectorElementType() != MVT::f64)
34049 return SDValue();
34050
34051 SDValue N0 = N->getOperand(0);
34052 SDValue N1 = N->getOperand(1);
34053
34054 // Check that both sources are concats with undef.
34055 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
34056 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
34057 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
34058 !N1.getOperand(1).isUndef())
34059 return SDValue();
34060
34061 // Construct the new shuffle mask. Elements from the first source retain their
34062 // index, but elements from the second source no longer need to skip an undef.
34063 SmallVector<int, 8> Mask;
34064 int NumElts = VT.getVectorNumElements();
34065
34066 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
34067 for (int Elt : SVOp->getMask())
34068 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
34069
34070 SDLoc DL(N);
34071 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
34072 N1.getOperand(0));
34073 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
34074}
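// Example of the remap above (illustrative operands): with
// t1 = concat_vectors(a, undef) and t2 = concat_vectors(b, undef) as v8i32
// sources, a mask of <0, 1, 2, 3, 8, 9, 10, 11> becomes <0, 1, 2, 3, 4, 5, 6, 7>
// on concat_vectors(a, b), since elements taken from the second source no
// longer skip the NumElts / 2 = 4 undef lanes.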
34075
34076/// Eliminate a redundant shuffle of a horizontal math op.
34077static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
34078 unsigned Opcode = N->getOpcode();
34079 if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
34080 if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
34081 return SDValue();
34082
34083 // For a broadcast, peek through an extract element of index 0 to find the
34084 // horizontal op: broadcast (ext_vec_elt HOp, 0)
34085 EVT VT = N->getValueType(0);
34086 if (Opcode == X86ISD::VBROADCAST) {
34087 SDValue SrcOp = N->getOperand(0);
34088 if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
34089 SrcOp.getValueType() == MVT::f64 &&
34090 SrcOp.getOperand(0).getValueType() == VT &&
34091 isNullConstant(SrcOp.getOperand(1)))
34092 N = SrcOp.getNode();
34093 }
34094
34095 SDValue HOp = N->getOperand(0);
34096 if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
34097 HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
34098 return SDValue();
34099
34100 // 128-bit horizontal math instructions are defined to operate on adjacent
34101 // lanes of each operand as:
34102 // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
34103 // ...similarly for v2f64 and v8i16.
34104 if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() &&
34105 HOp.getOperand(0) != HOp.getOperand(1))
34106 return SDValue();
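// Concretely (sketch): with identical operands, a v4f32 HADD produces
// { A[0]+A[1], A[2]+A[3], A[0]+A[1], A[2]+A[3] }, so the low and high 64-bit
// halves of the result already match and a half-replicating shuffle or
// MOVDDUP/low-element broadcast of it is redundant.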
34107
34108 // The shuffle that we are eliminating may have allowed the horizontal op to
34109 // have an undemanded (undefined) operand. Duplicate the other (defined)
34110 // operand to ensure that the results are defined across all lanes without the
34111 // shuffle.
34112 auto updateHOp = [](SDValue HorizOp, SelectionDAG &DAG) {
34113 SDValue X;
34114 if (HorizOp.getOperand(0).isUndef()) {
34115 assert(!HorizOp.getOperand(1).isUndef() && "Not expecting foldable h-op");
34116 X = HorizOp.getOperand(1);
34117 } else if (HorizOp.getOperand(1).isUndef()) {
34118 assert(!HorizOp.getOperand(0).isUndef() && "Not expecting foldable h-op");
34119 X = HorizOp.getOperand(0);
34120 } else {
34121 return HorizOp;
34122 }
34123 return DAG.getNode(HorizOp.getOpcode(), SDLoc(HorizOp),
34124 HorizOp.getValueType(), X, X);
34125 };
34126
34127 // When the operands of a horizontal math op are identical, the low half of
34128 // the result is the same as the high half. If a target shuffle is also
34129 // replicating low and high halves (and without changing the type/length of
34130 // the vector), we don't need the shuffle.
34131 if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {
34132 if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) {
34133 // movddup (hadd X, X) --> hadd X, X
34134 // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
34135 assert((HOp.getValueType() == MVT::v2f64 ||
34136 HOp.getValueType() == MVT::v4f64) && "Unexpected type for h-op");
34137 return updateHOp(HOp, DAG);
34138 }
34139 return SDValue();
34140 }
34141
34142 // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
34143 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
34144 // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
34145 // but this should be tied to whatever horizontal op matching and shuffle
34146 // canonicalization are producing.
34147 if (HOp.getValueSizeInBits() == 128 &&
34148 (isTargetShuffleEquivalent(Mask, {0, 0}) ||
34149 isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
34150 isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
34151 return updateHOp(HOp, DAG);
34152
34153 if (HOp.getValueSizeInBits() == 256 &&
34154 (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
34155 isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
34156 isTargetShuffleEquivalent(
34157 Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
34158 return updateHOp(HOp, DAG);
34159
34160 return SDValue();
34161}
34162
34163/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
34164/// low half of each source vector and does not set any high half elements in
34165/// the destination vector, narrow the shuffle to half its original size.
34166static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
34167 if (!Shuf->getValueType(0).isSimple())
34168 return SDValue();
34169 MVT VT = Shuf->getSimpleValueType(0);
34170 if (!VT.is256BitVector() && !VT.is512BitVector())
34171 return SDValue();
34172
34173 // See if we can ignore all of the high elements of the shuffle.
34174 ArrayRef<int> Mask = Shuf->getMask();
34175 if (!isUndefUpperHalf(Mask))
34176 return SDValue();
34177
34178 // Check if the shuffle mask accesses only the low half of each input vector
34179 // (half-index output is 0 or 2).
34180 int HalfIdx1, HalfIdx2;
34181 SmallVector<int, 8> HalfMask(Mask.size() / 2);
34182 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
34183 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
34184 return SDValue();
34185
34186 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
34187 // The trick is knowing that all of the insert/extract are actually free
34188 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
34189 // of narrow inputs into a narrow output, and that is always cheaper than
34190 // the wide shuffle that we started with.
34191 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
34192 Shuf->getOperand(1), HalfMask, HalfIdx1,
34193 HalfIdx2, false, DAG, /*UseConcat*/true);
34194}
34195
34196static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
34197 TargetLowering::DAGCombinerInfo &DCI,
34198 const X86Subtarget &Subtarget) {
34199 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
34200 if (SDValue V = narrowShuffle(Shuf, DAG))
34201 return V;
34202
34203 // If we have legalized the vector types, look for blends of FADD and FSUB
34204 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
34205 SDLoc dl(N);
34206 EVT VT = N->getValueType(0);
34207 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34208 if (TLI.isTypeLegal(VT)) {
34209 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
34210 return AddSub;
34211
34212 if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG))
34213 return HAddSub;
34214 }
34215
34216 // Attempt to combine into a vector load/broadcast.
34217 if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true))
34218 return LD;
34219
34220 // For AVX2, we sometimes want to combine
34221 // (vector_shuffle <mask> (concat_vectors t1, undef)
34222 // (concat_vectors t2, undef))
34223 // Into:
34224 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
34225 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
34226 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
34227 return ShufConcat;
34228
34229 if (isTargetShuffle(N->getOpcode())) {
34230 SDValue Op(N, 0);
34231 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
34232 return Shuffle;
34233
34234 // Try recursively combining arbitrary sequences of x86 shuffle
34235 // instructions into higher-order shuffles. We do this after combining
34236 // specific PSHUF instruction sequences into their minimal form so that we
34237 // can evaluate how many specialized shuffle instructions are involved in
34238 // a particular chain.
34239 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
34240 return Res;
34241
34242 // Simplify source operands based on shuffle mask.
34243 // TODO - merge this into combineX86ShufflesRecursively.
34244 APInt KnownUndef, KnownZero;
34245 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
34246 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI))
34247 return SDValue(N, 0);
34248 }
34249
34250 // Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros
34251 // in the upper 64 bits.
34252 // TODO: Can we generalize this using computeKnownBits.
34253 if (N->getOpcode() == X86ISD::VZEXT_MOVL &&
34254 (VT == MVT::v2f64 || VT == MVT::v2i64) &&
34255 N->getOperand(0).getOpcode() == ISD::BITCAST &&
34256 (N->getOperand(0).getOperand(0).getValueType() == MVT::v4f32 ||
34257 N->getOperand(0).getOperand(0).getValueType() == MVT::v4i32)) {
34258 SDValue In = N->getOperand(0).getOperand(0);
34259 switch (In.getOpcode()) {
34260 default:
34261 break;
34262 case X86ISD::CVTP2SI: case X86ISD::CVTP2UI:
34263 case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI:
34264 case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI:
34265 case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI:
34266 case X86ISD::CVTSI2P: case X86ISD::CVTUI2P:
34267 case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P:
34268 case X86ISD::VFPROUND: case X86ISD::VMFPROUND:
34269 if (In.getOperand(0).getValueType() == MVT::v2f64 ||
34270 In.getOperand(0).getValueType() == MVT::v2i64)
34271 return N->getOperand(0); // return the bitcast
34272 break;
34273 }
34274 }
34275
34276 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
34277 // insert into a zero vector. This helps get VZEXT_MOVL closer to
34278 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
34279 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
34280 if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
34281 N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR &&
34282 N->getOperand(0).hasOneUse() &&
34283 N->getOperand(0).getOperand(0).isUndef() &&
34284 isNullConstant(N->getOperand(0).getOperand(2))) {
34285 SDValue In = N->getOperand(0).getOperand(1);
34286 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In);
34287 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
34288 getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
34289 Movl, N->getOperand(0).getOperand(2));
34290 }
34291
34292 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
34293 // the load is volatile.
34294 if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
34295 ISD::isNormalLoad(N->getOperand(0).getNode())) {
34296 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
34297 if (LN->isSimple()) {
34298 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
34299 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
34300 SDValue VZLoad =
34301 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34302 VT.getVectorElementType(),
34303 LN->getPointerInfo(),
34304 LN->getAlignment(),
34305 MachineMemOperand::MOLoad);
34306 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
34307 return VZLoad;
34308 }
34309 }
34310
34311 return SDValue();
34312}
34313
34314bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
34315 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
34316 TargetLoweringOpt &TLO, unsigned Depth) const {
34317 int NumElts = DemandedElts.getBitWidth();
34318 unsigned Opc = Op.getOpcode();
34319 EVT VT = Op.getValueType();
34320
34321 // Handle special case opcodes.
34322 switch (Opc) {
34323 case X86ISD::PMULDQ:
34324 case X86ISD::PMULUDQ: {
34325 APInt LHSUndef, LHSZero;
34326 APInt RHSUndef, RHSZero;
34327 SDValue LHS = Op.getOperand(0);
34328 SDValue RHS = Op.getOperand(1);
34329 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
34330 Depth + 1))
34331 return true;
34332 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
34333 Depth + 1))
34334 return true;
34335 // Multiply by zero.
34336 KnownZero = LHSZero | RHSZero;
34337 break;
34338 }
34339 case X86ISD::VSHL:
34340 case X86ISD::VSRL:
34341 case X86ISD::VSRA: {
34342 // We only need the bottom 64-bits of the (128-bit) shift amount.
34343 SDValue Amt = Op.getOperand(1);
34344 MVT AmtVT = Amt.getSimpleValueType();
34345 assert(AmtVT.is128BitVector() && "Unexpected value type");
34346
34347 // If the shift amount is only ever reused as an SSE shift amount then we
34348 // know that only the bottom 64-bits are ever used.
34349 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
34350 unsigned UseOpc = Use->getOpcode();
34351 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
34352 UseOpc == X86ISD::VSRA) &&
34353 Use->getOperand(0) != Amt;
34354 });
34355
34356 APInt AmtUndef, AmtZero;
34357 unsigned NumAmtElts = AmtVT.getVectorNumElements();
34358 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
34359 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
34360 Depth + 1, AssumeSingleUse))
34361 return true;
34362 LLVM_FALLTHROUGH;
34363 }
34364 case X86ISD::VSHLI:
34365 case X86ISD::VSRLI:
34366 case X86ISD::VSRAI: {
34367 SDValue Src = Op.getOperand(0);
34368 APInt SrcUndef;
34369 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
34370 Depth + 1))
34371 return true;
34372 // TODO convert SrcUndef to KnownUndef.
34373 break;
34374 }
34375 case X86ISD::KSHIFTL: {
34376 SDValue Src = Op.getOperand(0);
34377 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
34378 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
34379 unsigned ShiftAmt = Amt->getZExtValue();
34380
34381 if (ShiftAmt == 0)
34382 return TLO.CombineTo(Op, Src);
34383
34384 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
34385 // single shift. We can do this if the bottom bits (which are shifted
34386 // out) are never demanded.
34387 if (Src.getOpcode() == X86ISD::KSHIFTR) {
34388 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
34389 unsigned C1 = Src.getConstantOperandVal(1);
34390 unsigned NewOpc = X86ISD::KSHIFTL;
34391 int Diff = ShiftAmt - C1;
34392 if (Diff < 0) {
34393 Diff = -Diff;
34394 NewOpc = X86ISD::KSHIFTR;
34395 }
34396
34397 SDLoc dl(Op);
34398 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
34399 return TLO.CombineTo(
34400 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
34401 }
34402 }
34403
34404 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
34405 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
34406 Depth + 1))
34407 return true;
34408
34409 KnownUndef <<= ShiftAmt;
34410 KnownZero <<= ShiftAmt;
34411 KnownZero.setLowBits(ShiftAmt);
34412 break;
34413 }
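// Sketch of the fold above with concrete amounts: for
// (kshiftl (kshiftr X, 3), 5) where the low 5 result elements are not
// demanded, Diff = 5 - 3 = 2 and the pair collapses to (kshiftl X, 2); if
// instead ShiftAmt = 2 and C1 = 5, Diff is negative and it becomes
// (kshiftr X, 3).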
34414 case X86ISD::KSHIFTR: {
34415 SDValue Src = Op.getOperand(0);
34416 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
34417 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
34418 unsigned ShiftAmt = Amt->getZExtValue();
34419
34420 if (ShiftAmt == 0)
34421 return TLO.CombineTo(Op, Src);
34422
34423 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
34424 // single shift. We can do this if the top bits (which are shifted
34425 // out) are never demanded.
34426 if (Src.getOpcode() == X86ISD::KSHIFTL) {
34427 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
34428 unsigned C1 = Src.getConstantOperandVal(1);
34429 unsigned NewOpc = X86ISD::KSHIFTR;
34430 int Diff = ShiftAmt - C1;
34431 if (Diff < 0) {
34432 Diff = -Diff;
34433 NewOpc = X86ISD::KSHIFTL;
34434 }
34435
34436 SDLoc dl(Op);
34437 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
34438 return TLO.CombineTo(
34439 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
34440 }
34441 }
34442
34443 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
34444 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
34445 Depth + 1))
34446 return true;
34447
34448 KnownUndef.lshrInPlace(ShiftAmt);
34449 KnownZero.lshrInPlace(ShiftAmt);
34450 KnownZero.setHighBits(ShiftAmt);
34451 break;
34452 }
34453 case X86ISD::CVTSI2P:
34454 case X86ISD::CVTUI2P: {
34455 SDValue Src = Op.getOperand(0);
34456 MVT SrcVT = Src.getSimpleValueType();
34457 APInt SrcUndef, SrcZero;
34458 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
34459 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
34460 Depth + 1))
34461 return true;
34462 break;
34463 }
34464 case X86ISD::PACKSS:
34465 case X86ISD::PACKUS: {
34466 SDValue N0 = Op.getOperand(0);
34467 SDValue N1 = Op.getOperand(1);
34468
34469 APInt DemandedLHS, DemandedRHS;
34470 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
34471
34472 APInt SrcUndef, SrcZero;
34473 if (SimplifyDemandedVectorElts(N0, DemandedLHS, SrcUndef, SrcZero, TLO,
34474 Depth + 1))
34475 return true;
34476 if (SimplifyDemandedVectorElts(N1, DemandedRHS, SrcUndef, SrcZero, TLO,
34477 Depth + 1))
34478 return true;
34479
34480 // Aggressively peek through ops to get at the demanded elts.
34481 // TODO - we should do this for all target/faux shuffles ops.
34482 if (!DemandedElts.isAllOnesValue()) {
34483 APInt DemandedSrcBits =
34484 APInt::getAllOnesValue(N0.getScalarValueSizeInBits());
34485 SDValue NewN0 = SimplifyMultipleUseDemandedBits(
34486 N0, DemandedSrcBits, DemandedLHS, TLO.DAG, Depth + 1);
34487 SDValue NewN1 = SimplifyMultipleUseDemandedBits(
34488 N1, DemandedSrcBits, DemandedRHS, TLO.DAG, Depth + 1);
34489 if (NewN0 || NewN1) {
34490 NewN0 = NewN0 ? NewN0 : N0;
34491 NewN1 = NewN1 ? NewN1 : N1;
34492 return TLO.CombineTo(Op,
34493 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
34494 }
34495 }
34496 break;
34497 }
34498 case X86ISD::HADD:
34499 case X86ISD::HSUB:
34500 case X86ISD::FHADD:
34501 case X86ISD::FHSUB: {
34502 APInt DemandedLHS, DemandedRHS;
34503 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
34504
34505 APInt LHSUndef, LHSZero;
34506 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef,
34507 LHSZero, TLO, Depth + 1))
34508 return true;
34509 APInt RHSUndef, RHSZero;
34510 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef,
34511 RHSZero, TLO, Depth + 1))
34512 return true;
34513 break;
34514 }
34515 case X86ISD::VTRUNC:
34516 case X86ISD::VTRUNCS:
34517 case X86ISD::VTRUNCUS: {
34518 SDValue Src = Op.getOperand(0);
34519 MVT SrcVT = Src.getSimpleValueType();
34520 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
34521 APInt SrcUndef, SrcZero;
34522 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
34523 Depth + 1))
34524 return true;
34525 KnownZero = SrcZero.zextOrTrunc(NumElts);
34526 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
34527 break;
34528 }
34529 case X86ISD::BLENDV: {
34530 APInt SelUndef, SelZero;
34531 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
34532 SelZero, TLO, Depth + 1))
34533 return true;
34534
34535 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
34536 APInt LHSUndef, LHSZero;
34537 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
34538 LHSZero, TLO, Depth + 1))
34539 return true;
34540
34541 APInt RHSUndef, RHSZero;
34542 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
34543 RHSZero, TLO, Depth + 1))
34544 return true;
34545
34546 KnownZero = LHSZero & RHSZero;
34547 KnownUndef = LHSUndef & RHSUndef;
34548 break;
34549 }
34550 case X86ISD::VBROADCAST: {
34551 SDValue Src = Op.getOperand(0);
34552 MVT SrcVT = Src.getSimpleValueType();
34553 if (!SrcVT.isVector())
34554 return false;
34555 // Don't bother broadcasting if we just need the 0'th element.
34556 if (DemandedElts == 1) {
34557 if (Src.getValueType() != VT)
34558 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
34559 SDLoc(Op));
34560 return TLO.CombineTo(Op, Src);
34561 }
34562 APInt SrcUndef, SrcZero;
34563 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
34564 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
34565 Depth + 1))
34566 return true;
34567 break;
34568 }
34569 case X86ISD::VPERMV: {
34570 SDValue Mask = Op.getOperand(0);
34571 APInt MaskUndef, MaskZero;
34572 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
34573 Depth + 1))
34574 return true;
34575 break;
34576 }
34577 case X86ISD::PSHUFB:
34578 case X86ISD::VPERMV3:
34579 case X86ISD::VPERMILPV: {
34580 SDValue Mask = Op.getOperand(1);
34581 APInt MaskUndef, MaskZero;
34582 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
34583 Depth + 1))
34584 return true;
34585 break;
34586 }
34587 case X86ISD::VPPERM:
34588 case X86ISD::VPERMIL2: {
34589 SDValue Mask = Op.getOperand(2);
34590 APInt MaskUndef, MaskZero;
34591 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
34592 Depth + 1))
34593 return true;
34594 break;
34595 }
34596 }
34597
34598 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
34599 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
34600 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
34601 if ((VT.is256BitVector() || VT.is512BitVector()) &&
34602 DemandedElts.lshr(NumElts / 2) == 0) {
34603 unsigned SizeInBits = VT.getSizeInBits();
34604 unsigned ExtSizeInBits = SizeInBits / 2;
34605
34606 // See if 512-bit ops only use the bottom 128-bits.
34607 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
34608 ExtSizeInBits = SizeInBits / 4;
34609
34610 switch (Opc) {
34611 // Zero upper elements.
34612 case X86ISD::VZEXT_MOVL: {
34613 SDLoc DL(Op);
34614 SDValue Ext0 =
34615 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
34616 SDValue ExtOp =
34617 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0);
34618 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
34619 SDValue Insert =
34620 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
34621 return TLO.CombineTo(Op, Insert);
34622 }
34623 // Subvector broadcast.
34624 case X86ISD::SUBV_BROADCAST: {
34625 SDLoc DL(Op);
34626 SDValue Src = Op.getOperand(0);
34627 if (Src.getValueSizeInBits() > ExtSizeInBits)
34628 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
34629 else if (Src.getValueSizeInBits() < ExtSizeInBits) {
34630 MVT SrcSVT = Src.getSimpleValueType().getScalarType();
34631 MVT SrcVT =
34632 MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits());
34633 Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src);
34634 }
34635 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
34636 TLO.DAG, DL, ExtSizeInBits));
34637 }
34638 // Byte shifts by immediate.
34639 case X86ISD::VSHLDQ:
34640 case X86ISD::VSRLDQ:
34641 // Shift by uniform.
34642 case X86ISD::VSHL:
34643 case X86ISD::VSRL:
34644 case X86ISD::VSRA:
34645 // Shift by immediate.
34646 case X86ISD::VSHLI:
34647 case X86ISD::VSRLI:
34648 case X86ISD::VSRAI: {
34649 SDLoc DL(Op);
34650 SDValue Ext0 =
34651 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
34652 SDValue ExtOp =
34653 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
34654 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
34655 SDValue Insert =
34656 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
34657 return TLO.CombineTo(Op, Insert);
34658 }
34659 case X86ISD::VPERMI: {
34660 // Simplify PERMPD/PERMQ to extract_subvector.
34661 // TODO: This should be done in shuffle combining.
34662 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
34663 SmallVector<int, 4> Mask;
34664 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
34665 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
34666 SDLoc DL(Op);
34667 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
34668 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
34669 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
34670 return TLO.CombineTo(Op, Insert);
34671 }
34672 }
34673 break;
34674 }
34675 // Target Shuffles.
34676 case X86ISD::PSHUFB:
34677 case X86ISD::UNPCKL:
34678 case X86ISD::UNPCKH:
34679 // Saturated Packs.
34680 case X86ISD::PACKSS:
34681 case X86ISD::PACKUS:
34682 // Horizontal Ops.
34683 case X86ISD::HADD:
34684 case X86ISD::HSUB:
34685 case X86ISD::FHADD:
34686 case X86ISD::FHSUB: {
34687 SDLoc DL(Op);
34688 MVT ExtVT = VT.getSimpleVT();
34689 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
34690 ExtSizeInBits / ExtVT.getScalarSizeInBits());
34691 SDValue Ext0 =
34692 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
34693 SDValue Ext1 =
34694 extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits);
34695 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1);
34696 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
34697 SDValue Insert =
34698 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
34699 return TLO.CombineTo(Op, Insert);
34700 }
34701 }
34702 }
34703
34704 // Get target/faux shuffle mask.
34705 APInt OpUndef, OpZero;
34706 SmallVector<int, 64> OpMask;
34707 SmallVector<SDValue, 2> OpInputs;
34708 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
34709 OpZero, TLO.DAG, Depth, false))
34710 return false;
34711
34712 // Shuffle inputs must be the same size as the result.
34713 if (OpMask.size() != (unsigned)NumElts ||
34714 llvm::any_of(OpInputs, [VT](SDValue V) {
34715 return VT.getSizeInBits() != V.getValueSizeInBits() ||
34716 !V.getValueType().isVector();
34717 }))
34718 return false;
34719
34720 KnownZero = OpZero;
34721 KnownUndef = OpUndef;
34722
34723 // Check if shuffle mask can be simplified to undef/zero/identity.
34724 int NumSrcs = OpInputs.size();
34725 for (int i = 0; i != NumElts; ++i)
34726 if (!DemandedElts[i])
34727 OpMask[i] = SM_SentinelUndef;
34728
34729 if (isUndefInRange(OpMask, 0, NumElts)) {
34730 KnownUndef.setAllBits();
34731 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
34732 }
34733 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
34734 KnownZero.setAllBits();
34735 return TLO.CombineTo(
34736 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
34737 }
34738 for (int Src = 0; Src != NumSrcs; ++Src)
34739 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
34740 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
34741
34742 // Attempt to simplify inputs.
34743 for (int Src = 0; Src != NumSrcs; ++Src) {
34744 // TODO: Support inputs of different types.
34745 if (OpInputs[Src].getValueType() != VT)
34746 continue;
34747
34748 int Lo = Src * NumElts;
34749 APInt SrcElts = APInt::getNullValue(NumElts);
34750 for (int i = 0; i != NumElts; ++i)
34751 if (DemandedElts[i]) {
34752 int M = OpMask[i] - Lo;
34753 if (0 <= M && M < NumElts)
34754 SrcElts.setBit(M);
34755 }
34756
34757 // TODO - Propagate input undef/zero elts.
34758 APInt SrcUndef, SrcZero;
34759 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
34760 TLO, Depth + 1))
34761 return true;
34762 }
34763
34764 // If we don't demand all elements, then attempt to combine to a simpler
34765 // shuffle.
34766 // TODO: Handle other depths, but first we need to handle the fact that
34767 // it might combine to the same shuffle.
34768 if (!DemandedElts.isAllOnesValue() && Depth == 0) {
34769 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
34770 for (int i = 0; i != NumElts; ++i)
34771 if (DemandedElts[i])
34772 DemandedMask[i] = i;
34773
34774 SDValue NewShuffle = combineX86ShufflesRecursively(
34775 {Op}, 0, Op, DemandedMask, {}, Depth, /*HasVarMask*/ false,
34776 /*AllowVarMask*/ true, TLO.DAG, Subtarget);
34777 if (NewShuffle)
34778 return TLO.CombineTo(Op, NewShuffle);
34779 }
34780
34781 return false;
34782}
34783
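The 256/512-bit narrowing block in the function above keys off DemandedElts.lshr(NumElts / 2) == 0. A minimal standalone sketch of that guard, assuming NumElts <= 64 so the mask fits in a uint64_t (illustration only, not part of X86ISelLowering.cpp):

#include <cassert>
#include <cstdint>

// Scalar stand-in for DemandedElts.lshr(NumElts / 2) == 0: no demanded element
// lives in the upper half, so the op can run at half width and be reinserted
// at element 0. NumElts / 2 is at most 32 here, so the shift stays defined.
static bool canNarrowToLowHalf(uint64_t DemandedElts, unsigned NumElts) {
  return (DemandedElts >> (NumElts / 2)) == 0;
}

int main() {
  assert(canNarrowToLowHalf(0x0F, 8));  // v8i32 op, only the low 128 bits used
  assert(!canNarrowToLowHalf(0x10, 8)); // element 4 lives in the upper half
  return 0;
}
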
34784bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
34785 SDValue Op, const APInt &OriginalDemandedBits,
34786 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
34787 unsigned Depth) const {
34788 EVT VT = Op.getValueType();
34789 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
34790 unsigned Opc = Op.getOpcode();
34791 switch(Opc) {
34792 case X86ISD::PMULDQ:
34793 case X86ISD::PMULUDQ: {
34794 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
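// Example of why only the low 32 bits matter: PMULUDQ forms
// (uint64_t)(uint32_t)LHS[i] * (uint32_t)RHS[i], so a lane holding
// 0xFFFFFFFF00000002 multiplied by a lane holding 3 still produces 6; the
// upper 32 input bits never reach the product.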
34795 KnownBits KnownOp;
34796 SDValue LHS = Op.getOperand(0);
34797 SDValue RHS = Op.getOperand(1);
34798 // FIXME: Can we bound this better?
34799 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
34800 if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
34801 TLO, Depth + 1))
34802 return true;
34803 if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
34804 TLO, Depth + 1))
34805 return true;
34806
34807 // Aggressively peek through ops to get at the demanded low bits.
34808 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
34809 LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
34810 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
34811 RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
34812 if (DemandedLHS || DemandedRHS) {
34813 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
34814 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
34815 return TLO.CombineTo(
34816 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
34817 }
34818 break;
34819 }
34820 case X86ISD::VSHLI: {
34821 SDValue Op0 = Op.getOperand(0);
34822 SDValue Op1 = Op.getOperand(1);
34823
34824 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
34825 if (ShiftImm->getAPIntValue().uge(BitWidth))
34826 break;
34827
34828 unsigned ShAmt = ShiftImm->getZExtValue();
34829 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
34830
34831 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
34832 // single shift. We can do this if the bottom bits (which are shifted
34833 // out) are never demanded.
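// For example, with the low five result bits not demanded,
// ((X >>u 3) << 5) can be rewritten as (X << 2): the three bits shifted back
// in at the bottom land entirely inside the ignored low bits.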
34834 if (Op0.getOpcode() == X86ISD::VSRLI &&
34835 OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
34836 if (auto *Shift2Imm = dyn_cast<ConstantSDNode>(Op0.getOperand(1))) {
34837 if (Shift2Imm->getAPIntValue().ult(BitWidth)) {
34838 int Diff = ShAmt - Shift2Imm->getZExtValue();
34839 if (Diff == 0)
34840 return TLO.CombineTo(Op, Op0.getOperand(0));
34841
34842 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
34843 SDValue NewShift = TLO.DAG.getNode(
34844 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
34845 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
34846 return TLO.CombineTo(Op, NewShift);
34847 }
34848 }
34849 }
34850
34851 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
34852 TLO, Depth + 1))
34853 return true;
34854
34855 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
34856 Known.Zero <<= ShAmt;
34857 Known.One <<= ShAmt;
34858
34859 // Low bits known zero.
34860 Known.Zero.setLowBits(ShAmt);
34861 }
34862 break;
34863 }
34864 case X86ISD::VSRLI: {
34865 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
34866 if (ShiftImm->getAPIntValue().uge(BitWidth))
34867 break;
34868
34869 unsigned ShAmt = ShiftImm->getZExtValue();
34870 APInt DemandedMask = OriginalDemandedBits << ShAmt;
34871
34872 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
34873 OriginalDemandedElts, Known, TLO, Depth + 1))
34874 return true;
34875
34876 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
34877 Known.Zero.lshrInPlace(ShAmt);
34878 Known.One.lshrInPlace(ShAmt);
34879
34880 // High bits known zero.
34881 Known.Zero.setHighBits(ShAmt);
34882 }
34883 break;
34884 }
34885 case X86ISD::VSRAI: {
34886 SDValue Op0 = Op.getOperand(0);
34887 SDValue Op1 = Op.getOperand(1);
34888
34889 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
34890 if (ShiftImm->getAPIntValue().uge(BitWidth))
34891 break;
34892
34893 unsigned ShAmt = ShiftImm->getZExtValue();
34894 APInt DemandedMask = OriginalDemandedBits << ShAmt;
34895
34896 // If we just want the sign bit then we don't need to shift it.
34897 if (OriginalDemandedBits.isSignMask())
34898 return TLO.CombineTo(Op, Op0);
34899
34900 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
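// e.g. for 16-bit lanes, ((X << 8) >>s 8) == X whenever X already has at
// least 9 sign bits, i.e. its value is a sign-extended i8.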
34901 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
34902 SDValue Op00 = Op0.getOperand(0);
34903 unsigned NumSignBits =
34904 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
34905 if (ShAmt < NumSignBits)
34906 return TLO.CombineTo(Op, Op00);
34907 }
34908
34909 // If any of the demanded bits are produced by the sign extension, we also
34910 // demand the input sign bit.
34911 if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
34912 DemandedMask.setSignBit();
34913
34914 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
34915 TLO, Depth + 1))
34916 return true;
34917
34918 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
34919 Known.Zero.lshrInPlace(ShAmt);
34920 Known.One.lshrInPlace(ShAmt);
34921
34922 // If the input sign bit is known to be zero, or if none of the top bits
34923 // are demanded, turn this into an unsigned shift right.
34924 if (Known.Zero[BitWidth - ShAmt - 1] ||
34925 OriginalDemandedBits.countLeadingZeros() >= ShAmt)
34926 return TLO.CombineTo(
34927 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
34928
34929 // High bits are known one.
34930 if (Known.One[BitWidth - ShAmt - 1])
34931 Known.One.setHighBits(ShAmt);
34932 }
34933 break;
34934 }
34935 case X86ISD::PEXTRB:
34936 case X86ISD::PEXTRW: {
34937 SDValue Vec = Op.getOperand(0);
34938 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
34939 MVT VecVT = Vec.getSimpleValueType();
34940 unsigned NumVecElts = VecVT.getVectorNumElements();
34941
34942 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
34943 unsigned Idx = CIdx->getZExtValue();
34944 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
34945
34946 // If we demand no bits from the vector then we must have demanded
34947 // bits from the implicit zext - simplify to zero.
34948 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
34949 if (DemandedVecBits == 0)
34950 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
34951
34952 APInt KnownUndef, KnownZero;
34953 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
34954 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
34955 KnownZero, TLO, Depth + 1))
34956 return true;
34957
34958 KnownBits KnownVec;
34959 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
34960 KnownVec, TLO, Depth + 1))
34961 return true;
34962
34963 if (SDValue V = SimplifyMultipleUseDemandedBits(
34964 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
34965 return TLO.CombineTo(
34966 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
34967
34968 Known = KnownVec.zext(BitWidth, true);
34969 return false;
34970 }
34971 break;
34972 }
34973 case X86ISD::PINSRB:
34974 case X86ISD::PINSRW: {
34975 SDValue Vec = Op.getOperand(0);
34976 SDValue Scl = Op.getOperand(1);
34977 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
34978 MVT VecVT = Vec.getSimpleValueType();
34979
34980 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
34981 unsigned Idx = CIdx->getZExtValue();
34982 if (!OriginalDemandedElts[Idx])
34983 return TLO.CombineTo(Op, Vec);
34984
34985 KnownBits KnownVec;
34986 APInt DemandedVecElts(OriginalDemandedElts);
34987 DemandedVecElts.clearBit(Idx);
34988 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
34989 KnownVec, TLO, Depth + 1))
34990 return true;
34991
34992 KnownBits KnownScl;
34993 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
34994 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
34995 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
34996 return true;
34997
34998 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
34999 Known.One = KnownVec.One & KnownScl.One;
35000 Known.Zero = KnownVec.Zero & KnownScl.Zero;
35001 return false;
35002 }
35003 break;
35004 }
35005 case X86ISD::PACKSS:
35006 // PACKSS saturates to MIN/MAX integer values. So if we just want the
35007 // sign bit then we can just ask for the source operands' sign bits.
35008 // TODO - add known bits handling.
35009 if (OriginalDemandedBits.isSignMask()) {
35010 APInt DemandedLHS, DemandedRHS;
35011 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
35012
35013 KnownBits KnownLHS, KnownRHS;
35014 APInt SignMask = APInt::getSignMask(BitWidth * 2);
35015 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
35016 KnownLHS, TLO, Depth + 1))
35017 return true;
35018 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
35019 KnownRHS, TLO, Depth + 1))
35020 return true;
35021 }
35022 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
35023 break;
35024 case X86ISD::PCMPGT:
35025 // icmp sgt(0, R) == ashr(R, BitWidth-1).
35026 // iff we only need the sign bit then we can use R directly.
35027 if (OriginalDemandedBits.isSignMask() &&
35028 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
35029 return TLO.CombineTo(Op, Op.getOperand(1));
35030 break;
35031 case X86ISD::MOVMSK: {
35032 SDValue Src = Op.getOperand(0);
35033 MVT SrcVT = Src.getSimpleValueType();
35034 unsigned SrcBits = SrcVT.getScalarSizeInBits();
35035 unsigned NumElts = SrcVT.getVectorNumElements();
35036
35037 // If we don't need the sign bits at all just return zero.
35038 if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
35039 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
35040
35041 // Only demand the vector elements of the sign bits we need.
35042 APInt KnownUndef, KnownZero;
35043 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
35044 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
35045 TLO, Depth + 1))
35046 return true;
35047
35048 Known.Zero = KnownZero.zextOrSelf(BitWidth);
35049 Known.Zero.setHighBits(BitWidth - NumElts);
35050
35051 // MOVMSK only uses the MSB from each vector element.
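// e.g. MOVMSKPS packs the sign bit of each f32 lane into bits 0..3 of the
// i32 result and zeroes the rest, so only those MSBs can be demanded here.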
35052 KnownBits KnownSrc;
35053 if (SimplifyDemandedBits(Src, APInt::getSignMask(SrcBits), DemandedElts,
35054 KnownSrc, TLO, Depth + 1))
35055 return true;
35056
35057 if (KnownSrc.One[SrcBits - 1])
35058 Known.One.setLowBits(NumElts);
35059 else if (KnownSrc.Zero[SrcBits - 1])
35060 Known.Zero.setLowBits(NumElts);
35061 return false;
35062 }
35063 }
35064
35065 return TargetLowering::SimplifyDemandedBitsForTargetNode(
35066 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
35067}
35068
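A standalone check of the VSHLI shift-merging rule above, restated for 16-bit lanes over plain integers (illustration only, not part of X86ISelLowering.cpp):

#include <cassert>
#include <cstdint>

// Single shift standing in for ((X >>u C1) << ShAmt) once the low ShAmt result
// bits are known to be ignored: shift by ShAmt - C1, flipping direction when
// the difference is negative, exactly as the Diff logic above does.
static uint16_t mergedShift(uint16_t X, int C1, int ShAmt) {
  int Diff = ShAmt - C1;
  return Diff >= 0 ? uint16_t(X << Diff) : uint16_t(X >> -Diff);
}

int main() {
  const uint16_t Demanded = 0xFFE0; // at least ShAmt trailing bits are ignored
  for (uint32_t V = 0; V <= 0xFFFF; ++V) {
    uint16_t X = uint16_t(V);
    assert((uint16_t((X >> 3) << 5) & Demanded) ==
           (mergedShift(X, 3, 5) & Demanded));
    assert((uint16_t((X >> 5) << 3) & Demanded) ==
           (mergedShift(X, 5, 3) & Demanded));
  }
  return 0;
}
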
35069SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
35070 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
35071 SelectionDAG &DAG, unsigned Depth) const {
35072 int NumElts = DemandedElts.getBitWidth();
35073 unsigned Opc = Op.getOpcode();
35074 EVT VT = Op.getValueType();
35075
35076 switch (Opc) {
35077 case X86ISD::PINSRB:
35078 case X86ISD::PINSRW: {
35079 // If we don't demand the inserted element, return the base vector.
35080 SDValue Vec = Op.getOperand(0);
35081 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
35082 MVT VecVT = Vec.getSimpleValueType();
35083 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
35084 !DemandedElts[CIdx->getZExtValue()])
35085 return Vec;
35086 break;
35087 }
35088 }
35089
35090 APInt ShuffleUndef, ShuffleZero;
35091 SmallVector<int, 16> ShuffleMask;
35092 SmallVector<SDValue, 2> ShuffleOps;
35093 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
35094 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
35095 // If all the demanded elts are from one operand and are inline,
35096 // then we can use the operand directly.
35097 int NumOps = ShuffleOps.size();
35098 if (ShuffleMask.size() == (unsigned)NumElts &&
35099 llvm::all_of(ShuffleOps, [VT](SDValue V) {
35100 return VT.getSizeInBits() == V.getValueSizeInBits();
35101 })) {
35102
35103 if (DemandedElts.isSubsetOf(ShuffleUndef))
35104 return DAG.getUNDEF(VT);
35105 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
35106 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
35107
35108 // Bitmask that indicates which ops have only been accessed 'inline'.
35109 APInt IdentityOp = APInt::getAllOnesValue(NumOps);
35110 for (int i = 0; i != NumElts; ++i) {
35111 int M = ShuffleMask[i];
35112 if (!DemandedElts[i] || ShuffleUndef[i])
35113 continue;
35114 int Op = M / NumElts;
35115 int Index = M % NumElts;
35116 if (M < 0 || Index != i) {
35117 IdentityOp.clearAllBits();
35118 break;
35119 }
35120 IdentityOp &= APInt::getOneBitSet(NumOps, Op);
35121 if (IdentityOp == 0)
35122 break;
35123 }
35124 assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
35125 "Multiple identity shuffles detected");
35126
35127 if (IdentityOp != 0)
35128 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
35129 }
35130 }
35131
35132 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
35133 Op, DemandedBits, DemandedElts, DAG, Depth);
35134}
35135
35136/// Check if a vector extract from a target-specific shuffle of a load can be
35137/// folded into a single element load.
35138/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
35139/// shuffles have been custom lowered so we need to handle those here.
35140static SDValue
35141XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
35142 TargetLowering::DAGCombinerInfo &DCI) {
35143 if (DCI.isBeforeLegalizeOps())
35144 return SDValue();
35145
35146 SDValue InVec = N->getOperand(0);
35147 SDValue EltNo = N->getOperand(1);
35148 EVT EltVT = N->getValueType(0);
35149
35150 if (!isa<ConstantSDNode>(EltNo))
35151 return SDValue();
35152
35153 EVT OriginalVT = InVec.getValueType();
35154 unsigned NumOriginalElts = OriginalVT.getVectorNumElements();
35155
35156 // Peek through bitcasts, don't duplicate a load with other uses.
35157 InVec = peekThroughOneUseBitcasts(InVec);
35158
35159 EVT CurrentVT = InVec.getValueType();
35160 if (!CurrentVT.isVector())
35161 return SDValue();
35162
35163 unsigned NumCurrentElts = CurrentVT.getVectorNumElements();
35164 if ((NumOriginalElts % NumCurrentElts) != 0)
35165 return SDValue();
35166
35167 if (!isTargetShuffle(InVec.getOpcode()))
35168 return SDValue();
35169
35170 // Don't duplicate a load with other uses.
35171 if (!InVec.hasOneUse())
35172 return SDValue();
35173
35174 SmallVector<int, 16> ShuffleMask;
35175 SmallVector<SDValue, 2> ShuffleOps;
35176 bool UnaryShuffle;
35177 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
35178 ShuffleOps, ShuffleMask, UnaryShuffle))
35179 return SDValue();
35180
35181 unsigned Scale = NumOriginalElts / NumCurrentElts;
35182 if (Scale > 1) {
35183 SmallVector<int, 16> ScaledMask;
35184 scaleShuffleMask<int>(Scale, ShuffleMask, ScaledMask);
35185 ShuffleMask = std::move(ScaledMask);
35186 }
35187 assert(ShuffleMask.size() == NumOriginalElts && "Shuffle mask size mismatch");
35188
35189 // Select the input vector, guarding against an out of range extract index.
35190 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
35191 int Idx = (Elt >= (int)NumOriginalElts) ? SM_SentinelUndef : ShuffleMask[Elt];
35192
35193 if (Idx == SM_SentinelZero)
35194 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
35195 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
35196 if (Idx == SM_SentinelUndef)
35197 return DAG.getUNDEF(EltVT);
35198
35199 // Bail if any mask element is SM_SentinelZero - getVectorShuffle below
35200 // won't handle it.
35201 if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; }))
35202 return SDValue();
35203
35204 assert(0 <= Idx && Idx < (int)(2 * NumOriginalElts) &&
35205 "Shuffle index out of range");
35206 SDValue LdNode = (Idx < (int)NumOriginalElts) ? ShuffleOps[0] : ShuffleOps[1];
35207
35208 // If inputs to shuffle are the same for both ops, then allow 2 uses
35209 unsigned AllowedUses =
35210 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
35211
35212 if (LdNode.getOpcode() == ISD::BITCAST) {
35213 // Don't duplicate a load with other uses.
35214 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
35215 return SDValue();
35216
35217 AllowedUses = 1; // only allow 1 load use if we have a bitcast
35218 LdNode = LdNode.getOperand(0);
35219 }
35220
35221 if (!ISD::isNormalLoad(LdNode.getNode()))
35222 return SDValue();
35223
35224 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
35225
35226 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || !LN0->isSimple())
35227 return SDValue();
35228
35229 // If there's a bitcast before the shuffle, check if the load type and
35230 // alignment are valid.
35231 unsigned Align = LN0->getAlignment();
35232 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35233 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
35234 EltVT.getTypeForEVT(*DAG.getContext()));
35235
35236 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
35237 return SDValue();
35238
35239 // All checks match, so transform back to vector_shuffle so that the DAG
35240 // combiner can finish the job.
35241 SDLoc dl(N);
35242
35243 // Create shuffle node taking into account the case that it's a unary shuffle
35244 SDValue Shuffle = UnaryShuffle ? DAG.getUNDEF(OriginalVT)
35245 : DAG.getBitcast(OriginalVT, ShuffleOps[1]);
35246 Shuffle = DAG.getVectorShuffle(OriginalVT, dl,
35247 DAG.getBitcast(OriginalVT, ShuffleOps[0]),
35248 Shuffle, ShuffleMask);
35249 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
35250 EltNo);
35251}
35252
35253// Helper to peek through bitops/setcc to determine size of source vector.
35254// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
35255static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
35256 switch (Src.getOpcode()) {
35257 case ISD::SETCC:
35258 return Src.getOperand(0).getValueSizeInBits() == Size;
35259 case ISD::AND:
35260 case ISD::XOR:
35261 case ISD::OR:
35262 return checkBitcastSrcVectorSize(Src.getOperand(0), Size) &&
35263 checkBitcastSrcVectorSize(Src.getOperand(1), Size);
35264 }
35265 return false;
35266}
35267
35268// Helper to push sign extension of vXi1 SETCC result through bitops.
35269static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
35270 SDValue Src, const SDLoc &DL) {
35271 switch (Src.getOpcode()) {
35272 case ISD::SETCC:
35273 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
35274 case ISD::AND:
35275 case ISD::XOR:
35276 case ISD::OR:
35277 return DAG.getNode(
35278 Src.getOpcode(), DL, SExtVT,
35279 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
35280 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
35281 }
35282 llvm_unreachable("Unexpected node type for vXi1 sign extension");
35283}
35284
35285// Try to match patterns such as
35286// (i16 bitcast (v16i1 x))
35287// ->
35288 // (i16 movmsk (v16i8 sext (v16i1 x)))
35289// before the illegal vector is scalarized on subtargets that don't have legal
35290// vxi1 types.
35291static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
35292 const SDLoc &DL,
35293 const X86Subtarget &Subtarget) {
35294 EVT SrcVT = Src.getValueType();
35295 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
35296 return SDValue();
35297
35298 // If the input is a truncate from v16i8, v32i8 or v64i8, go ahead and use a
35299 // movmskb even with avx512. This will be better than truncating to vXi1 and
35300 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
35301 // vpcmpeqb/vpcmpgtb.
35302 bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
35303 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
35304 Src.getOperand(0).getValueType() == MVT::v32i8 ||
35305 Src.getOperand(0).getValueType() == MVT::v64i8);
35306
35307 // With AVX512 vxi1 types are legal and we prefer using k-regs.
35308 // MOVMSK is supported in SSE2 or later.
35309 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !IsTruncated))
35310 return SDValue();
35311
35312 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v2f64 and
35313 // v4f64. So all legal 128-bit and 256-bit vectors are covered except for
35314 // v8i16 and v16i16.
35315 // For these two cases, we can shuffle the upper element bytes to a
35316 // consecutive sequence at the start of the vector and treat the results as
35317 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
35318 // for v16i16 this is not the case, because the shuffle is expensive, so we
35319 // avoid sign-extending to this type entirely.
35320 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
35321 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
35322 MVT SExtVT;
35323 bool PropagateSExt = false;
35324 switch (SrcVT.getSimpleVT().SimpleTy) {
35325 default:
35326 return SDValue();
35327 case MVT::v2i1:
35328 SExtVT = MVT::v2i64;
35329 break;
35330 case MVT::v4i1:
35331 SExtVT = MVT::v4i32;
35332 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
35333 // sign-extend to a 256-bit operation to avoid truncation.
35334 if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) {
35335 SExtVT = MVT::v4i64;
35336 PropagateSExt = true;
35337 }
35338 break;
35339 case MVT::v8i1:
35340 SExtVT = MVT::v8i16;
35341 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
35342 // sign-extend to a 256-bit operation to match the compare.
35343 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
35344 // 256-bit because the shuffle is cheaper than sign extending the result of
35345 // the compare.
35346 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) ||
35347 checkBitcastSrcVectorSize(Src, 512))) {
35348 SExtVT = MVT::v8i32;
35349 PropagateSExt = true;
35350 }
35351 break;
35352 case MVT::v16i1:
35353 SExtVT = MVT::v16i8;
35354 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
35355 // it is not profitable to sign-extend to 256-bit because this will
35356 // require an extra cross-lane shuffle which is more expensive than
35357 // truncating the result of the compare to 128-bits.
35358 break;
35359 case MVT::v32i1:
35360 SExtVT = MVT::v32i8;
35361 break;
35362 case MVT::v64i1:
35363 // If we have AVX512F but not AVX512BW, and the input is a truncate from
35364 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
35365 if (Subtarget.hasAVX512() && !Subtarget.hasBWI()) {
35366 SExtVT = MVT::v64i8;
35367 break;
35368 }
35369 return SDValue();
35370 };
35371
35372 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
35373 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
35374
35375 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
35376 V = getPMOVMSKB(DL, V, DAG, Subtarget);
35377 } else {
35378 if (SExtVT == MVT::v8i16)
35379 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
35380 DAG.getUNDEF(MVT::v8i16));
35381 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
35382 }
35383
35384 EVT IntVT =
35385 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
35386 V = DAG.getZExtOrTrunc(V, DL, IntVT);
35387 return DAG.getBitcast(VT, V);
35388}
35389
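A scalar model of the equivalence this combine relies on: bitcasting <16 x i1> to i16 packs lane i into bit i, which is exactly the value PMOVMSKB extracts from the sign bits of the sign-extended v16i8 lanes (illustration only, not part of X86ISelLowering.cpp):

#include <cassert>
#include <cstdint>

// Packs 16 booleans into an i16 the way a <16 x i1> -> i16 bitcast does:
// lane i becomes bit i.
static uint16_t bitcastV16i1(const bool Lanes[16]) {
  uint16_t R = 0;
  for (int i = 0; i != 16; ++i)
    R |= uint16_t(Lanes[i]) << i;
  return R;
}

// Models MOVMSK of the sign-extended lanes: sext turns each i1 into 0x00 or
// 0xFF, and PMOVMSKB collects the most significant bit of every byte.
static uint16_t movmskOfSext(const bool Lanes[16]) {
  uint16_t R = 0;
  for (int i = 0; i != 16; ++i) {
    uint8_t Byte = Lanes[i] ? 0xFF : 0x00;
    R |= uint16_t(Byte >> 7) << i;
  }
  return R;
}

int main() {
  bool Lanes[16] = {1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1};
  assert(bitcastV16i1(Lanes) == movmskOfSext(Lanes));
  return 0;
}
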
35390// Convert a vXi1 constant build vector to the same width scalar integer.
35391static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
35392 EVT SrcVT = Op.getValueType();
35393 assert(SrcVT.getVectorElementType() == MVT::i1 &&
35394 "Expected a vXi1 vector");
35395 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
35396 "Expected a constant build vector");
35397
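// e.g. <i1 1, i1 0, i1 1, i1 1> becomes the i4 constant 0b1101: lane 0 maps
// to bit 0, and undef lanes are treated as 0.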
35398 APInt Imm(SrcVT.getVectorNumElements(), 0);
35399 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
35400 SDValue In = Op.getOperand(Idx);
35401 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
35402 Imm.setBit(Idx);
35403 }
35404 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
35405 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
35406}
35407
35408static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
35409 TargetLowering::DAGCombinerInfo &DCI,
35410 const X86Subtarget &Subtarget) {
35411 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
35412
35413 if (!DCI.isBeforeLegalizeOps())
35414 return SDValue();
35415
35416 // Only do this if we have k-registers.
35417 if (!Subtarget.hasAVX512())
35418 return SDValue();
35419
35420 EVT DstVT = N->getValueType(0);
35421 SDValue Op = N->getOperand(0);
35422 EVT SrcVT = Op.getValueType();
35423
35424 if (!Op.hasOneUse())
35425 return SDValue();
35426
35427 // Look for logic ops.
35428 if (Op.getOpcode() != ISD::AND &&
35429 Op.getOpcode() != ISD::OR &&
35430 Op.getOpcode() != ISD::XOR)
35431 return SDValue();
35432
35433 // Make sure we have a bitcast between mask registers and a scalar type.
35434 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
35435 DstVT.isScalarInteger()) &&
35436 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
35437 SrcVT.isScalarInteger()))
35438 return SDValue();
35439
35440 SDValue LHS = Op.getOperand(0);
35441 SDValue RHS = Op.getOperand(1);
35442
35443 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
35444 LHS.getOperand(0).getValueType() == DstVT)
35445 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
35446 DAG.getBitcast(DstVT, RHS));
35447
35448 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
35449 RHS.getOperand(0).getValueType() == DstVT)
35450 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
35451 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
35452
35453 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
35454 // Most of these have to move a constant from the scalar domain anyway.
35455 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
35456 RHS = combinevXi1ConstantToInteger(RHS, DAG);
35457 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
35458 DAG.getBitcast(DstVT, LHS), RHS);
35459 }
35460
35461 return SDValue();
35462}
35463
35464static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
35465 const X86Subtarget &Subtarget) {
35466 SDLoc DL(BV);
35467 unsigned NumElts = BV->getNumOperands();
35468 SDValue Splat = BV->getSplatValue();
35469
35470 // Build MMX element from integer GPR or SSE float values.
35471 auto CreateMMXElement = [&](SDValue V) {
35472 if (V.isUndef())
35473 return DAG.getUNDEF(MVT::x86mmx);
35474 if (V.getValueType().isFloatingPoint()) {
35475 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
35476 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
35477 V = DAG.getBitcast(MVT::v2i64, V);
35478 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
35479 }
35480 V = DAG.getBitcast(MVT::i32, V);
35481 } else {
35482 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
35483 }
35484 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
35485 };
35486
35487 // Convert build vector ops to MMX data in the bottom elements.
35488 SmallVector<SDValue, 8> Ops;
35489
35490 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
35491 if (Splat) {
35492 if (Splat.isUndef())
35493 return DAG.getUNDEF(MVT::x86mmx);
35494
35495 Splat = CreateMMXElement(Splat);
35496
35497 if (Subtarget.hasSSE1()) {
35498 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
35499 if (NumElts == 8)
35500 Splat = DAG.getNode(
35501 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
35502 DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
35503 Splat);
35504
35505 // Use PSHUFW to repeat 16-bit elements.
35506 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
35507 return DAG.getNode(
35508 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
35509 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32),
35510 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
35511 }
35512 Ops.append(NumElts, Splat);
35513 } else {
35514 for (unsigned i = 0; i != NumElts; ++i)
35515 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
35516 }
35517
35518 // Use tree of PUNPCKLs to build up general MMX vector.
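// e.g. starting from 8 byte elements: PUNPCKLBW pairs them into 4 words,
// PUNPCKLWD pairs those into 2 dwords, and PUNPCKLDQ forms the final 64 bits.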
35519 while (Ops.size() > 1) {
35520 unsigned NumOps = Ops.size();
35521 unsigned IntrinOp =
35522 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
35523 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
35524 : Intrinsic::x86_mmx_punpcklbw));
35525 SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
35526 for (unsigned i = 0; i != NumOps; i += 2)
35527 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
35528 Ops[i], Ops[i + 1]);
35529 Ops.resize(NumOps / 2);
35530 }
35531
35532 return Ops[0];
35533}
35534
35535static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
35536 TargetLowering::DAGCombinerInfo &DCI,
35537 const X86Subtarget &Subtarget) {
35538 SDValue N0 = N->getOperand(0);
35539 EVT VT = N->getValueType(0);
35540 EVT SrcVT = N0.getValueType();
35541
35542 // Try to match patterns such as
35543 // (i16 bitcast (v16i1 x))
35544 // ->
35545 // (i16 movmsk (v16i8 sext (v16i1 x)))
35546 // before the setcc result is scalarized on subtargets that don't have legal
35547 // vxi1 types.
35548 if (DCI.isBeforeLegalize()) {
35549 SDLoc dl(N);
35550 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
35551 return V;
35552
35553 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
35554 // legalization destroys the v4i32 type.
35555 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 &&
35556 VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC &&
35557 N0.getOperand(0).getValueType() == MVT::v4i32 &&
35558 ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
35559 cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETLT) {
35560 SDValue N00 = N0.getOperand(0);
35561 // Only do this if we can avoid scalarizing the input.
35562 if (ISD::isNormalLoad(N00.getNode()) ||
35563 (N00.getOpcode() == ISD::BITCAST &&
35564 N00.getOperand(0).getValueType() == MVT::v4f32)) {
35565 SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32,
35566 DAG.getBitcast(MVT::v4f32, N00));
35567 return DAG.getZExtOrTrunc(V, dl, VT);
35568 }
35569 }
35570
35571 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
35572 // type, widen both sides to avoid a trip through memory.
35573 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
35574 Subtarget.hasAVX512()) {
35575 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
35576 N0 = DAG.getBitcast(MVT::v8i1, N0);
35577 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
35578 DAG.getIntPtrConstant(0, dl));
35579 }
35580
35581 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
35582 // type, widen both sides to avoid a trip through memory.
35583 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
35584 Subtarget.hasAVX512()) {
35585 // Use zeros for the widening if we already have some zeroes. This can
35586 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
35587 // stream of this.
35588 // FIXME: It might make sense to detect a concat_vectors with a mix of
35589 // zeroes and undef and turn it into insert_subvector for i1 vectors as
35590 // a separate combine. What we can't do is canonicalize the operands of
35591 // such a concat or we'll get into a loop with SimplifyDemandedBits.
35592 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
35593 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
35594 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
35595 SrcVT = LastOp.getValueType();
35596 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
35597 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
35598 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
35599 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
35600 N0 = DAG.getBitcast(MVT::i8, N0);
35601 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
35602 }
35603 }
35604
35605 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
35606 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
35607 Ops[0] = N0;
35608 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
35609 N0 = DAG.getBitcast(MVT::i8, N0);
35610 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
35611 }
35612 }
35613
35614 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
35615 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
35616 // due to insert_subvector legalization on KNL. By promoting the copy to i16
35617 // we can help with known bits propagation from the vXi1 domain to the
35618 // scalar domain.
35619 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
35620 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
35621 N0.getOperand(0).getValueType() == MVT::v16i1 &&
35622 isNullConstant(N0.getOperand(1)))
35623 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
35624 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
35625
35626 // Combine (bitcast (vbroadcast_load)) -> (vbroadcast_load). The memory VT
35627 // determines the number of bits loaded. Remaining bits are zero.
35628 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
35629 VT.getScalarSizeInBits() == SrcVT.getScalarSizeInBits()) {
35630 auto *BCast = cast<MemIntrinsicSDNode>(N0);
35631 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
35632 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
35633 SDValue ResNode =
35634 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
35635 VT.getVectorElementType(),
35636 BCast->getMemOperand());
35637 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
35638 return ResNode;
35639 }
35640
35641 // Since MMX types are special and don't usually play with other vector types,
35642 // it's better to handle them early to be sure we emit efficient code by
35643 // avoiding store-load conversions.
35644 if (VT == MVT::x86mmx) {
35645 // Detect MMX constant vectors.
35646 APInt UndefElts;
35647 SmallVector<APInt, 1> EltBits;
35648 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
35649 SDLoc DL(N0);
35650 // Handle zero-extension of i32 with MOVD.
35651 if (EltBits[0].countLeadingZeros() >= 32)
35652 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
35653 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
35654 // Else, bitcast to a double.
35655 // TODO - investigate supporting sext 32-bit immediates on x86_64.
35656 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
35657 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
35658 }
35659
35660 // Detect bitcasts to x86mmx low word.
35661 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
35662 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
35663 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
35664 bool LowUndef = true, AllUndefOrZero = true;
35665 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
35666 SDValue Op = N0.getOperand(i);
35667 LowUndef &= Op.isUndef() || (i >= e/2);
35668 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
35669 }
35670 if (AllUndefOrZero) {
35671 SDValue N00 = N0.getOperand(0);
35672 SDLoc dl(N00);
35673 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
35674 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
35675 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
35676 }
35677 }
35678
35679 // Detect bitcasts of 64-bit build vectors and convert to a
35680 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
35681 // lowest element.
35682 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
35683 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
35684 SrcVT == MVT::v8i8))
35685 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
35686
35687 // Detect bitcasts of element or subvector extractions to x86mmx.
35688 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
35689 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
35690 isNullConstant(N0.getOperand(1))) {
35691 SDValue N00 = N0.getOperand(0);
35692 if (N00.getValueType().is128BitVector())
35693 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
35694 DAG.getBitcast(MVT::v2i64, N00));
35695 }
35696
35697 // Detect bitcasts from FP_TO_SINT to x86mmx.
35698 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
35699 SDLoc DL(N0);
35700 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
35701 DAG.getUNDEF(MVT::v2i32));
35702 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
35703 DAG.getBitcast(MVT::v2i64, Res));
35704 }
35705 }
35706
35707 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
35708 // most of these to scalar anyway.
35709 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
35710 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
35711 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
35712 return combinevXi1ConstantToInteger(N0, DAG);
35713 }
35714
35715 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
35716 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
35717 isa<ConstantSDNode>(N0)) {
35718 auto *C = cast<ConstantSDNode>(N0);
35719 if (C->isAllOnesValue())
35720 return DAG.getConstant(1, SDLoc(N0), VT);
35721 if (C->isNullValue())
35722 return DAG.getConstant(0, SDLoc(N0), VT);
35723 }
35724
35725 // Try to remove bitcasts from input and output of mask arithmetic to
35726 // remove GPR<->K-register crossings.
35727 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
35728 return V;
35729
35730 // Convert a bitcasted integer logic operation that has one bitcasted
35731 // floating-point operand into a floating-point logic operation. This may
35732 // create a load of a constant, but that is cheaper than materializing the
35733 // constant in an integer register and transferring it to an SSE register or
35734 // transferring the SSE operand to integer register and back.
35735 unsigned FPOpcode;
35736 switch (N0.getOpcode()) {
35737 case ISD::AND: FPOpcode = X86ISD::FAND; break;
35738 case ISD::OR: FPOpcode = X86ISD::FOR; break;
35739 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
35740 default: return SDValue();
35741 }
35742
35743 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
35744 (Subtarget.hasSSE2() && VT == MVT::f64)))
35745 return SDValue();
35746
35747 SDValue LogicOp0 = N0.getOperand(0);
35748 SDValue LogicOp1 = N0.getOperand(1);
35749 SDLoc DL0(N0);
35750
35751 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
35752 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
35753 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
35754 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
35755 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
35756 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
35757 }
35758 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
35759 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
35760 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
35761 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
35762 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
35763 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
35764 }
35765
35766 return SDValue();
35767}
35768
35769 // Given an ABS node, detect the following pattern:
35770// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
35771// This is useful as it is the input into a SAD pattern.
35772static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
35773 SDValue AbsOp1 = Abs->getOperand(0);
35774 if (AbsOp1.getOpcode() != ISD::SUB)
35775 return false;
35776
35777 Op0 = AbsOp1.getOperand(0);
35778 Op1 = AbsOp1.getOperand(1);
35779
35780 // Check if the operands of the sub are zero-extended from vectors of i8.
35781 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
35782 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
35783 Op1.getOpcode() != ISD::ZERO_EXTEND ||
35784 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
35785 return false;
35786
35787 return true;
35788}
35789
35790// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
35791// to these zexts.
35792static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
35793 const SDValue &Zext1, const SDLoc &DL,
35794 const X86Subtarget &Subtarget) {
35795 // Find the appropriate width for the PSADBW.
35796 EVT InVT = Zext0.getOperand(0).getValueType();
35797 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
35798
35799 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
35800 // fill in the missing vector elements with 0.
35801 unsigned NumConcat = RegSize / InVT.getSizeInBits();
35802 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
35803 Ops[0] = Zext0.getOperand(0);
35804 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
35805 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
35806 Ops[0] = Zext1.getOperand(0);
35807 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
35808
35809 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
35810 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
35811 ArrayRef<SDValue> Ops) {
35812 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
35813 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
35814 };
35815 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
35816 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
35817 PSADBWBuilder);
35818}
35819
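For reference, a scalar model of a single 64-bit PSADBW lane: the sum of absolute differences of eight unsigned bytes, which is what the zext/sub/abs pattern handled above reduces to (illustration only, not part of X86ISelLowering.cpp):

#include <cassert>
#include <cstdint>
#include <cstdlib>

// One 64-bit PSADBW lane over eight byte pairs.
static uint64_t psadbwLane(const uint8_t A[8], const uint8_t B[8]) {
  uint64_t Sum = 0;
  for (int i = 0; i != 8; ++i)
    Sum += uint64_t(std::abs(int(A[i]) - int(B[i])));
  return Sum;
}

int main() {
  uint8_t A[8] = {10, 0, 255, 3, 4, 5, 6, 7};
  uint8_t B[8] = { 7, 1,   0, 3, 9, 5, 6, 7};
  assert(psadbwLane(A, B) == 3 + 1 + 255 + 0 + 5 + 0 + 0 + 0);
  return 0;
}
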
35820 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
35821// PHMINPOSUW.
35822static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
35823 const X86Subtarget &Subtarget) {
35824 // Bail without SSE41.
35825 if (!Subtarget.hasSSE41())
35826 return SDValue();
35827
35828 EVT ExtractVT = Extract->getValueType(0);
35829 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
35830 return SDValue();
35831
35832 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
35833 ISD::NodeType BinOp;
35834 SDValue Src = DAG.matchBinOpReduction(
35835 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
35836 if (!Src)
35837 return SDValue();
35838
35839 EVT SrcVT = Src.getValueType();
35840 EVT SrcSVT = SrcVT.getScalarType();
35841 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
35842 return SDValue();
35843
35844 SDLoc DL(Extract);
35845 SDValue MinPos = Src;
35846
35847 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
35848 while (SrcVT.getSizeInBits() > 128) {
35849 unsigned NumElts = SrcVT.getVectorNumElements();
35850 unsigned NumSubElts = NumElts / 2;
35851 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
35852 unsigned SubSizeInBits = SrcVT.getSizeInBits();
35853 SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
35854 SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
35855 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
35856 }
35857 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
35858 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
35859 "Unexpected value type");
35860
35861 // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
35862 // to flip the value accordingly.
35863 SDValue Mask;
35864 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
35865 if (BinOp == ISD::SMAX)
35866 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
35867 else if (BinOp == ISD::SMIN)
35868 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
35869 else if (BinOp == ISD::UMAX)
35870 Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
35871
35872 if (Mask)
35873 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
35874
35875 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
35876 // shuffling each upper element down and inserting zeros. This means that the
35877 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
35878 // ready for the PHMINPOS.
35879 if (ExtractVT == MVT::i8) {
35880 SDValue Upper = DAG.getVectorShuffle(
35881 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
35882 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
35883 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
35884 }
35885
35886 // Perform the PHMINPOS on a v8i16 vector.
35887 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
35888 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
35889 MinPos = DAG.getBitcast(SrcVT, MinPos);
35890
35891 if (Mask)
35892 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
35893
35894 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
35895 DAG.getIntPtrConstant(0, DL));
35896}
35897
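A standalone check of the masking trick above: XORing 16-bit lanes with the right constant turns SMAX/SMIN/UMAX into the plain unsigned minimum that PHMINPOSUW provides (illustration only, not part of X86ISelLowering.cpp):

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t A = 0; A <= 0xFFFF; A += 257) {
    for (uint32_t B = 0; B <= 0xFFFF; B += 263) {
      uint16_t a = uint16_t(A), b = uint16_t(B);
      int16_t sa = int16_t(a), sb = int16_t(b);
      // UMAX via mask 0xFFFF: umax(a, b) == ~umin(~a, ~b).
      assert(std::max(a, b) ==
             uint16_t(std::min(uint16_t(a ^ 0xFFFF), uint16_t(b ^ 0xFFFF)) ^
                      0xFFFF));
      // SMIN via mask 0x8000: flipping the sign bit makes signed order match
      // unsigned order, so umin picks the signed minimum.
      assert(uint16_t(std::min(sa, sb)) ==
             uint16_t(std::min(uint16_t(a ^ 0x8000), uint16_t(b ^ 0x8000)) ^
                      0x8000));
      // SMAX via mask 0x7FFF: the map is order-reversing on signed values,
      // so umin of the transformed lanes recovers the signed maximum.
      assert(uint16_t(std::max(sa, sb)) ==
             uint16_t(std::min(uint16_t(a ^ 0x7FFF), uint16_t(b ^ 0x7FFF)) ^
                      0x7FFF));
    }
  }
  return 0;
}
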
35898// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
35899static SDValue combineHorizontalPredicateResult(SDNode *Extract,
35900 SelectionDAG &DAG,
35901 const X86Subtarget &Subtarget) {
35902 // Bail without SSE2.
35903 if (!Subtarget.hasSSE2())
35904 return SDValue();
35905
35906 EVT ExtractVT = Extract->getValueType(0);
35907 unsigned BitWidth = ExtractVT.getSizeInBits();
35908 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
35909 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
35910 return SDValue();
35911
35912 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
35913 ISD::NodeType BinOp;
35914 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
35915 if (!Match && ExtractVT == MVT::i1)
35916 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
35917 if (!Match)
35918 return SDValue();
35919
35920 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
35921 // which we can't support here for now.
35922 if (Match.getScalarValueSizeInBits() != BitWidth)
35923 return SDValue();
35924
35925 SDValue Movmsk;
35926 SDLoc DL(Extract);
35927 EVT MatchVT = Match.getValueType();
35928 unsigned NumElts = MatchVT.getVectorNumElements();
35929 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
35930 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35931
35932 if (ExtractVT == MVT::i1) {
35933 // Special case for (pre-legalization) vXi1 reductions.
35934 if (NumElts > 64 || !isPowerOf2_32(NumElts))
35935 return SDValue();
35936 if (TLI.isTypeLegal(MatchVT)) {
35937 // If this is a legal AVX512 predicate type then we can just bitcast.
35938 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
35939 Movmsk = DAG.getBitcast(MovmskVT, Match);
35940 } else {
35941 // Use combineBitcastvxi1 to create the MOVMSK.
35942 while (NumElts > MaxElts) {
35943 SDValue Lo, Hi;
35944 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
35945 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
35946 NumElts /= 2;
35947 }
35948 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
35949 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
35950 }
35951 if (!Movmsk)
35952 return SDValue();
35953 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
35954 } else {
35955 // Bail with AVX512VL (which uses predicate registers).
35956 if (Subtarget.hasVLX())
35957 return SDValue();
35958
35959 unsigned MatchSizeInBits = Match.getValueSizeInBits();
35960 if (!(MatchSizeInBits == 128 ||
35961 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
35962 return SDValue();
35963
35964 // Make sure this isn't a vector of 1 element. The perf win from using
35965 // MOVMSK diminishes with fewer elements in the reduction, but it is
35966 // generally better to get the comparison over to the GPRs as soon as
35967 // possible to reduce the number of vector ops.
35968 if (Match.getValueType().getVectorNumElements() < 2)
35969 return SDValue();
35970
35971 // Check that we are extracting a reduction of all sign bits.
35972 if (DAG.ComputeNumSignBits(Match) != BitWidth)
35973 return SDValue();
35974
35975 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
35976 SDValue Lo, Hi;
35977 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
35978 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
35979 MatchSizeInBits = Match.getValueSizeInBits();
35980 }
35981
35982 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
35983 MVT MaskSrcVT;
35984 if (64 == BitWidth || 32 == BitWidth)
35985 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
35986 MatchSizeInBits / BitWidth);
35987 else
35988 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
35989
35990 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
35991 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
35992 NumElts = MaskSrcVT.getVectorNumElements();
35993 }
35994 assert((NumElts <= 32 || NumElts == 64) &&
35995        "Not expecting more than 64 elements");
35996
35997 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
35998 if (BinOp == ISD::XOR) {
35999 // parity -> (AND (CTPOP(MOVMSK X)), 1)
36000 SDValue Mask = DAG.getConstant(1, DL, CmpVT);
36001 SDValue Result = DAG.getNode(ISD::CTPOP, DL, CmpVT, Movmsk);
36002 Result = DAG.getNode(ISD::AND, DL, CmpVT, Result, Mask);
36003 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
36004 }
36005
36006 SDValue CmpC;
36007 ISD::CondCode CondCode;
36008 if (BinOp == ISD::OR) {
36009 // any_of -> MOVMSK != 0
36010 CmpC = DAG.getConstant(0, DL, CmpVT);
36011 CondCode = ISD::CondCode::SETNE;
36012 } else {
36013 // all_of -> MOVMSK == ((1 << NumElts) - 1)
36014 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
36015 DL, CmpVT);
36016 CondCode = ISD::CondCode::SETEQ;
36017 }
36018
36019 // The setcc produces an i8 of 0/1, so extend that to the result width and
36020 // negate to get the final 0/-1 mask value.
36021 EVT SetccVT =
36022 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
36023 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
36024 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
36025 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
36026 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
36027}
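// Rough sketch of the resulting lowering (illustrative; the exact sequence
// depends on the subtarget and types): for a v4i32 compare mask X,
//   any_of:  m = MOVMSKPS X;  r = (m != 0)   ? -1 : 0
//   all_of:  m = MOVMSKPS X;  r = (m == 0xF) ? -1 : 0
//   parity (vXi1 only): r = CTPOP(m) & 1
// where the final -1/0 value comes from the negated, extended SETCC above.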
36028
36029static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
36030 const X86Subtarget &Subtarget) {
36031 // PSADBW is only supported on SSE2 and up.
36032 if (!Subtarget.hasSSE2())
36033 return SDValue();
36034
36035 // Verify the type we're extracting from is any integer type above i16.
36036 EVT VT = Extract->getOperand(0).getValueType();
36037 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
36038 return SDValue();
36039
36040 unsigned RegSize = 128;
36041 if (Subtarget.useBWIRegs())
36042 RegSize = 512;
36043 else if (Subtarget.hasAVX())
36044 RegSize = 256;
36045
36046 // We handle up to v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
36047 // TODO: We should be able to handle larger vectors by splitting them before
36048 // feeding them into several SADs, and then reducing over those.
36049 if (RegSize / VT.getVectorNumElements() < 8)
36050 return SDValue();
36051
36052 // Match shuffle + add pyramid.
36053 ISD::NodeType BinOp;
36054 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
36055
36056 // The operand is expected to be zero extended from i8
36057 // (verified in detectZextAbsDiff).
36058 // In order to convert to i64 and above, additional any/zero/sign
36059 // extend is expected.
36060 // The zero extend from 32 bit has no mathematical effect on the result.
36061 // Also the sign extend is basically zero extend
36062 // (extends the sign bit which is zero).
36063 // So it is correct to skip the sign/zero extend instruction.
36064 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
36065 Root.getOpcode() == ISD::ZERO_EXTEND ||
36066 Root.getOpcode() == ISD::ANY_EXTEND))
36067 Root = Root.getOperand(0);
36068
36069 // If there was a match, we want Root to be an ABS node that is the root of
36070 // an abs-diff pattern.
36071 if (!Root || Root.getOpcode() != ISD::ABS)
36072 return SDValue();
36073
36074 // Check whether we have an abs-diff pattern feeding into the ABS node.
36075 SDValue Zext0, Zext1;
36076 if (!detectZextAbsDiff(Root, Zext0, Zext1))
36077 return SDValue();
36078
36079 // Create the SAD instruction.
36080 SDLoc DL(Extract);
36081 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
36082
36083 // If the original vector was wider than 8 elements, sum over the results
36084 // in the SAD vector.
36085 unsigned Stages = Log2_32(VT.getVectorNumElements());
36086 MVT SadVT = SAD.getSimpleValueType();
36087 if (Stages > 3) {
36088 unsigned SadElems = SadVT.getVectorNumElements();
36089
36090 for(unsigned i = Stages - 3; i > 0; --i) {
36091 SmallVector<int, 16> Mask(SadElems, -1);
36092 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
36093 Mask[j] = MaskEnd + j;
36094
36095 SDValue Shuffle =
36096 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
36097 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
36098 }
36099 }
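// For illustration (assuming a 256-bit PSADBW result v4i64 <s0,s1,s2,s3>
// of per-8-byte partial sums): the loop above needs two shuffle+add steps,
// first producing <s0+s2, s1+s3, ..>, then accumulating everything into
// element 0 as s0+s1+s2+s3, which the extract below then reads.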
36100
36101 MVT Type = Extract->getSimpleValueType(0);
36102 unsigned TypeSizeInBits = Type.getSizeInBits();
36103 // Return the lowest TypeSizeInBits bits.
36104 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
36105 SAD = DAG.getBitcast(ResVT, SAD);
36106 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
36107 Extract->getOperand(1));
36108}
36109
36110// Attempt to peek through a target shuffle and extract the scalar from the
36111// source.
36112static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
36113 TargetLowering::DAGCombinerInfo &DCI,
36114 const X86Subtarget &Subtarget) {
36115 if (DCI.isBeforeLegalizeOps())
36116 return SDValue();
36117
36118 SDLoc dl(N);
36119 SDValue Src = N->getOperand(0);
36120 SDValue Idx = N->getOperand(1);
36121
36122 EVT VT = N->getValueType(0);
36123 EVT SrcVT = Src.getValueType();
36124 EVT SrcSVT = SrcVT.getVectorElementType();
36125 unsigned NumSrcElts = SrcVT.getVectorNumElements();
36126
36127 // Don't attempt this for boolean mask vectors or unknown extraction indices.
36128 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
36129 return SDValue();
36130
36131 SDValue SrcBC = peekThroughBitcasts(Src);
36132
36133 // Handle extract(broadcast(scalar_value)); it doesn't matter what the index is.
36134 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
36135 SDValue SrcOp = SrcBC.getOperand(0);
36136 if (SrcOp.getValueSizeInBits() == VT.getSizeInBits())
36137 return DAG.getBitcast(VT, SrcOp);
36138 }
36139
36140 // If we're extracting a single element from a broadcast load and there are
36141 // no other users, just create a single load.
36142 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
36143 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
36144 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
36145 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
36146 VT.getSizeInBits() == SrcBCWidth) {
36147 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
36148 MemIntr->getBasePtr(),
36149 MemIntr->getPointerInfo(),
36150 MemIntr->getAlignment(),
36151 MemIntr->getMemOperand()->getFlags());
36152 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
36153 return Load;
36154 }
36155 }
36156
36157 // Handle extract(truncate(x)) for 0'th index.
36158 // TODO: Treat this as a faux shuffle?
36159 // TODO: When can we use this for general indices?
36160 if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() &&
36161 isNullConstant(Idx)) {
36162 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
36163 Src = DAG.getBitcast(SrcVT, Src);
36164 return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx);
36165 }
36166
36167 // Resolve the target shuffle inputs and mask.
36168 SmallVector<int, 16> Mask;
36169 SmallVector<SDValue, 2> Ops;
36170 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
36171 return SDValue();
36172
36173 // Attempt to narrow/widen the shuffle mask to the correct size.
36174 if (Mask.size() != NumSrcElts) {
36175 if ((NumSrcElts % Mask.size()) == 0) {
36176 SmallVector<int, 16> ScaledMask;
36177 int Scale = NumSrcElts / Mask.size();
36178 scaleShuffleMask<int>(Scale, Mask, ScaledMask);
36179 Mask = std::move(ScaledMask);
36180 } else if ((Mask.size() % NumSrcElts) == 0) {
36181 // Simplify Mask based on demanded element.
36182 int ExtractIdx = (int)N->getConstantOperandVal(1);
36183 int Scale = Mask.size() / NumSrcElts;
36184 int Lo = Scale * ExtractIdx;
36185 int Hi = Scale * (ExtractIdx + 1);
36186 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
36187 if (i < Lo || Hi <= i)
36188 Mask[i] = SM_SentinelUndef;
36189
36190 SmallVector<int, 16> WidenedMask;
36191 while (Mask.size() > NumSrcElts &&
36192 canWidenShuffleElements(Mask, WidenedMask))
36193 Mask = std::move(WidenedMask);
36194 // TODO - investigate support for wider shuffle masks with known upper
36195 // undef/zero elements for implicit zero-extension.
36196 }
36197 }
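// A small example of the scaling above (purely illustrative): if the
// extract reads a v8i16 but the matched shuffle mask has 4 (v4i32-sized)
// entries <1,3,2,0>, each entry is scaled by 2, giving the v8i16 mask
// <2,3, 6,7, 4,5, 0,1>, so the per-i16 source of every element is known.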
36198
36199 // Check if narrowing/widening failed.
36200 if (Mask.size() != NumSrcElts)
36201 return SDValue();
36202
36203 int SrcIdx = Mask[N->getConstantOperandVal(1)];
36204
36205 // If the shuffle source element is undef/zero then we can just accept it.
36206 if (SrcIdx == SM_SentinelUndef)
36207 return DAG.getUNDEF(VT);
36208
36209 if (SrcIdx == SM_SentinelZero)
36210 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
36211 : DAG.getConstant(0, dl, VT);
36212
36213 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
36214 SrcIdx = SrcIdx % Mask.size();
36215
36216 // We can only extract other elements from 128-bit vectors and in certain
36217 // circumstances, depending on SSE-level.
36218 // TODO: Investigate using extract_subvector for larger vectors.
36219 // TODO: Investigate float/double extraction if it will be just stored.
36220 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
36221 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
36222 assert(SrcSVT == VT && "Unexpected extraction type");
36223 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
36224 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
36225 DAG.getIntPtrConstant(SrcIdx, dl));
36226 }
36227
36228 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
36229 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
36230 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
36231        "Unexpected extraction type");
36232 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
36233 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
36234 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
36235 DAG.getIntPtrConstant(SrcIdx, dl));
36236 return DAG.getZExtOrTrunc(ExtOp, dl, VT);
36237 }
36238
36239 return SDValue();
36240}
36241
36242/// Extracting a scalar FP value from vector element 0 is free, so extract each
36243/// operand first, then perform the math as a scalar op.
36244static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
36245 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
36246 SDValue Vec = ExtElt->getOperand(0);
36247 SDValue Index = ExtElt->getOperand(1);
36248 EVT VT = ExtElt->getValueType(0);
36249 EVT VecVT = Vec.getValueType();
36250
36251 // TODO: If this is a unary/expensive/expand op, allow extraction from a
36252 // non-zero element because the shuffle+scalar op will be cheaper?
36253 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
36254 return SDValue();
36255
36256 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
36257 // extract, the condition code), so deal with those as a special-case.
36258 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
36259 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
36260 if (OpVT != MVT::f32 && OpVT != MVT::f64)
36261 return SDValue();
36262
36263 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
36264 SDLoc DL(ExtElt);
36265 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
36266 Vec.getOperand(0), Index);
36267 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
36268 Vec.getOperand(1), Index);
36269 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
36270 }
36271
36272 if (VT != MVT::f32 && VT != MVT::f64)
36273 return SDValue();
36274
36275 // Vector FP selects don't fit the pattern of FP math ops (because the
36276 // condition has a different type and we have to change the opcode), so deal
36277 // with those here.
36278 // FIXME: This is restricted to pre type legalization by ensuring the setcc
36279 // has i1 elements. If we loosen this we need to convert vector bool to a
36280 // scalar bool.
36281 if (Vec.getOpcode() == ISD::VSELECT &&
36282 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
36283 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
36284 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
36285 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
36286 SDLoc DL(ExtElt);
36287 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
36288 Vec.getOperand(0).getValueType().getScalarType(),
36289 Vec.getOperand(0), Index);
36290 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
36291 Vec.getOperand(1), Index);
36292 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
36293 Vec.getOperand(2), Index);
36294 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
36295 }
36296
36297 // TODO: This switch could include FNEG and the x86-specific FP logic ops
36298 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
36299 // missed load folding and fma+fneg combining.
36300 switch (Vec.getOpcode()) {
36301 case ISD::FMA: // Begin 3 operands
36302 case ISD::FMAD:
36303 case ISD::FADD: // Begin 2 operands
36304 case ISD::FSUB:
36305 case ISD::FMUL:
36306 case ISD::FDIV:
36307 case ISD::FREM:
36308 case ISD::FCOPYSIGN:
36309 case ISD::FMINNUM:
36310 case ISD::FMAXNUM:
36311 case ISD::FMINNUM_IEEE:
36312 case ISD::FMAXNUM_IEEE:
36313 case ISD::FMAXIMUM:
36314 case ISD::FMINIMUM:
36315 case X86ISD::FMAX:
36316 case X86ISD::FMIN:
36317 case ISD::FABS: // Begin 1 operand
36318 case ISD::FSQRT:
36319 case ISD::FRINT:
36320 case ISD::FCEIL:
36321 case ISD::FTRUNC:
36322 case ISD::FNEARBYINT:
36323 case ISD::FROUND:
36324 case ISD::FFLOOR:
36325 case X86ISD::FRCP:
36326 case X86ISD::FRSQRT: {
36327 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
36328 SDLoc DL(ExtElt);
36329 SmallVector<SDValue, 4> ExtOps;
36330 for (SDValue Op : Vec->ops())
36331 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
36332 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
36333 }
36334 default:
36335 return SDValue();
36336 }
36337 llvm_unreachable("All opcodes should return within switch");
36338}
36339
36340/// Try to convert a vector reduction sequence composed of binops and shuffles
36341/// into horizontal ops.
36342static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
36343 const X86Subtarget &Subtarget) {
36344 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
36345
36346 // We need at least SSE2 to do anything here.
36347 if (!Subtarget.hasSSE2())
36348 return SDValue();
36349
36350 ISD::NodeType Opc;
36351 SDValue Rdx =
36352 DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true);
36353 if (!Rdx)
36354 return SDValue();
36355
36356 SDValue Index = ExtElt->getOperand(1);
36357 assert(isNullConstant(Index) &&
36358        "Reduction doesn't end in an extract from index 0");
36359
36360 EVT VT = ExtElt->getValueType(0);
36361 EVT VecVT = Rdx.getValueType();
36362 if (VecVT.getScalarType() != VT)
36363 return SDValue();
36364
36365 SDLoc DL(ExtElt);
36366
36367 // vXi8 reduction - sub-128-bit vector.
36368 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
36369 if (VecVT == MVT::v4i8) {
36370 // Pad with zero.
36371 if (Subtarget.hasSSE41()) {
36372 Rdx = DAG.getBitcast(MVT::i32, Rdx);
36373 Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
36374 DAG.getConstant(0, DL, MVT::v4i32), Rdx,
36375 DAG.getIntPtrConstant(0, DL));
36376 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
36377 } else {
36378 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
36379 DAG.getConstant(0, DL, VecVT));
36380 }
36381 }
36382 if (Rdx.getValueType() == MVT::v8i8) {
36383 // Pad with undef.
36384 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
36385 DAG.getUNDEF(MVT::v8i8));
36386 }
36387 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
36388 DAG.getConstant(0, DL, MVT::v16i8));
36389 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
36390 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
36391 }
36392
36393 // Must be a >=128-bit vector with pow2 elements.
36394 if ((VecVT.getSizeInBits() % 128) != 0 ||
36395 !isPowerOf2_32(VecVT.getVectorNumElements()))
36396 return SDValue();
36397
36398 // vXi8 reduction - sum lo/hi halves then use PSADBW.
36399 if (VT == MVT::i8) {
36400 while (Rdx.getValueSizeInBits() > 128) {
36401 unsigned HalfSize = VecVT.getSizeInBits() / 2;
36402 unsigned HalfElts = VecVT.getVectorNumElements() / 2;
36403 SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize);
36404 SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize);
36405 Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi);
36406 VecVT = Rdx.getValueType();
36407 }
36408 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
36409
36410 SDValue Hi = DAG.getVectorShuffle(
36411 MVT::v16i8, DL, Rdx, Rdx,
36412 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
36413 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
36414 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
36415 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
36416 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
36417 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
36418 }
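// The PSADBW trick used above, for illustration: psadbw(x, 0) sums
// |b - 0| = b over each group of 8 bytes, so a v16i8 input collapses to
// v2i64 <b0+..+b7, b8+..+b15>. Adding the shuffled-down upper half first
// means the low byte of lane 0 equals the full byte sum modulo 256, which
// is exactly the i8 reduction result extracted below.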
36419
36420 // Only use (F)HADD opcodes if they aren't microcoded or when minimizing codesize.
36421 bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
36422 if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
36423 return SDValue();
36424
36425 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
36426
36427 // 256-bit horizontal instructions operate on 128-bit chunks rather than
36428 // across the whole vector, so we need an extract + hop preliminary stage.
36429 // This is the only step where the operands of the hop are not the same value.
36430 // TODO: We could extend this to handle 512-bit or even longer vectors.
36431 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
36432 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
36433 unsigned NumElts = VecVT.getVectorNumElements();
36434 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
36435 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
36436 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
36437 VecVT = Rdx.getValueType();
36438 }
36439 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
36440 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
36441 return SDValue();
36442
36443 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
36444 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
36445 for (unsigned i = 0; i != ReductionSteps; ++i)
36446 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
36447
36448 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
36449}
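// For example (illustrative, SSSE3 integer case): a v4i32 add reduction of
// <a,b,c,d> takes Log2(4) == 2 HADD steps:
//   t = PHADDD x, x    // <a+b, c+d, a+b, c+d>
//   t = PHADDD t, t    // <a+b+c+d, ...>
//   r = EXTRACT_VECTOR_ELT t, 0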
36450
36451/// Detect vector gather/scatter index generation and convert it from being a
36452/// bunch of shuffles and extracts into a somewhat faster sequence.
36453/// For i686, the best sequence is apparently storing the value and loading
36454/// scalars back, while for x64 we should use 64-bit extracts and shifts.
36455static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
36456 TargetLowering::DAGCombinerInfo &DCI,
36457 const X86Subtarget &Subtarget) {
36458 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
36459 return NewOp;
36460
36461 SDValue InputVector = N->getOperand(0);
36462 SDValue EltIdx = N->getOperand(1);
36463 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
36464
36465 EVT SrcVT = InputVector.getValueType();
36466 EVT VT = N->getValueType(0);
36467 SDLoc dl(InputVector);
36468 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
36469
36470 if (CIdx && CIdx->getAPIntValue().uge(SrcVT.getVectorNumElements()))
36471 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
36472
36473 // Integer Constant Folding.
36474 if (CIdx && VT.isInteger()) {
36475 APInt UndefVecElts;
36476 SmallVector<APInt, 16> EltBits;
36477 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
36478 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
36479 EltBits, true, false)) {
36480 uint64_t Idx = CIdx->getZExtValue();
36481 if (UndefVecElts[Idx])
36482 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
36483 return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
36484 dl, VT);
36485 }
36486 }
36487
36488 if (IsPextr) {
36489 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36490 if (TLI.SimplifyDemandedBits(
36491 SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
36492 return SDValue(N, 0);
36493
36494 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
36495 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
36496 InputVector.getOpcode() == X86ISD::PINSRW) &&
36497 InputVector.getOperand(2) == EltIdx) {
36498 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
36499        "Vector type mismatch");
36500 SDValue Scl = InputVector.getOperand(1);
36501 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
36502 return DAG.getZExtOrTrunc(Scl, dl, VT);
36503 }
36504
36505 // TODO - Remove this once we can handle the implicit zero-extension of
36506 // X86ISD::PEXTRW/X86ISD::PEXTRB in XFormVExtractWithShuffleIntoLoad,
36507 // combineHorizontalPredicateResult and combineBasicSADPattern.
36508 return SDValue();
36509 }
36510
36511 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
36512 return NewOp;
36513
36514 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
36515 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
36516 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
36517 SDValue MMXSrc = InputVector.getOperand(0);
36518
36519 // The bitcast source is a direct mmx result.
36520 if (MMXSrc.getValueType() == MVT::x86mmx)
36521 return DAG.getBitcast(VT, InputVector);
36522 }
36523
36524 // Detect mmx to i32 conversion through a v2i32 elt extract.
36525 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
36526 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
36527 SDValue MMXSrc = InputVector.getOperand(0);
36528
36529 // The bitcast source is a direct mmx result.
36530 if (MMXSrc.getValueType() == MVT::x86mmx)
36531 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
36532 }
36533
36534 // Check whether this extract is the root of a sum of absolute differences
36535 // pattern. This has to be done here because we really want it to happen
36536 // pre-legalization.
36537 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
36538 return SAD;
36539
36540 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
36541 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
36542 return Cmp;
36543
36544 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
36545 if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
36546 return MinMax;
36547
36548 if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
36549 return V;
36550
36551 if (SDValue V = scalarizeExtEltFP(N, DAG))
36552 return V;
36553
36554 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
36555 // and then testing the relevant element.
36556 if (CIdx && SrcVT.getScalarType() == MVT::i1) {
36557 SmallVector<SDNode *, 16> BoolExtracts;
36558 auto IsBoolExtract = [&BoolExtracts](SDNode *Use) {
36559 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
36560 isa<ConstantSDNode>(Use->getOperand(1)) &&
36561 Use->getValueType(0) == MVT::i1) {
36562 BoolExtracts.push_back(Use);
36563 return true;
36564 }
36565 return false;
36566 };
36567 if (all_of(InputVector->uses(), IsBoolExtract) &&
36568 BoolExtracts.size() > 1) {
36569 unsigned NumSrcElts = SrcVT.getVectorNumElements();
36570 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
36571 if (SDValue BC =
36572 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
36573 for (SDNode *Use : BoolExtracts) {
36574 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
36575 unsigned MaskIdx = Use->getConstantOperandVal(1);
36576 APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
36577 SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
36578 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
36579 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
36580 DCI.CombineTo(Use, Res);
36581 }
36582 return SDValue(N, 0);
36583 }
36584 }
36585 }
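// For example (illustrative): with a v8i1 source extracted at indices 2
// and 5, BC is the 8-bit movmsk-style value and the two extracts become
//   ((BC & 0x04) == 0x04) and ((BC & 0x20) == 0x20)
// so a single MOVMSK feeds every i1 extract of the vector.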
36586
36587 return SDValue();
36588}
36589
36590/// If a vector select has an operand that is -1 or 0, try to simplify the
36591/// select to a bitwise logic operation.
36592/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
36593static SDValue
36594combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
36595 TargetLowering::DAGCombinerInfo &DCI,
36596 const X86Subtarget &Subtarget) {
36597 SDValue Cond = N->getOperand(0);
36598 SDValue LHS = N->getOperand(1);
36599 SDValue RHS = N->getOperand(2);
36600 EVT VT = LHS.getValueType();
36601 EVT CondVT = Cond.getValueType();
36602 SDLoc DL(N);
36603 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36604
36605 if (N->getOpcode() != ISD::VSELECT)
36606 return SDValue();
36607
36608 assert(CondVT.isVector() && "Vector select expects a vector selector!");
36609
36610 // Check if the first operand is all zeros and Cond type is vXi1.
36611 // This situation only applies to avx512.
36612 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
36613 // TODO: Can we assert that both operands are not zeros (because that should
36614 // get simplified at node creation time)?
36615 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
36616 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
36617
36618 // If both inputs are 0/undef, create a complete zero vector.
36619 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
36620 if (TValIsAllZeros && FValIsAllZeros) {
36621 if (VT.isFloatingPoint())
36622 return DAG.getConstantFP(0.0, DL, VT);
36623 return DAG.getConstant(0, DL, VT);
36624 }
36625
36626 if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() &&
36627 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) {
36628 // Invert the cond to not(cond) : xor(op,allones)=not(op)
36629 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
36630 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
36631 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
36632 }
36633
36634 // To use the condition operand as a bitwise mask, it must have elements that
36635 // are the same size as the select elements. I.e., the condition operand must
36636 // have already been promoted from the IR select condition type <N x i1>.
36637 // Don't check if the types themselves are equal because that excludes
36638 // vector floating-point selects.
36639 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
36640 return SDValue();
36641
36642 // Try to invert the condition if true value is not all 1s and false value is
36643 // not all 0s. Only do this if the condition has one use.
36644 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
36645 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
36646 // Check if the selector will be produced by CMPP*/PCMP*.
36647 Cond.getOpcode() == ISD::SETCC &&
36648 // Check if SETCC has already been promoted.
36649 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
36650 CondVT) {
36651 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
36652
36653 if (TValIsAllZeros || FValIsAllOnes) {
36654 SDValue CC = Cond.getOperand(2);
36655 ISD::CondCode NewCC =
36656 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
36657 Cond.getOperand(0).getValueType().isInteger());
36658 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
36659 NewCC);
36660 std::swap(LHS, RHS);
36661 TValIsAllOnes = FValIsAllOnes;
36662 FValIsAllZeros = TValIsAllZeros;
36663 }
36664 }
36665
36666 // Cond value must be 'sign splat' to be converted to a logical op.
36667 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
36668 return SDValue();
36669
36670 // vselect Cond, 111..., 000... -> Cond
36671 if (TValIsAllOnes && FValIsAllZeros)
36672 return DAG.getBitcast(VT, Cond);
36673
36674 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
36675 return SDValue();
36676
36677 // vselect Cond, 111..., X -> or Cond, X
36678 if (TValIsAllOnes) {
36679 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
36680 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
36681 return DAG.getBitcast(VT, Or);
36682 }
36683
36684 // vselect Cond, X, 000... -> and Cond, X
36685 if (FValIsAllZeros) {
36686 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
36687 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
36688 return DAG.getBitcast(VT, And);
36689 }
36690
36691 // vselect Cond, 000..., X -> andn Cond, X
36692 if (TValIsAllZeros) {
36693 MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
36694 SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
36695 SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
36696 SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
36697 return DAG.getBitcast(VT, AndN);
36698 }
36699
36700 return SDValue();
36701}
36702
36703/// If both arms of a vector select are concatenated vectors, split the select,
36704/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
36705/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
36706/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
36707static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
36708 const X86Subtarget &Subtarget) {
36709 unsigned Opcode = N->getOpcode();
36710 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
36711 return SDValue();
36712
36713 // TODO: Split 512-bit vectors too?
36714 EVT VT = N->getValueType(0);
36715 if (!VT.is256BitVector())
36716 return SDValue();
36717
36718 // TODO: Split as long as any 2 of the 3 operands are concatenated?
36719 SDValue Cond = N->getOperand(0);
36720 SDValue TVal = N->getOperand(1);
36721 SDValue FVal = N->getOperand(2);
36722 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
36723 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
36724 !collectConcatOps(TVal.getNode(), CatOpsT) ||
36725 !collectConcatOps(FVal.getNode(), CatOpsF))
36726 return SDValue();
36727
36728 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
36729 ArrayRef<SDValue> Ops) {
36730 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
36731 };
36732 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
36733 makeBlend, /*CheckBWI*/ false);
36734}
36735
36736static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
36737 SDValue Cond = N->getOperand(0);
36738 SDValue LHS = N->getOperand(1);
36739 SDValue RHS = N->getOperand(2);
36740 SDLoc DL(N);
36741
36742 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
36743 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
36744 if (!TrueC || !FalseC)
36745 return SDValue();
36746
36747 // Don't do this for crazy integer types.
36748 EVT VT = N->getValueType(0);
36749 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
36750 return SDValue();
36751
36752 // We're going to use the condition bit in math or logic ops. We could allow
36753 // this with a wider condition value (post-legalization it becomes an i8),
36754 // but if nothing is creating selects that late, it doesn't matter.
36755 if (Cond.getValueType() != MVT::i1)
36756 return SDValue();
36757
36758 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
36759 // 3, 5, or 9 with i32/i64, so those get transformed too.
36760 // TODO: For constants that overflow or do not differ by power-of-2 or small
36761 // multiplier, convert to 'and' + 'add'.
36762 const APInt &TrueVal = TrueC->getAPIntValue();
36763 const APInt &FalseVal = FalseC->getAPIntValue();
36764 bool OV;
36765 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
36766 if (OV)
36767 return SDValue();
36768
36769 APInt AbsDiff = Diff.abs();
36770 if (AbsDiff.isPowerOf2() ||
36771 ((VT == MVT::i32 || VT == MVT::i64) &&
36772 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
36773
36774 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
36775 // of the condition can usually be folded into a compare predicate, but even
36776 // without that, the sequence should be cheaper than a CMOV alternative.
36777 if (TrueVal.slt(FalseVal)) {
36778 Cond = DAG.getNOT(DL, Cond, MVT::i1);
36779 std::swap(TrueC, FalseC);
36780 }
36781
36782 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
36783 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
36784
36785 // Multiply condition by the difference if non-one.
36786 if (!AbsDiff.isOneValue())
36787 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
36788
36789 // Add the base if non-zero.
36790 if (!FalseC->isNullValue())
36791 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
36792
36793 return R;
36794 }
36795
36796 return SDValue();
36797}
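// A concrete instance of the transform above (illustrative):
//   select Cond, 7, 3 --> (zext(Cond) * 4) + 3
// i.e. a shift (or LEA) plus an add instead of a compare + CMOV, since the
// constants differ by a power of 2.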
36798
36799/// If this is a *dynamic* select (non-constant condition) and we can match
36800/// this node with one of the variable blend instructions, restructure the
36801/// condition so that blends can use the high (sign) bit of each element.
36802/// This function will also call SimplifyDemandedBits on already created
36803/// BLENDV to perform additional simplifications.
36804static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
36805 TargetLowering::DAGCombinerInfo &DCI,
36806 const X86Subtarget &Subtarget) {
36807 SDValue Cond = N->getOperand(0);
36808 if ((N->getOpcode() != ISD::VSELECT &&
36809 N->getOpcode() != X86ISD::BLENDV) ||
36810 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
36811 return SDValue();
36812
36813 // Don't optimize before the condition has been transformed to a legal type
36814 // and don't ever optimize vector selects that map to AVX512 mask-registers.
36815 unsigned BitWidth = Cond.getScalarValueSizeInBits();
36816 if (BitWidth < 8 || BitWidth > 64)
36817 return SDValue();
36818
36819 // We can only handle the cases where VSELECT is directly legal on the
36820 // subtarget. We custom lower VSELECT nodes with constant conditions and
36821 // this makes it hard to see whether a dynamic VSELECT will correctly
36822 // lower, so we both check the operation's status and explicitly handle the
36823 // cases where a *dynamic* blend will fail even though a constant-condition
36824 // blend could be custom lowered.
36825 // FIXME: We should find a better way to handle this class of problems.
36826 // Potentially, we should combine constant-condition vselect nodes
36827 // pre-legalization into shuffles and not mark as many types as custom
36828 // lowered.
36829 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36830 EVT VT = N->getValueType(0);
36831 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
36832 return SDValue();
36833 // FIXME: We don't support i16-element blends currently. We could and
36834 // should support them by making *all* the bits in the condition be set
36835 // rather than just the high bit and using an i8-element blend.
36836 if (VT.getVectorElementType() == MVT::i16)
36837 return SDValue();
36838 // Dynamic blending was only available from SSE4.1 onward.
36839 if (VT.is128BitVector() && !Subtarget.hasSSE41())
36840 return SDValue();
36841 // Byte blends are only available in AVX2
36842 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
36843 return SDValue();
36844 // There are no 512-bit blend instructions that use sign bits.
36845 if (VT.is512BitVector())
36846 return SDValue();
36847
36848 // TODO: Add other opcodes eventually lowered into BLEND.
36849 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
36850 UI != UE; ++UI)
36851 if ((UI->getOpcode() != ISD::VSELECT &&
36852 UI->getOpcode() != X86ISD::BLENDV) ||
36853 UI.getOperandNo() != 0)
36854 return SDValue();
36855
36856 APInt DemandedMask(APInt::getSignMask(BitWidth));
36857 KnownBits Known;
36858 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
36859 !DCI.isBeforeLegalizeOps());
36860 if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
36861 return SDValue();
36862
36863 // If we changed the computation somewhere in the DAG, this change will
36864 // affect all users of Cond. Update all the nodes so that we do not use
36865 // the generic VSELECT anymore. Otherwise, we may perform wrong
36866 // optimizations as we messed with the actual expectation for the vector
36867 // boolean values.
36868 for (SDNode *U : Cond->uses()) {
36869 if (U->getOpcode() == X86ISD::BLENDV)
36870 continue;
36871
36872 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
36873 Cond, U->getOperand(1), U->getOperand(2));
36874 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
36875 DCI.AddToWorklist(U);
36876 }
36877 DCI.CommitTargetLoweringOpt(TLO);
36878 return SDValue(N, 0);
36879}
36880
36881/// Do target-specific dag combines on SELECT and VSELECT nodes.
36882static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
36883 TargetLowering::DAGCombinerInfo &DCI,
36884 const X86Subtarget &Subtarget) {
36885 SDLoc DL(N);
36886 SDValue Cond = N->getOperand(0);
36887 SDValue LHS = N->getOperand(1);
36888 SDValue RHS = N->getOperand(2);
36889
36890 // Try simplification again because we use this function to optimize
36891 // BLENDV nodes that are not handled by the generic combiner.
36892 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
36893 return V;
36894
36895 EVT VT = LHS.getValueType();
36896 EVT CondVT = Cond.getValueType();
36897 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36898
36899 // Convert vselects with constant condition into shuffles.
36900 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
36901 DCI.isBeforeLegalizeOps()) {
36902 SmallVector<int, 64> Mask;
36903 if (createShuffleMaskFromVSELECT(Mask, Cond))
36904 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
36905 }
36906
36907 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
36908 // instructions match the semantics of the common C idiom x<y?x:y but not
36909 // x<=y?x:y, because of how they handle negative zero (which can be
36910 // ignored in unsafe-math mode).
36911 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
36912 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
36913 VT != MVT::f80 && VT != MVT::f128 &&
36914 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
36915 (Subtarget.hasSSE2() ||
36916 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
36917 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
36918
36919 unsigned Opcode = 0;
36920 // Check for x CC y ? x : y.
36921 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
36922 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
36923 switch (CC) {
36924 default: break;
36925 case ISD::SETULT:
36926 // Converting this to a min would handle NaNs incorrectly, and swapping
36927 // the operands would cause it to handle comparisons between positive
36928 // and negative zero incorrectly.
36929 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
36930 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
36931 !(DAG.isKnownNeverZeroFloat(LHS) ||
36932 DAG.isKnownNeverZeroFloat(RHS)))
36933 break;
36934 std::swap(LHS, RHS);
36935 }
36936 Opcode = X86ISD::FMIN;
36937 break;
36938 case ISD::SETOLE:
36939 // Converting this to a min would handle comparisons between positive
36940 // and negative zero incorrectly.
36941 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
36942 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
36943 break;
36944 Opcode = X86ISD::FMIN;
36945 break;
36946 case ISD::SETULE:
36947 // Converting this to a min would handle both negative zeros and NaNs
36948 // incorrectly, but we can swap the operands to fix both.
36949 std::swap(LHS, RHS);
36950 LLVM_FALLTHROUGH;
36951 case ISD::SETOLT:
36952 case ISD::SETLT:
36953 case ISD::SETLE:
36954 Opcode = X86ISD::FMIN;
36955 break;
36956
36957 case ISD::SETOGE:
36958 // Converting this to a max would handle comparisons between positive
36959 // and negative zero incorrectly.
36960 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
36961 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
36962 break;
36963 Opcode = X86ISD::FMAX;
36964 break;
36965 case ISD::SETUGT:
36966 // Converting this to a max would handle NaNs incorrectly, and swapping
36967 // the operands would cause it to handle comparisons between positive
36968 // and negative zero incorrectly.
36969 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
36970 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
36971 !(DAG.isKnownNeverZeroFloat(LHS) ||
36972 DAG.isKnownNeverZeroFloat(RHS)))
36973 break;
36974 std::swap(LHS, RHS);
36975 }
36976 Opcode = X86ISD::FMAX;
36977 break;
36978 case ISD::SETUGE:
36979 // Converting this to a max would handle both negative zeros and NaNs
36980 // incorrectly, but we can swap the operands to fix both.
36981 std::swap(LHS, RHS);
36982 LLVM_FALLTHROUGH;
36983 case ISD::SETOGT:
36984 case ISD::SETGT:
36985 case ISD::SETGE:
36986 Opcode = X86ISD::FMAX;
36987 break;
36988 }
36989 // Check for x CC y ? y : x -- a min/max with reversed arms.
36990 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
36991 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
36992 switch (CC) {
36993 default: break;
36994 case ISD::SETOGE:
36995 // Converting this to a min would handle comparisons between positive
36996 // and negative zero incorrectly, and swapping the operands would
36997 // cause it to handle NaNs incorrectly.
36998 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
36999 !(DAG.isKnownNeverZeroFloat(LHS) ||
37000 DAG.isKnownNeverZeroFloat(RHS))) {
37001 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
37002 break;
37003 std::swap(LHS, RHS);
37004 }
37005 Opcode = X86ISD::FMIN;
37006 break;
37007 case ISD::SETUGT:
37008 // Converting this to a min would handle NaNs incorrectly.
37009 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
37010 break;
37011 Opcode = X86ISD::FMIN;
37012 break;
37013 case ISD::SETUGE:
37014 // Converting this to a min would handle both negative zeros and NaNs
37015 // incorrectly, but we can swap the operands to fix both.
37016 std::swap(LHS, RHS);
37017 LLVM_FALLTHROUGH;
37018 case ISD::SETOGT:
37019 case ISD::SETGT:
37020 case ISD::SETGE:
37021 Opcode = X86ISD::FMIN;
37022 break;
37023
37024 case ISD::SETULT:
37025 // Converting this to a max would handle NaNs incorrectly.
37026 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
37027 break;
37028 Opcode = X86ISD::FMAX;
37029 break;
37030 case ISD::SETOLE:
37031 // Converting this to a max would handle comparisons between positive
37032 // and negative zero incorrectly, and swapping the operands would
37033 // cause it to handle NaNs incorrectly.
37034 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
37035 !DAG.isKnownNeverZeroFloat(LHS) &&
37036 !DAG.isKnownNeverZeroFloat(RHS)) {
37037 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
37038 break;
37039 std::swap(LHS, RHS);
37040 }
37041 Opcode = X86ISD::FMAX;
37042 break;
37043 case ISD::SETULE:
37044 // Converting this to a max would handle both negative zeros and NaNs
37045 // incorrectly, but we can swap the operands to fix both.
37046 std::swap(LHS, RHS);
37047 LLVM_FALLTHROUGH;
37048 case ISD::SETOLT:
37049 case ISD::SETLT:
37050 case ISD::SETLE:
37051 Opcode = X86ISD::FMAX;
37052 break;
37053 }
37054 }
37055
37056 if (Opcode)
37057 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
37058 }
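// Informal note on the swaps/bails above: X86ISD::FMIN/FMAX lower to the
// MINSS/MAXSS (or packed) family, which return the second source operand
// when either input is NaN or when both inputs are zeros of either sign.
// E.g. x < y ? x : y with SETOLT maps directly to FMIN(x, y), while the
// unordered and signed-zero cases need an operand swap or must bail.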
37059
37060 // Some mask scalar intrinsics rely on checking if only one bit is set
37061 // and implement it in C code like this:
37062 // A[0] = (U & 1) ? A[0] : W[0];
37063 // This creates some redundant instructions that break pattern matching.
37064 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
37065 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
37066 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
37067 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
37068 SDValue AndNode = Cond.getOperand(0);
37069 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
37070 isNullConstant(Cond.getOperand(1)) &&
37071 isOneConstant(AndNode.getOperand(1))) {
37072 // LHS and RHS swapped due to
37073 // setcc outputting 1 when AND resulted in 0 and vice versa.
37074 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
37075 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
37076 }
37077 }
37078
37079 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
37080 // lowering on KNL. In this case we convert it to
37081 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
37082 // The same situation applies to all vectors of i8 and i16 without BWI.
37083 // Make sure we extend these even before type legalization gets a chance to
37084 // split wide vectors.
37085 // Since SKX these selects have a proper lowering.
37086 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
37087 CondVT.getVectorElementType() == MVT::i1 &&
37088 (VT.getVectorElementType() == MVT::i8 ||
37089 VT.getVectorElementType() == MVT::i16)) {
37090 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
37091 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
37092 }
37093
37094 // AVX512 - Extend select with zero to merge with target shuffle.
37095 // select(mask, extract_subvector(shuffle(x)), zero) -->
37096 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
37097 // TODO - support non target shuffles as well.
37098 if (Subtarget.hasAVX512() && CondVT.isVector() &&
37099 CondVT.getVectorElementType() == MVT::i1) {
37100 auto SelectableOp = [&TLI](SDValue Op) {
37101 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
37102 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
37103 isNullConstant(Op.getOperand(1)) &&
37104 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
37105 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
37106 };
37107
37108 bool SelectableLHS = SelectableOp(LHS);
37109 bool SelectableRHS = SelectableOp(RHS);
37110 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
37111 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
37112
37113 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
37114 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
37115 : RHS.getOperand(0).getValueType();
37116 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37117 EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts);
37118 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
37119 VT.getSizeInBits());
37120 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
37121 VT.getSizeInBits());
37122 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
37123 DAG.getUNDEF(SrcCondVT), Cond,
37124 DAG.getIntPtrConstant(0, DL));
37125 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
37126 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
37127 }
37128 }
37129
37130 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
37131 return V;
37132
37133 // Canonicalize max and min:
37134 // (x > y) ? x : y -> (x >= y) ? x : y
37135 // (x < y) ? x : y -> (x <= y) ? x : y
37136 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
37137 // the need for an extra compare
37138 // against zero. e.g.
37139 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
37140 // subl %esi, %edi
37141 // testl %edi, %edi
37142 // movl $0, %eax
37143 // cmovgl %edi, %eax
37144 // =>
37145 // xorl %eax, %eax
37146 // subl %esi, %edi
37147 // cmovsl %eax, %edi
37148 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
37149 Cond.hasOneUse() &&
37150 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
37151 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
37152 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
37153 switch (CC) {
37154 default: break;
37155 case ISD::SETLT:
37156 case ISD::SETGT: {
37157 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
37158 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
37159 Cond.getOperand(0), Cond.getOperand(1), NewCC);
37160 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
37161 }
37162 }
37163 }
37164
37165 // Match VSELECTs into subs with unsigned saturation.
37166 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
37167 // psubus is available in SSE2 for i8 and i16 vectors.
37168 Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
37169 isPowerOf2_32(VT.getVectorNumElements()) &&
37170 (VT.getVectorElementType() == MVT::i8 ||
37171 VT.getVectorElementType() == MVT::i16)) {
37172 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
37173
37174 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
37175 // left side invert the predicate to simplify logic below.
37176 SDValue Other;
37177 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
37178 Other = RHS;
37179 CC = ISD::getSetCCInverse(CC, true);
37180 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
37181 Other = LHS;
37182 }
37183
37184 if (Other.getNode() && Other->getNumOperands() == 2 &&
37185 Other->getOperand(0) == Cond.getOperand(0)) {
37186 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
37187 SDValue CondRHS = Cond->getOperand(1);
37188
37189 // Look for a general sub with unsigned saturation first.
37190 // x >= y ? x-y : 0 --> subus x, y
37191 // x > y ? x-y : 0 --> subus x, y
37192 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
37193 Other->getOpcode() == ISD::SUB && OpRHS == CondRHS)
37194 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
37195
37196 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) {
37197 if (isa<BuildVectorSDNode>(CondRHS)) {
37198 // If the RHS is a constant we have to reverse the const
37199 // canonicalization.
37200 // x > C-1 ? x+-C : 0 --> subus x, C
37201 auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
37202 return (!Op && !Cond) ||
37203 (Op && Cond &&
37204 Cond->getAPIntValue() == (-Op->getAPIntValue() - 1));
37205 };
37206 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
37207 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT,
37208 /*AllowUndefs*/ true)) {
37209 OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
37210 OpRHS);
37211 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
37212 }
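          // Illustrative only (assuming v16i8 and C == 10): the canonicalized
          // pattern is "x > 9 ? x + 246 : 0", where 246 == (uint8_t)-10.
          // MatchUSUBSAT holds because 9 == -(-10) - 1, OpRHS is rebuilt as
          // 0 - 246 == 10, and the whole select becomes (usubsat x, 10).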
37213
37214 // Another special case: If C was a sign bit, the sub has been
37215 // canonicalized into a xor.
37216 // FIXME: Would it be better to use computeKnownBits to determine
37217 // whether it's safe to decanonicalize the xor?
37218 // x s< 0 ? x^C : 0 --> subus x, C
37219 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
37220 if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
37221 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
37222 OpRHSConst->getAPIntValue().isSignMask()) {
37223 // Note that we have to rebuild the RHS constant here to ensure we
37224 // don't rely on particular values of undef lanes.
37225 OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
37226 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
37227 }
37228 }
37229 }
37230 }
37231 }
37232 }
37233
37234 // Match VSELECTs into add with unsigned saturation.
37235 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
37236 // paddus is available in SSE2 for i8 and i16 vectors.
37237 Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
37238 isPowerOf2_32(VT.getVectorNumElements()) &&
37239 (VT.getVectorElementType() == MVT::i8 ||
37240 VT.getVectorElementType() == MVT::i16)) {
37241 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
37242
37243 SDValue CondLHS = Cond->getOperand(0);
37244 SDValue CondRHS = Cond->getOperand(1);
37245
37246     // Check if one of the arms of the VSELECT is a vector with all bits set.
37247 // If it's on the left side invert the predicate to simplify logic below.
37248 SDValue Other;
37249 if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
37250 Other = RHS;
37251 CC = ISD::getSetCCInverse(CC, true);
37252 } else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
37253 Other = LHS;
37254 }
37255
37256 if (Other.getNode() && Other.getOpcode() == ISD::ADD) {
37257 SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
37258
37259 // Canonicalize condition operands.
37260 if (CC == ISD::SETUGE) {
37261 std::swap(CondLHS, CondRHS);
37262 CC = ISD::SETULE;
37263 }
37264
37265 // We can test against either of the addition operands.
37266 // x <= x+y ? x+y : ~0 --> addus x, y
37267 // x+y >= x ? x+y : ~0 --> addus x, y
37268 if (CC == ISD::SETULE && Other == CondRHS &&
37269 (OpLHS == CondLHS || OpRHS == CondLHS))
37270 return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
37271
37272 if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) &&
37273 CondLHS == OpLHS) {
37274 // If the RHS is a constant we have to reverse the const
37275 // canonicalization.
37276 // x > ~C ? x+C : ~0 --> addus x, C
37277 auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
37278 return Cond->getAPIntValue() == ~Op->getAPIntValue();
37279 };
37280 if (CC == ISD::SETULE &&
37281 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
37282 return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
37283 }
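      // Illustrative only (assuming v16i8 and C == 20): after the SETUGE
      // canonicalization the pattern is "x <= 235 ? x + 20 : ~0", where
      // 235 == (uint8_t)~20, so MatchUADDSAT holds and the select becomes
      // (uaddsat x, 20).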
37284 }
37285 }
37286
37287 // Early exit check
37288 if (!TLI.isTypeLegal(VT))
37289 return SDValue();
37290
37291 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
37292 return V;
37293
37294 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
37295 return V;
37296
37297 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
37298 return V;
37299
37300 // select(~Cond, X, Y) -> select(Cond, Y, X)
37301 if (CondVT.getScalarType() != MVT::i1)
37302 if (SDValue CondNot = IsNOT(Cond, DAG))
37303 return DAG.getNode(N->getOpcode(), DL, VT,
37304 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
37305
37306 // Custom action for SELECT MMX
37307 if (VT == MVT::x86mmx) {
37308 LHS = DAG.getBitcast(MVT::i64, LHS);
37309 RHS = DAG.getBitcast(MVT::i64, RHS);
37310 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
37311 return DAG.getBitcast(VT, newSelect);
37312 }
37313
37314 return SDValue();
37315}
37316
37317/// Combine:
37318/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
37319/// to:
37320/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
37321/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
37322/// Note that this is only legal for some op/cc combinations.
37323static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
37324 SelectionDAG &DAG,
37325 const X86Subtarget &Subtarget) {
37326 // This combine only operates on CMP-like nodes.
37327 if (!(Cmp.getOpcode() == X86ISD::CMP ||
37328 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
37329 return SDValue();
37330
37331 // Can't replace the cmp if it has more uses than the one we're looking at.
37332 // FIXME: We would like to be able to handle this, but would need to make sure
37333 // all uses were updated.
37334 if (!Cmp.hasOneUse())
37335 return SDValue();
37336
37337 // This only applies to variations of the common case:
37338 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
37339 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
37340 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
37341 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
37342 // Using the proper condcodes (see below), overflow is checked for.
37343
37344 // FIXME: We can generalize both constraints:
37345 // - XOR/OR/AND (if they were made to survive AtomicExpand)
37346 // - LHS != 1
37347 // if the result is compared.
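  // As a rough C-level sketch of the intended effect: for
  // "if (atomic_fetch_add(&v, 1) < 0)" the COND_S test of the loaded value
  // becomes a COND_LE test of the EFLAGS already produced by the LOCKed add,
  // so no separate register copy and compare are needed.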
37348
37349 SDValue CmpLHS = Cmp.getOperand(0);
37350 SDValue CmpRHS = Cmp.getOperand(1);
37351
37352 if (!CmpLHS.hasOneUse())
37353 return SDValue();
37354
37355 unsigned Opc = CmpLHS.getOpcode();
37356 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
37357 return SDValue();
37358
37359 SDValue OpRHS = CmpLHS.getOperand(2);
37360 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
37361 if (!OpRHSC)
37362 return SDValue();
37363
37364 APInt Addend = OpRHSC->getAPIntValue();
37365 if (Opc == ISD::ATOMIC_LOAD_SUB)
37366 Addend = -Addend;
37367
37368 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
37369 if (!CmpRHSC)
37370 return SDValue();
37371
37372 APInt Comparison = CmpRHSC->getAPIntValue();
37373
37374 // If the addend is the negation of the comparison value, then we can do
37375 // a full comparison by emitting the atomic arithmetic as a locked sub.
37376 if (Comparison == -Addend) {
37377 // The CC is fine, but we need to rewrite the LHS of the comparison as an
37378 // atomic sub.
37379 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
37380 auto AtomicSub = DAG.getAtomic(
37381 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
37382 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
37383 /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
37384 AN->getMemOperand());
37385 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
37386 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
37387 DAG.getUNDEF(CmpLHS.getValueType()));
37388 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
37389 return LockOp;
37390 }
37391
37392 // We can handle comparisons with zero in a number of cases by manipulating
37393 // the CC used.
37394 if (!Comparison.isNullValue())
37395 return SDValue();
37396
37397 if (CC == X86::COND_S && Addend == 1)
37398 CC = X86::COND_LE;
37399 else if (CC == X86::COND_NS && Addend == 1)
37400 CC = X86::COND_G;
37401 else if (CC == X86::COND_G && Addend == -1)
37402 CC = X86::COND_GE;
37403 else if (CC == X86::COND_LE && Addend == -1)
37404 CC = X86::COND_L;
37405 else
37406 return SDValue();
37407
37408 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
37409 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
37410 DAG.getUNDEF(CmpLHS.getValueType()));
37411 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
37412 return LockOp;
37413}
37414
37415// Check whether a boolean test is testing a boolean value generated by
37416// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
37417// code.
37418//
37419// Simplify the following patterns:
37420// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
37421// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
37422// to (Op EFLAGS Cond)
37423//
37424// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
37425// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
37426// to (Op EFLAGS !Cond)
37427//
37428// where Op could be BRCOND or CMOV.
37429//
37430static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
37431 // This combine only operates on CMP-like nodes.
37432 if (!(Cmp.getOpcode() == X86ISD::CMP ||
37433 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
37434 return SDValue();
37435
37436 // Quit if not used as a boolean value.
37437 if (CC != X86::COND_E && CC != X86::COND_NE)
37438 return SDValue();
37439
37440 // Check CMP operands. One of them should be 0 or 1 and the other should be
37441   // a SetCC or extended from it.
37442 SDValue Op1 = Cmp.getOperand(0);
37443 SDValue Op2 = Cmp.getOperand(1);
37444
37445 SDValue SetCC;
37446 const ConstantSDNode* C = nullptr;
37447 bool needOppositeCond = (CC == X86::COND_E);
37448 bool checkAgainstTrue = false; // Is it a comparison against 1?
37449
37450 if ((C = dyn_cast<ConstantSDNode>(Op1)))
37451 SetCC = Op2;
37452 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
37453 SetCC = Op1;
37454   else // Quit if neither operand is a constant.
37455 return SDValue();
37456
37457 if (C->getZExtValue() == 1) {
37458 needOppositeCond = !needOppositeCond;
37459 checkAgainstTrue = true;
37460 } else if (C->getZExtValue() != 0)
37461     // Quit if the constant is neither 0 nor 1.
37462 return SDValue();
37463
37464 bool truncatedToBoolWithAnd = false;
37465 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
37466 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
37467 SetCC.getOpcode() == ISD::TRUNCATE ||
37468 SetCC.getOpcode() == ISD::AND) {
37469 if (SetCC.getOpcode() == ISD::AND) {
37470 int OpIdx = -1;
37471 if (isOneConstant(SetCC.getOperand(0)))
37472 OpIdx = 1;
37473 if (isOneConstant(SetCC.getOperand(1)))
37474 OpIdx = 0;
37475 if (OpIdx < 0)
37476 break;
37477 SetCC = SetCC.getOperand(OpIdx);
37478 truncatedToBoolWithAnd = true;
37479 } else
37480 SetCC = SetCC.getOperand(0);
37481 }
37482
37483 switch (SetCC.getOpcode()) {
37484 case X86ISD::SETCC_CARRY:
37485 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
37486 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
37487 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
37488 // truncated to i1 using 'and'.
37489 if (checkAgainstTrue && !truncatedToBoolWithAnd)
37490 break;
37491     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
37492            "Invalid use of SETCC_CARRY!");
37493     LLVM_FALLTHROUGH;
37494 case X86ISD::SETCC:
37495 // Set the condition code or opposite one if necessary.
37496 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
37497 if (needOppositeCond)
37498 CC = X86::GetOppositeBranchCondition(CC);
37499 return SetCC.getOperand(1);
37500 case X86ISD::CMOV: {
37501 // Check whether false/true value has canonical one, i.e. 0 or 1.
37502 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
37503 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
37504 // Quit if true value is not a constant.
37505 if (!TVal)
37506 return SDValue();
37507 // Quit if false value is not a constant.
37508 if (!FVal) {
37509 SDValue Op = SetCC.getOperand(0);
37510 // Skip 'zext' or 'trunc' node.
37511 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
37512 Op.getOpcode() == ISD::TRUNCATE)
37513 Op = Op.getOperand(0);
37514 // A special case for rdrand/rdseed, where 0 is set if false cond is
37515 // found.
37516 if ((Op.getOpcode() != X86ISD::RDRAND &&
37517 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
37518 return SDValue();
37519 }
37520 // Quit if false value is not the constant 0 or 1.
37521 bool FValIsFalse = true;
37522 if (FVal && FVal->getZExtValue() != 0) {
37523 if (FVal->getZExtValue() != 1)
37524 return SDValue();
37525 // If FVal is 1, opposite cond is needed.
37526 needOppositeCond = !needOppositeCond;
37527 FValIsFalse = false;
37528 }
37529 // Quit if TVal is not the constant opposite of FVal.
37530 if (FValIsFalse && TVal->getZExtValue() != 1)
37531 return SDValue();
37532 if (!FValIsFalse && TVal->getZExtValue() != 0)
37533 return SDValue();
37534 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
37535 if (needOppositeCond)
37536 CC = X86::GetOppositeBranchCondition(CC);
37537 return SetCC.getOperand(3);
37538 }
37539 }
37540
37541 return SDValue();
37542}
37543
37544/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
37545/// Match:
37546/// (X86or (X86setcc) (X86setcc))
37547/// (X86cmp (and (X86setcc) (X86setcc)), 0)
37548static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
37549 X86::CondCode &CC1, SDValue &Flags,
37550 bool &isAnd) {
37551 if (Cond->getOpcode() == X86ISD::CMP) {
37552 if (!isNullConstant(Cond->getOperand(1)))
37553 return false;
37554
37555 Cond = Cond->getOperand(0);
37556 }
37557
37558 isAnd = false;
37559
37560 SDValue SetCC0, SetCC1;
37561 switch (Cond->getOpcode()) {
37562 default: return false;
37563 case ISD::AND:
37564 case X86ISD::AND:
37565 isAnd = true;
37566     LLVM_FALLTHROUGH;
37567 case ISD::OR:
37568 case X86ISD::OR:
37569 SetCC0 = Cond->getOperand(0);
37570 SetCC1 = Cond->getOperand(1);
37571 break;
37572 };
37573
37574 // Make sure we have SETCC nodes, using the same flags value.
37575 if (SetCC0.getOpcode() != X86ISD::SETCC ||
37576 SetCC1.getOpcode() != X86ISD::SETCC ||
37577 SetCC0->getOperand(1) != SetCC1->getOperand(1))
37578 return false;
37579
37580 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
37581 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
37582 Flags = SetCC0->getOperand(1);
37583 return true;
37584}
37585
37586// When legalizing carry, we create carries via add X, -1
37587// If that comes from an actual carry, via setcc, we use the
37588// carry directly.
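// For example, (X86ISD::ADD X, -1) sets CF exactly when X is nonzero; if X is
// a 0/1 value produced by (X86ISD::SETCC COND_B, Flags), testing COND_B on the
// ADD's EFLAGS is therefore equivalent to testing COND_B on Flags itself, and
// this combine returns Flags directly.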
37589static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
37590 if (EFLAGS.getOpcode() == X86ISD::ADD) {
37591 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
37592 SDValue Carry = EFLAGS.getOperand(0);
37593 while (Carry.getOpcode() == ISD::TRUNCATE ||
37594 Carry.getOpcode() == ISD::ZERO_EXTEND ||
37595 Carry.getOpcode() == ISD::SIGN_EXTEND ||
37596 Carry.getOpcode() == ISD::ANY_EXTEND ||
37597 (Carry.getOpcode() == ISD::AND &&
37598 isOneConstant(Carry.getOperand(1))))
37599 Carry = Carry.getOperand(0);
37600 if (Carry.getOpcode() == X86ISD::SETCC ||
37601 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
37602 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
37603 uint64_t CarryCC = Carry.getConstantOperandVal(0);
37604 SDValue CarryOp1 = Carry.getOperand(1);
37605 if (CarryCC == X86::COND_B)
37606 return CarryOp1;
37607 if (CarryCC == X86::COND_A) {
37608 // Try to convert COND_A into COND_B in an attempt to facilitate
37609 // materializing "setb reg".
37610 //
37611 // Do not flip "e > c", where "c" is a constant, because Cmp
37612 // instruction cannot take an immediate as its first operand.
37613 //
37614 if (CarryOp1.getOpcode() == X86ISD::SUB &&
37615 CarryOp1.getNode()->hasOneUse() &&
37616 CarryOp1.getValueType().isInteger() &&
37617 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
37618 SDValue SubCommute =
37619 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
37620 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
37621 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
37622 }
37623 }
37624 // If this is a check of the z flag of an add with 1, switch to the
37625 // C flag.
37626 if (CarryCC == X86::COND_E &&
37627 CarryOp1.getOpcode() == X86ISD::ADD &&
37628 isOneConstant(CarryOp1.getOperand(1)))
37629 return CarryOp1;
37630 }
37631 }
37632 }
37633
37634 return SDValue();
37635}
37636
37637/// Optimize an EFLAGS definition used according to the condition code \p CC
37638/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
37639/// uses of chain values.
37640static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
37641 SelectionDAG &DAG,
37642 const X86Subtarget &Subtarget) {
37643 if (CC == X86::COND_B)
37644 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
37645 return Flags;
37646
37647 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
37648 return R;
37649 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
37650}
37651
37652/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
37653static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
37654 TargetLowering::DAGCombinerInfo &DCI,
37655 const X86Subtarget &Subtarget) {
37656 SDLoc DL(N);
37657
37658 SDValue FalseOp = N->getOperand(0);
37659 SDValue TrueOp = N->getOperand(1);
37660 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
37661 SDValue Cond = N->getOperand(3);
37662
37663 // cmov X, X, ?, ? --> X
37664 if (TrueOp == FalseOp)
37665 return TrueOp;
37666
37667 // Try to simplify the EFLAGS and condition code operands.
37668 // We can't always do this as FCMOV only supports a subset of X86 cond.
37669 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
37670 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
37671 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
37672 Flags};
37673 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
37674 }
37675 }
37676
37677 // If this is a select between two integer constants, try to do some
37678 // optimizations. Note that the operands are ordered the opposite of SELECT
37679 // operands.
37680 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
37681 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
37682 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
37683 // larger than FalseC (the false value).
37684 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
37685 CC = X86::GetOppositeBranchCondition(CC);
37686 std::swap(TrueC, FalseC);
37687 std::swap(TrueOp, FalseOp);
37688 }
37689
37690 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
37691 // This is efficient for any integer data type (including i8/i16) and
37692 // shift amount.
37693 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
37694 Cond = getSETCC(CC, Cond, DL, DAG);
37695
37696 // Zero extend the condition if needed.
37697 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
37698
37699 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
37700 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
37701 DAG.getConstant(ShAmt, DL, MVT::i8));
37702 return Cond;
37703 }
37704
37705 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
37706 // for any integer data type, including i8/i16.
37707 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
37708 Cond = getSETCC(CC, Cond, DL, DAG);
37709
37710 // Zero extend the condition if needed.
37711 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
37712 FalseC->getValueType(0), Cond);
37713 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
37714 SDValue(FalseC, 0));
37715 return Cond;
37716 }
37717
37718 // Optimize cases that will turn into an LEA instruction. This requires
37719 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
37720 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
37721 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
37722         assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
37723                "Implicit constant truncation");
37724
37725 bool isFastMultiplier = false;
37726 if (Diff.ult(10)) {
37727 switch (Diff.getZExtValue()) {
37728 default: break;
37729 case 1: // result = add base, cond
37730 case 2: // result = lea base( , cond*2)
37731 case 3: // result = lea base(cond, cond*2)
37732 case 4: // result = lea base( , cond*4)
37733 case 5: // result = lea base(cond, cond*4)
37734 case 8: // result = lea base( , cond*8)
37735 case 9: // result = lea base(cond, cond*8)
37736 isFastMultiplier = true;
37737 break;
37738 }
37739 }
37740
37741 if (isFastMultiplier) {
37742 Cond = getSETCC(CC, Cond, DL ,DAG);
37743 // Zero extend the condition if needed.
37744 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
37745 Cond);
37746 // Scale the condition by the difference.
37747 if (Diff != 1)
37748 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
37749 DAG.getConstant(Diff, DL, Cond.getValueType()));
37750
37751 // Add the base if non-zero.
37752 if (FalseC->getAPIntValue() != 0)
37753 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
37754 SDValue(FalseC, 0));
37755 return Cond;
37756 }
37757 }
37758 }
37759 }
37760
37761 // Handle these cases:
37762   //   (select (x != c), e, c) -> (select (x != c), e, x),
37763   //   (select (x == c), c, e) -> (select (x == c), x, e)
37764   // where c is an integer constant, and the "select" is the combination
37765 // of CMOV and CMP.
37766 //
37767 // The rationale for this change is that the conditional-move from a constant
37768 // needs two instructions, however, conditional-move from a register needs
37769 // only one instruction.
37770 //
37771 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
37772 // some instruction-combining opportunities. This opt needs to be
37773 // postponed as late as possible.
37774 //
37775 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
37776 // the DCI.xxxx conditions are provided to postpone the optimization as
37777 // late as possible.
37778
37779 ConstantSDNode *CmpAgainst = nullptr;
37780 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
37781 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
37782 !isa<ConstantSDNode>(Cond.getOperand(0))) {
37783
37784 if (CC == X86::COND_NE &&
37785 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
37786 CC = X86::GetOppositeBranchCondition(CC);
37787 std::swap(TrueOp, FalseOp);
37788 }
37789
37790 if (CC == X86::COND_E &&
37791 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
37792 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
37793 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
37794 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
37795 }
37796 }
37797 }
37798
37799 // Fold and/or of setcc's to double CMOV:
37800 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
37801 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
37802 //
37803 // This combine lets us generate:
37804 // cmovcc1 (jcc1 if we don't have CMOV)
37805 // cmovcc2 (same)
37806 // instead of:
37807 // setcc1
37808 // setcc2
37809 // and/or
37810 // cmovne (jne if we don't have CMOV)
37811 // When we can't use the CMOV instruction, it might increase branch
37812 // mispredicts.
37813 // When we can use CMOV, or when there is no mispredict, this improves
37814 // throughput and reduces register pressure.
37815 //
37816 if (CC == X86::COND_NE) {
37817 SDValue Flags;
37818 X86::CondCode CC0, CC1;
37819 bool isAndSetCC;
37820 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
37821 if (isAndSetCC) {
37822 std::swap(FalseOp, TrueOp);
37823 CC0 = X86::GetOppositeBranchCondition(CC0);
37824 CC1 = X86::GetOppositeBranchCondition(CC1);
37825 }
37826
37827 SDValue LOps[] = {FalseOp, TrueOp,
37828 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
37829 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
37830 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
37831 Flags};
37832 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
37833 return CMOV;
37834 }
37835 }
37836
37837 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
37838 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
37839 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
37840 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
37841 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
37842 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
37843 SDValue Add = TrueOp;
37844 SDValue Const = FalseOp;
37845 // Canonicalize the condition code for easier matching and output.
37846 if (CC == X86::COND_E)
37847 std::swap(Add, Const);
37848
37849 // We might have replaced the constant in the cmov with the LHS of the
37850 // compare. If so change it to the RHS of the compare.
37851 if (Const == Cond.getOperand(0))
37852 Const = Cond.getOperand(1);
37853
37854 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
37855 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
37856 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
37857 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
37858 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
37859 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
37860 EVT VT = N->getValueType(0);
37861 // This should constant fold.
37862 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
37863 SDValue CMov =
37864 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
37865 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
37866 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
37867 }
37868 }
37869
37870 return SDValue();
37871}
37872
37873/// Different mul shrinking modes.
37874enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
37875
37876static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
37877 EVT VT = N->getOperand(0).getValueType();
37878 if (VT.getScalarSizeInBits() != 32)
37879 return false;
37880
37881   assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
37882 unsigned SignBits[2] = {1, 1};
37883 bool IsPositive[2] = {false, false};
37884 for (unsigned i = 0; i < 2; i++) {
37885 SDValue Opd = N->getOperand(i);
37886
37887 SignBits[i] = DAG.ComputeNumSignBits(Opd);
37888 IsPositive[i] = DAG.SignBitIsZero(Opd);
37889 }
37890
37891 bool AllPositive = IsPositive[0] && IsPositive[1];
37892 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
37893 // When ranges are from -128 ~ 127, use MULS8 mode.
37894 if (MinSignBits >= 25)
37895 Mode = ShrinkMode::MULS8;
37896 // When ranges are from 0 ~ 255, use MULU8 mode.
37897 else if (AllPositive && MinSignBits >= 24)
37898 Mode = ShrinkMode::MULU8;
37899 // When ranges are from -32768 ~ 32767, use MULS16 mode.
37900 else if (MinSignBits >= 17)
37901 Mode = ShrinkMode::MULS16;
37902 // When ranges are from 0 ~ 65535, use MULU16 mode.
37903 else if (AllPositive && MinSignBits >= 16)
37904 Mode = ShrinkMode::MULU16;
37905 else
37906 return false;
37907 return true;
37908}
37909
37910/// When the operands of vector mul are extended from smaller size values,
37911 /// like i8 and i16, the type of mul may be shrunk to generate more
37912/// efficient code. Two typical patterns are handled:
37913/// Pattern1:
37914/// %2 = sext/zext <N x i8> %1 to <N x i32>
37915/// %4 = sext/zext <N x i8> %3 to <N x i32>
37916 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
37917/// %5 = mul <N x i32> %2, %4
37918///
37919/// Pattern2:
37920/// %2 = zext/sext <N x i16> %1 to <N x i32>
37921/// %4 = zext/sext <N x i16> %3 to <N x i32>
37922/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
37923/// %5 = mul <N x i32> %2, %4
37924///
37925/// There are four mul shrinking modes:
37926/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
37927 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
37928/// generate pmullw+sext32 for it (MULS8 mode).
37929/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
37930/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
37931/// generate pmullw+zext32 for it (MULU8 mode).
37932/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
37933/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
37934/// generate pmullw+pmulhw for it (MULS16 mode).
37935/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
37936/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
37937/// generate pmullw+pmulhuw for it (MULU16 mode).
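/// For example (a sketch of the MULU8 case): a v8i32 multiply whose operands
/// were both zero-extended from i8 is truncated to v8i16, multiplied with a
/// single pmullw, and the v8i16 product is zero-extended back to v8i32,
/// instead of multiplying full i32 elements with pmulld or its expansion.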
37938static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
37939 const X86Subtarget &Subtarget) {
37940 // Check for legality
37941 // pmullw/pmulhw are not supported by SSE.
37942 if (!Subtarget.hasSSE2())
37943 return SDValue();
37944
37945 // Check for profitability
37946 // pmulld is supported since SSE41. It is better to use pmulld
37947 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
37948 // the expansion.
37949 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
37950 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
37951 return SDValue();
37952
37953 ShrinkMode Mode;
37954 if (!canReduceVMulWidth(N, DAG, Mode))
37955 return SDValue();
37956
37957 SDLoc DL(N);
37958 SDValue N0 = N->getOperand(0);
37959 SDValue N1 = N->getOperand(1);
37960 EVT VT = N->getOperand(0).getValueType();
37961 unsigned NumElts = VT.getVectorNumElements();
37962 if ((NumElts % 2) != 0)
37963 return SDValue();
37964
37965 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
37966
37967 // Shrink the operands of mul.
37968 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
37969 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
37970
37971 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
37972 // lower part is needed.
37973 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
37974 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
37975 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
37976 : ISD::SIGN_EXTEND,
37977 DL, VT, MulLo);
37978
37979 MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
37980 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
37981 // the higher part is also needed.
37982 SDValue MulHi =
37983 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
37984 ReducedVT, NewN0, NewN1);
37985
37986 // Repack the lower part and higher part result of mul into a wider
37987 // result.
37988 // Generate shuffle functioning as punpcklwd.
37989 SmallVector<int, 16> ShuffleMask(NumElts);
37990 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
37991 ShuffleMask[2 * i] = i;
37992 ShuffleMask[2 * i + 1] = i + NumElts;
37993 }
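  // E.g. for NumElts == 8 (a single 128-bit vector) this mask is
  // {0, 8, 1, 9, 2, 10, 3, 11}, interleaving the low words of MulLo and MulHi
  // just as punpcklwd does.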
37994 SDValue ResLo =
37995 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
37996 ResLo = DAG.getBitcast(ResVT, ResLo);
37997 // Generate shuffle functioning as punpckhwd.
37998 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
37999 ShuffleMask[2 * i] = i + NumElts / 2;
38000 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
38001 }
38002 SDValue ResHi =
38003 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
38004 ResHi = DAG.getBitcast(ResVT, ResHi);
38005 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
38006}
38007
38008static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
38009 EVT VT, const SDLoc &DL) {
38010
38011 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
38012 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
38013 DAG.getConstant(Mult, DL, VT));
38014 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
38015 DAG.getConstant(Shift, DL, MVT::i8));
38016 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
38017 N->getOperand(0));
38018 return Result;
38019 };
38020
38021 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
38022 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
38023 DAG.getConstant(Mul1, DL, VT));
38024 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
38025 DAG.getConstant(Mul2, DL, VT));
38026 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
38027 N->getOperand(0));
38028 return Result;
38029 };
38030
38031 switch (MulAmt) {
38032 default:
38033 break;
38034 case 11:
38035 // mul x, 11 => add ((shl (mul x, 5), 1), x)
38036 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
38037 case 21:
38038 // mul x, 21 => add ((shl (mul x, 5), 2), x)
38039 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
38040 case 41:
38041 // mul x, 41 => add ((shl (mul x, 5), 3), x)
38042 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
38043 case 22:
38044 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
38045 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
38046 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
38047 case 19:
38048 // mul x, 19 => add ((shl (mul x, 9), 1), x)
38049 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
38050 case 37:
38051 // mul x, 37 => add ((shl (mul x, 9), 2), x)
38052 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
38053 case 73:
38054 // mul x, 73 => add ((shl (mul x, 9), 3), x)
38055 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
38056 case 13:
38057 // mul x, 13 => add ((shl (mul x, 3), 2), x)
38058 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
38059 case 23:
38060 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
38061 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
38062 case 26:
38063 // mul x, 26 => add ((mul (mul x, 5), 5), x)
38064 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
38065 case 28:
38066 // mul x, 28 => add ((mul (mul x, 9), 3), x)
38067 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
38068 case 29:
38069 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
38070 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
38071 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
38072 }
38073
38074   // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
38075   // by a single LEA.
38076   // First check if this is a sum of two powers of 2 because that's easy. Then
38077   // count the trailing zeros to find the smaller power of 2.
38078 // TODO: We can do this even without LEA at a cost of two shifts and an add.
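  // E.g. MulAmt == 20 (16 + 4): ScaleShift == 2 and ShiftAmt == 4 below, so
  // the result is (x << 4) + (x << 2), and the scaled add can be a single LEA.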
38079 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
38080 unsigned ScaleShift = countTrailingZeros(MulAmt);
38081 if (ScaleShift >= 1 && ScaleShift < 4) {
38082 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
38083 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
38084 DAG.getConstant(ShiftAmt, DL, MVT::i8));
38085 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
38086 DAG.getConstant(ScaleShift, DL, MVT::i8));
38087 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
38088 }
38089 }
38090
38091 return SDValue();
38092}
38093
38094// If the upper 17 bits of each element are zero then we can use PMADDWD,
38095// which is always at least as quick as PMULLD, except on KNL.
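// (With the top 17 bits of every i32 element known zero, each value fits in
// the low 15 bits; viewed as pairs of i16 lanes the high halves are zero and
// the low halves are non-negative, so pmaddwd's lo*lo + hi*hi sum reduces to
// the exact 32-bit product.)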
38096static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
38097 const X86Subtarget &Subtarget) {
38098 if (!Subtarget.hasSSE2())
38099 return SDValue();
38100
38101 if (Subtarget.isPMADDWDSlow())
38102 return SDValue();
38103
38104 EVT VT = N->getValueType(0);
38105
38106 // Only support vXi32 vectors.
38107 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
38108 return SDValue();
38109
38110 // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
38111 // Also allow v2i32 if it will be widened.
38112 MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
38113 if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT))
38114 return SDValue();
38115
38116 SDValue N0 = N->getOperand(0);
38117 SDValue N1 = N->getOperand(1);
38118
38119   // If we are zero extending two steps without SSE4.1, it's better to reduce
38120 // the vmul width instead.
38121 if (!Subtarget.hasSSE41() &&
38122 (N0.getOpcode() == ISD::ZERO_EXTEND &&
38123 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
38124 (N1.getOpcode() == ISD::ZERO_EXTEND &&
38125 N1.getOperand(0).getScalarValueSizeInBits() <= 8))
38126 return SDValue();
38127
38128 APInt Mask17 = APInt::getHighBitsSet(32, 17);
38129 if (!DAG.MaskedValueIsZero(N1, Mask17) ||
38130 !DAG.MaskedValueIsZero(N0, Mask17))
38131 return SDValue();
38132
38133 // Use SplitOpsAndApply to handle AVX splitting.
38134 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38135 ArrayRef<SDValue> Ops) {
38136 MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
38137 return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
38138 };
38139 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
38140 { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
38141 PMADDWDBuilder);
38142}
38143
38144static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
38145 const X86Subtarget &Subtarget) {
38146 if (!Subtarget.hasSSE2())
38147 return SDValue();
38148
38149 EVT VT = N->getValueType(0);
38150
38151 // Only support vXi64 vectors.
38152 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
38153 VT.getVectorNumElements() < 2 ||
38154 !isPowerOf2_32(VT.getVectorNumElements()))
38155 return SDValue();
38156
38157 SDValue N0 = N->getOperand(0);
38158 SDValue N1 = N->getOperand(1);
38159
38160   // PMULDQ returns the 64-bit result of the signed multiplication of the lower
38161 // 32-bits. We can lower with this if the sign bits stretch that far.
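  // E.g. if both operands are sign extensions from i32 (more than 32 sign
  // bits), multiplying just the low 32 bits with PMULDQ already yields the
  // full 64-bit product.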
38162 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
38163 DAG.ComputeNumSignBits(N1) > 32) {
38164 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38165 ArrayRef<SDValue> Ops) {
38166 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
38167 };
38168 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
38169 PMULDQBuilder, /*CheckBWI*/false);
38170 }
38171
38172 // If the upper bits are zero we can use a single pmuludq.
38173 APInt Mask = APInt::getHighBitsSet(64, 32);
38174 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
38175 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38176 ArrayRef<SDValue> Ops) {
38177 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
38178 };
38179 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
38180 PMULUDQBuilder, /*CheckBWI*/false);
38181 }
38182
38183 return SDValue();
38184}
38185
38186/// Optimize a single multiply with constant into two operations in order to
38187/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
38188static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
38189 TargetLowering::DAGCombinerInfo &DCI,
38190 const X86Subtarget &Subtarget) {
38191 EVT VT = N->getValueType(0);
38192
38193 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
38194 return V;
38195
38196 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
38197 return V;
38198
38199 if (DCI.isBeforeLegalize() && VT.isVector())
38200 return reduceVMULWidth(N, DAG, Subtarget);
38201
38202 if (!MulConstantOptimization)
38203 return SDValue();
38204 // An imul is usually smaller than the alternative sequence.
38205 if (DAG.getMachineFunction().getFunction().hasMinSize())
38206 return SDValue();
38207
38208 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
38209 return SDValue();
38210
38211 if (VT != MVT::i64 && VT != MVT::i32)
38212 return SDValue();
38213
38214 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
38215 if (!C)
38216 return SDValue();
38217 if (isPowerOf2_64(C->getZExtValue()))
38218 return SDValue();
38219
38220 int64_t SignMulAmt = C->getSExtValue();
38221   assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
38222 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
38223
38224 SDLoc DL(N);
38225 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
38226 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
38227 DAG.getConstant(AbsMulAmt, DL, VT));
38228 if (SignMulAmt < 0)
38229 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
38230 NewMul);
38231
38232 return NewMul;
38233 }
38234
38235 uint64_t MulAmt1 = 0;
38236 uint64_t MulAmt2 = 0;
38237 if ((AbsMulAmt % 9) == 0) {
38238 MulAmt1 = 9;
38239 MulAmt2 = AbsMulAmt / 9;
38240 } else if ((AbsMulAmt % 5) == 0) {
38241 MulAmt1 = 5;
38242 MulAmt2 = AbsMulAmt / 5;
38243 } else if ((AbsMulAmt % 3) == 0) {
38244 MulAmt1 = 3;
38245 MulAmt2 = AbsMulAmt / 3;
38246 }
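  // E.g. AbsMulAmt == 45 factors as 9 * 5 (two LEA-style multiplies), while
  // AbsMulAmt == 40 factors as 5 * 8; unless the lone use is an add, the pow2
  // factor is issued first below as a shift by 3 followed by a multiply by 5.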
38247
38248 SDValue NewMul;
38249 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
38250 if (MulAmt2 &&
38251 (isPowerOf2_64(MulAmt2) ||
38252 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
38253
38254 if (isPowerOf2_64(MulAmt2) &&
38255 !(SignMulAmt >= 0 && N->hasOneUse() &&
38256 N->use_begin()->getOpcode() == ISD::ADD))
38257       // If the second multiplier is pow2, issue it first. We want the multiply by
38258 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
38259 // is an add. Only do this for positive multiply amounts since the
38260 // negate would prevent it from being used as an address mode anyway.
38261 std::swap(MulAmt1, MulAmt2);
38262
38263 if (isPowerOf2_64(MulAmt1))
38264 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
38265 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
38266 else
38267 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
38268 DAG.getConstant(MulAmt1, DL, VT));
38269
38270 if (isPowerOf2_64(MulAmt2))
38271 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
38272 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
38273 else
38274 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
38275 DAG.getConstant(MulAmt2, DL, VT));
38276
38277 // Negate the result.
38278 if (SignMulAmt < 0)
38279 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
38280 NewMul);
38281 } else if (!Subtarget.slowLEA())
38282 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
38283
38284 if (!NewMul) {
38285     assert(C->getZExtValue() != 0 &&
38286            C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
38287            "Both cases that could cause potential overflows should have "
38288            "already been handled.");
38289 if (isPowerOf2_64(AbsMulAmt - 1)) {
38290 // (mul x, 2^N + 1) => (add (shl x, N), x)
38291 NewMul = DAG.getNode(
38292 ISD::ADD, DL, VT, N->getOperand(0),
38293 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
38294 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
38295 MVT::i8)));
38296 // To negate, subtract the number from zero
38297 if (SignMulAmt < 0)
38298 NewMul = DAG.getNode(ISD::SUB, DL, VT,
38299 DAG.getConstant(0, DL, VT), NewMul);
38300 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
38301 // (mul x, 2^N - 1) => (sub (shl x, N), x)
38302 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
38303 DAG.getConstant(Log2_64(AbsMulAmt + 1),
38304 DL, MVT::i8));
38305 // To negate, reverse the operands of the subtract.
38306 if (SignMulAmt < 0)
38307 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
38308 else
38309 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
38310 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
38311 // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
38312 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
38313 DAG.getConstant(Log2_64(AbsMulAmt - 2),
38314 DL, MVT::i8));
38315 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
38316 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
38317 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
38318 // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
38319 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
38320 DAG.getConstant(Log2_64(AbsMulAmt + 2),
38321 DL, MVT::i8));
38322 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
38323 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
38324 }
38325 }
38326
38327 return NewMul;
38328}
38329
38330static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
38331 SDValue N0 = N->getOperand(0);
38332 SDValue N1 = N->getOperand(1);
38333 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
38334 EVT VT = N0.getValueType();
38335
38336 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
38337 // since the result of setcc_c is all zero's or all ones.
38338 if (VT.isInteger() && !VT.isVector() &&
38339 N1C && N0.getOpcode() == ISD::AND &&
38340 N0.getOperand(1).getOpcode() == ISD::Constant) {
38341 SDValue N00 = N0.getOperand(0);
38342 APInt Mask = N0.getConstantOperandAPInt(1);
38343 Mask <<= N1C->getAPIntValue();
38344 bool MaskOK = false;
38345 // We can handle cases concerning bit-widening nodes containing setcc_c if
38346 // we carefully interrogate the mask to make sure we are semantics
38347 // preserving.
38348 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
38349 // of the underlying setcc_c operation if the setcc_c was zero extended.
38350 // Consider the following example:
38351 // zext(setcc_c) -> i32 0x0000FFFF
38352 // c1 -> i32 0x0000FFFF
38353 // c2 -> i32 0x00000001
38354 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
38355 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
38356 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
38357 MaskOK = true;
38358 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
38359 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
38360 MaskOK = true;
38361 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
38362 N00.getOpcode() == ISD::ANY_EXTEND) &&
38363 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
38364 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
38365 }
38366 if (MaskOK && Mask != 0) {
38367 SDLoc DL(N);
38368 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
38369 }
38370 }
38371
38372 // Hardware support for vector shifts is sparse which makes us scalarize the
38373 // vector operations in many cases. Also, on sandybridge ADD is faster than
38374 // shl.
38375 // (shl V, 1) -> add V,V
38376 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
38377 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
38378       assert(N0.getValueType().isVector() && "Invalid vector shift type");
38379 // We shift all of the values by one. In many cases we do not have
38380 // hardware support for this operation. This is better expressed as an ADD
38381 // of two values.
38382 if (N1SplatC->getAPIntValue() == 1)
38383 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
38384 }
38385
38386 return SDValue();
38387}
38388
38389static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
38390 SDValue N0 = N->getOperand(0);
38391 SDValue N1 = N->getOperand(1);
38392 EVT VT = N0.getValueType();
38393 unsigned Size = VT.getSizeInBits();
38394
38395 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
38396 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
38397 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
38398 // depending on sign of (SarConst - [56,48,32,24,16])
38399
38400   // sexts in X86 are MOVs. The MOVs have the same code size
38401   // as the above SHIFTs (only a SHIFT by 1 has lower code size).
38402   // However, the MOVs have 2 advantages over a SHIFT:
38403   // 1. MOVs can write to a register that differs from the source
38404   // 2. MOVs accept memory operands
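  // E.g. on i32, (sra (shl X, 24), 24) becomes (sext_inreg X, i8), and
  // (sra (shl X, 24), 26) becomes (sra (sext_inreg X, i8), 2), as matched in
  // the loop below.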
38405
38406 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
38407 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
38408 N0.getOperand(1).getOpcode() != ISD::Constant)
38409 return SDValue();
38410
38411 SDValue N00 = N0.getOperand(0);
38412 SDValue N01 = N0.getOperand(1);
38413 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
38414 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
38415 EVT CVT = N1.getValueType();
38416
38417 if (SarConst.isNegative())
38418 return SDValue();
38419
38420 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
38421 unsigned ShiftSize = SVT.getSizeInBits();
38422     // Skip types without a corresponding sext/zext and a ShlConst that is
38423     // not one of [56,48,32,24,16].
38424 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
38425 continue;
38426 SDLoc DL(N);
38427 SDValue NN =
38428 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
38429 SarConst = SarConst - (Size - ShiftSize);
38430 if (SarConst == 0)
38431 return NN;
38432 else if (SarConst.isNegative())
38433 return DAG.getNode(ISD::SHL, DL, VT, NN,
38434 DAG.getConstant(-SarConst, DL, CVT));
38435 else
38436 return DAG.getNode(ISD::SRA, DL, VT, NN,
38437 DAG.getConstant(SarConst, DL, CVT));
38438 }
38439 return SDValue();
38440}
38441
38442static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
38443 TargetLowering::DAGCombinerInfo &DCI) {
38444 SDValue N0 = N->getOperand(0);
38445 SDValue N1 = N->getOperand(1);
38446 EVT VT = N0.getValueType();
38447
38448 // Only do this on the last DAG combine as it can interfere with other
38449 // combines.
38450 if (!DCI.isAfterLegalizeDAG())
38451 return SDValue();
38452
38453 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
38454 // TODO: This is a generic DAG combine that became an x86-only combine to
38455 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
38456 // and-not ('andn').
38457 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
38458 return SDValue();
38459
38460 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
38461 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
38462 if (!ShiftC || !AndC)
38463 return SDValue();
38464
38465 // If we can shrink the constant mask below 8-bits or 32-bits, then this
38466 // transform should reduce code size. It may also enable secondary transforms
38467 // from improved known-bits analysis or instruction selection.
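  // E.g. (srl (and X, 0x7F00), 8) becomes (and (srl X, 8), 0x7F); the new mask
  // fits in an 8-bit immediate, which the old one did not.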
38468 APInt MaskVal = AndC->getAPIntValue();
38469
38470 // If this can be matched by a zero extend, don't optimize.
38471 if (MaskVal.isMask()) {
38472 unsigned TO = MaskVal.countTrailingOnes();
38473 if (TO >= 8 && isPowerOf2_32(TO))
38474 return SDValue();
38475 }
38476
38477 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
38478 unsigned OldMaskSize = MaskVal.getMinSignedBits();
38479 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
38480 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
38481 (OldMaskSize > 32 && NewMaskSize <= 32)) {
38482 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
38483 SDLoc DL(N);
38484 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
38485 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
38486 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
38487 }
38488 return SDValue();
38489}
38490
38491static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
38492 TargetLowering::DAGCombinerInfo &DCI,
38493 const X86Subtarget &Subtarget) {
38494 unsigned Opcode = N->getOpcode();
38495   assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
38496          "Unexpected shift opcode");
38497
38498 EVT VT = N->getValueType(0);
38499 SDValue N0 = N->getOperand(0);
38500 SDValue N1 = N->getOperand(1);
38501 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
38502 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
38503   assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
38504          N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
38505          "Unexpected PACKSS/PACKUS input type");
38506
38507 bool IsSigned = (X86ISD::PACKSS == Opcode);
38508
38509 // Constant Folding.
38510 APInt UndefElts0, UndefElts1;
38511 SmallVector<APInt, 32> EltBits0, EltBits1;
38512 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
38513 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
38514 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
38515 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
38516 unsigned NumLanes = VT.getSizeInBits() / 128;
38517 unsigned NumDstElts = VT.getVectorNumElements();
38518 unsigned NumSrcElts = NumDstElts / 2;
38519 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
38520 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
38521
38522 APInt Undefs(NumDstElts, 0);
38523 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
38524 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
38525 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
38526 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
38527 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
38528 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
38529
38530 if (UndefElts[SrcIdx]) {
38531 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
38532 continue;
38533 }
38534
38535 APInt &Val = EltBits[SrcIdx];
38536 if (IsSigned) {
38537 // PACKSS: Truncate signed value with signed saturation.
38538 // Source values less than dst minint are saturated to minint.
38539 // Source values greater than dst maxint are saturated to maxint.
38540 if (Val.isSignedIntN(DstBitsPerElt))
38541 Val = Val.trunc(DstBitsPerElt);
38542 else if (Val.isNegative())
38543 Val = APInt::getSignedMinValue(DstBitsPerElt);
38544 else
38545 Val = APInt::getSignedMaxValue(DstBitsPerElt);
38546 } else {
38547 // PACKUS: Truncate signed value with unsigned saturation.
38548 // Source values less than zero are saturated to zero.
38549 // Source values greater than dst maxuint are saturated to maxuint.
38550 if (Val.isIntN(DstBitsPerElt))
38551 Val = Val.trunc(DstBitsPerElt);
38552 else if (Val.isNegative())
38553 Val = APInt::getNullValue(DstBitsPerElt);
38554 else
38555 Val = APInt::getAllOnesValue(DstBitsPerElt);
38556 }
38557 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
38558 }
38559 }
38560
38561 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
38562 }
38563
38564 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
38565 // truncate to create a larger truncate.
38566 if (Subtarget.hasAVX512() &&
38567 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
38568 N0.getOperand(0).getValueType() == MVT::v8i32) {
38569 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
38570 (!IsSigned &&
38571 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
38572 if (Subtarget.hasVLX())
38573 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
38574
38575 // Widen input to v16i32 so we can truncate that.
38576 SDLoc dl(N);
38577 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
38578 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
38579 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
38580 }
38581 }
38582
38583 // Attempt to combine as shuffle.
38584 SDValue Op(N, 0);
38585 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
38586 return Res;
38587
38588 return SDValue();
38589}
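
For reference, the per-element saturation rules applied by the constant-folding loop above can be modeled with plain scalar helpers. This is an illustrative sketch assuming a 16-bit source lane and an 8-bit destination lane; it is not code from the file.

#include <cassert>
#include <cstdint>

// PACKSS-style truncation: clamp to [-128, 127] before the 16->8 truncate.
static int8_t packss16to8(int16_t V) {
  if (V < -128) return -128;
  if (V > 127)  return 127;
  return static_cast<int8_t>(V);
}

// PACKUS-style truncation: negative values become 0, values above 255 saturate.
static uint8_t packus16to8(int16_t V) {
  if (V < 0)   return 0;
  if (V > 255) return 255;
  return static_cast<uint8_t>(V);
}

int main() {
  assert(packss16to8(1000) == 127 && packss16to8(-1000) == -128 && packss16to8(42) == 42);
  assert(packus16to8(-5) == 0 && packus16to8(300) == 255 && packus16to8(200) == 200);
  return 0;
}
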
38590
38591static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
38592 TargetLowering::DAGCombinerInfo &DCI,
38593 const X86Subtarget &Subtarget) {
38594   assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
38595           X86ISD::VSRL == N->getOpcode()) &&
38596          "Unexpected shift opcode");
38597 EVT VT = N->getValueType(0);
38598 SDValue N0 = N->getOperand(0);
38599 SDValue N1 = N->getOperand(1);
38600
38601 // Shift zero -> zero.
38602 if (ISD::isBuildVectorAllZeros(N0.getNode()))
38603 return DAG.getConstant(0, SDLoc(N), VT);
38604
38605 // Detect constant shift amounts.
38606 APInt UndefElts;
38607 SmallVector<APInt, 32> EltBits;
38608 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
38609 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
38610 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
38611 EltBits[0].getZExtValue(), DAG);
38612 }
38613
38614 APInt KnownUndef, KnownZero;
38615 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38616 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
38617 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
38618 KnownZero, DCI))
38619 return SDValue(N, 0);
38620
38621 return SDValue();
38622}
38623
38624static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
38625 TargetLowering::DAGCombinerInfo &DCI,
38626 const X86Subtarget &Subtarget) {
38627 unsigned Opcode = N->getOpcode();
38628   assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
38629          X86ISD::VSRLI == Opcode) &&
38630          "Unexpected shift opcode");
38631 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
38632 EVT VT = N->getValueType(0);
38633 SDValue N0 = N->getOperand(0);
38634 SDValue N1 = N->getOperand(1);
38635 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
38636   assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
38637          "Unexpected value type");
38638   assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
38639
38640 // Out of range logical bit shifts are guaranteed to be zero.
38641 // Out of range arithmetic bit shifts splat the sign bit.
38642 unsigned ShiftVal = cast<ConstantSDNode>(N1)->getZExtValue();
38643 if (ShiftVal >= NumBitsPerElt) {
38644 if (LogicalShift)
38645 return DAG.getConstant(0, SDLoc(N), VT);
38646 else
38647 ShiftVal = NumBitsPerElt - 1;
38648 }
38649
38650 // Shift N0 by zero -> N0.
38651 if (!ShiftVal)
38652 return N0;
38653
38654 // Shift zero -> zero.
38655 if (ISD::isBuildVectorAllZeros(N0.getNode()))
38656 return DAG.getConstant(0, SDLoc(N), VT);
38657
38658 // Fold (VSRAI (VSRAI X, C1), C2) --> (VSRAI X, (C1 + C2)) with (C1 + C2)
38659 // clamped to (NumBitsPerElt - 1).
38660 if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSRAI) {
38661 unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
38662 unsigned NewShiftVal = ShiftVal + ShiftVal2;
38663 if (NewShiftVal >= NumBitsPerElt)
38664 NewShiftVal = NumBitsPerElt - 1;
38665 return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0),
38666 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
38667 }
38668
38669 // We can decode 'whole byte' logical bit shifts as shuffles.
38670 if (LogicalShift && (ShiftVal % 8) == 0) {
38671 SDValue Op(N, 0);
38672 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
38673 return Res;
38674 }
38675
38676 // Constant Folding.
38677 APInt UndefElts;
38678 SmallVector<APInt, 32> EltBits;
38679 if (N->isOnlyUserOf(N0.getNode()) &&
38680 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
38681     assert(EltBits.size() == VT.getVectorNumElements() &&
38682            "Unexpected shift value type");
38683 for (APInt &Elt : EltBits) {
38684 if (X86ISD::VSHLI == Opcode)
38685 Elt <<= ShiftVal;
38686 else if (X86ISD::VSRAI == Opcode)
38687 Elt.ashrInPlace(ShiftVal);
38688 else
38689 Elt.lshrInPlace(ShiftVal);
38690 }
38691 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
38692 }
38693
38694 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38695 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
38696 APInt::getAllOnesValue(NumBitsPerElt), DCI))
38697 return SDValue(N, 0);
38698
38699 return SDValue();
38700}
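
A standalone sketch of the out-of-range shift semantics the combine above relies on, for 32-bit lanes. The clamp is written out explicitly because a plain C++ shift by 32 or more would be undefined; this is an illustration, not code from the file.

#include <cassert>
#include <cstdint>

// Immediate logical shifts of >= 32 yield 0; immediate arithmetic shifts are
// clamped to 31 so they splat the sign bit.
static int32_t vsraiLane(int32_t V, unsigned Amt) {
  if (Amt >= 32)
    Amt = 31;
  return V >> Amt; // assumes the usual arithmetic shift for negative values
}
static uint32_t vsrliLane(uint32_t V, unsigned Amt) {
  return Amt >= 32 ? 0u : V >> Amt;
}

int main() {
  assert(vsrliLane(0xDEADBEEFu, 40) == 0u);
  assert(vsraiLane(-1, 40) == -1 && vsraiLane(5, 40) == 0);
  // (VSRAI (VSRAI X, 20), 20) folds to VSRAI X, 31 for 32-bit lanes.
  assert(vsraiLane(vsraiLane(-12345, 20), 20) == vsraiLane(-12345, 31));
  return 0;
}
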
38701
38702static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
38703 TargetLowering::DAGCombinerInfo &DCI,
38704 const X86Subtarget &Subtarget) {
38705 EVT VT = N->getValueType(0);
38706   assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
38707           (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) &&
38708          "Unexpected vector insertion");
38709
38710 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
38711 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38712 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
38713 APInt::getAllOnesValue(NumBitsPerElt), DCI))
38714 return SDValue(N, 0);
38715
38716 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
38717 SDValue Op(N, 0);
38718 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
38719 return Res;
38720
38721 return SDValue();
38722}
38723
38724/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
38725/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
38726/// OR -> CMPNEQSS.
38727static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
38728 TargetLowering::DAGCombinerInfo &DCI,
38729 const X86Subtarget &Subtarget) {
38730 unsigned opcode;
38731
38732 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
38733 // we're requiring SSE2 for both.
38734 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
38735 SDValue N0 = N->getOperand(0);
38736 SDValue N1 = N->getOperand(1);
38737 SDValue CMP0 = N0.getOperand(1);
38738 SDValue CMP1 = N1.getOperand(1);
38739 SDLoc DL(N);
38740
38741 // The SETCCs should both refer to the same CMP.
38742 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
38743 return SDValue();
38744
38745 SDValue CMP00 = CMP0->getOperand(0);
38746 SDValue CMP01 = CMP0->getOperand(1);
38747 EVT VT = CMP00.getValueType();
38748
38749 if (VT == MVT::f32 || VT == MVT::f64) {
38750 bool ExpectingFlags = false;
38751 // Check for any users that want flags:
38752 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
38753 !ExpectingFlags && UI != UE; ++UI)
38754 switch (UI->getOpcode()) {
38755 default:
38756 case ISD::BR_CC:
38757 case ISD::BRCOND:
38758 case ISD::SELECT:
38759 ExpectingFlags = true;
38760 break;
38761 case ISD::CopyToReg:
38762 case ISD::SIGN_EXTEND:
38763 case ISD::ZERO_EXTEND:
38764 case ISD::ANY_EXTEND:
38765 break;
38766 }
38767
38768 if (!ExpectingFlags) {
38769 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
38770 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
38771
38772 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
38773 X86::CondCode tmp = cc0;
38774 cc0 = cc1;
38775 cc1 = tmp;
38776 }
38777
38778 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
38779 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
38780 // FIXME: need symbolic constants for these magic numbers.
38781 // See X86ATTInstPrinter.cpp:printSSECC().
38782 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
38783 if (Subtarget.hasAVX512()) {
38784 SDValue FSetCC =
38785 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
38786 DAG.getTargetConstant(x86cc, DL, MVT::i8));
38787 // Need to fill with zeros to ensure the bitcast will produce zeroes
38788 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
38789 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
38790 DAG.getConstant(0, DL, MVT::v16i1),
38791 FSetCC, DAG.getIntPtrConstant(0, DL));
38792 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
38793 N->getSimpleValueType(0));
38794 }
38795 SDValue OnesOrZeroesF =
38796 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
38797 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
38798
38799 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
38800 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
38801
38802 if (is64BitFP && !Subtarget.is64Bit()) {
38803 // On a 32-bit target, we cannot bitcast the 64-bit float to a
38804 // 64-bit integer, since that's not a legal type. Since
38805 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
38806 // bits, but can do this little dance to extract the lowest 32 bits
38807 // and work with those going forward.
38808 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
38809 OnesOrZeroesF);
38810 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
38811 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
38812 Vector32, DAG.getIntPtrConstant(0, DL));
38813 IntVT = MVT::i32;
38814 }
38815
38816 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
38817 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
38818 DAG.getConstant(1, DL, IntVT));
38819 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
38820 ANDed);
38821 return OneBitOfTruth;
38822 }
38823 }
38824 }
38825 }
38826 return SDValue();
38827}
38828
38829/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
38830static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
38831   assert(N->getOpcode() == ISD::AND);
38832
38833 MVT VT = N->getSimpleValueType(0);
38834 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
38835 return SDValue();
38836
38837 SDValue X, Y;
38838 SDValue N0 = N->getOperand(0);
38839 SDValue N1 = N->getOperand(1);
38840
38841 if (SDValue Not = IsNOT(N0, DAG)) {
38842 X = Not;
38843 Y = N1;
38844 } else if (SDValue Not = IsNOT(N1, DAG)) {
38845 X = Not;
38846 Y = N0;
38847 } else
38848 return SDValue();
38849
38850 X = DAG.getBitcast(VT, X);
38851 Y = DAG.getBitcast(VT, Y);
38852 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
38853}
38854
38855// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
38856// register. In most cases we actually compare or select YMM-sized registers
38857// and mixing the two types creates horrible code. This method optimizes
38858// some of the transition sequences.
38859// Even with AVX-512 this is still useful for removing casts around logical
38860// operations on vXi1 mask types.
38861static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
38862 const X86Subtarget &Subtarget) {
38863 EVT VT = N->getValueType(0);
38864   assert(VT.isVector() && "Expected vector type");
38865
38866   assert((N->getOpcode() == ISD::ANY_EXTEND ||
38867           N->getOpcode() == ISD::ZERO_EXTEND ||
38868           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
38869
38870 SDValue Narrow = N->getOperand(0);
38871 EVT NarrowVT = Narrow.getValueType();
38872
38873 if (Narrow->getOpcode() != ISD::XOR &&
38874 Narrow->getOpcode() != ISD::AND &&
38875 Narrow->getOpcode() != ISD::OR)
38876 return SDValue();
38877
38878 SDValue N0 = Narrow->getOperand(0);
38879 SDValue N1 = Narrow->getOperand(1);
38880 SDLoc DL(Narrow);
38881
38882 // The Left side has to be a trunc.
38883 if (N0.getOpcode() != ISD::TRUNCATE)
38884 return SDValue();
38885
38886 // The type of the truncated inputs.
38887 if (N0.getOperand(0).getValueType() != VT)
38888 return SDValue();
38889
38890 // The right side has to be a 'trunc' or a constant vector.
38891 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
38892 N1.getOperand(0).getValueType() == VT;
38893 if (!RHSTrunc &&
38894 !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
38895 return SDValue();
38896
38897 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38898
38899 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
38900 return SDValue();
38901
38902 // Set N0 and N1 to hold the inputs to the new wide operation.
38903 N0 = N0.getOperand(0);
38904 if (RHSTrunc)
38905 N1 = N1.getOperand(0);
38906 else
38907 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
38908
38909 // Generate the wide operation.
38910 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
38911 unsigned Opcode = N->getOpcode();
38912 switch (Opcode) {
38913   default: llvm_unreachable("Unexpected opcode");
38914 case ISD::ANY_EXTEND:
38915 return Op;
38916 case ISD::ZERO_EXTEND:
38917 return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
38918 case ISD::SIGN_EXTEND:
38919 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
38920 Op, DAG.getValueType(NarrowVT));
38921 }
38922}
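
The wide/narrow rewrite above works because bitwise logic commutes with truncation. A minimal standalone check of that identity (scalar C++ rather than SelectionDAG nodes; illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t As[] = {0u, 0x12345678u, 0xFFFF0000u};
  const uint32_t Bs[] = {0u, 0x0F0F0F0Fu, 0xDEADBEEFu};
  for (uint32_t A : As)
    for (uint32_t B : Bs) {
      // Truncating after the wide op equals doing the op on truncated values.
      assert((uint16_t)(A & B) == (uint16_t)((uint16_t)A & (uint16_t)B));
      assert((uint16_t)(A | B) == (uint16_t)((uint16_t)A | (uint16_t)B));
      assert((uint16_t)(A ^ B) == (uint16_t)((uint16_t)A ^ (uint16_t)B));
    }
  return 0;
}
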
38923
38924/// If both input operands of a logic op are being cast from floating point
38925/// types, try to convert this into a floating point logic node to avoid
38926/// unnecessary moves from SSE to integer registers.
38927static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
38928 const X86Subtarget &Subtarget) {
38929 EVT VT = N->getValueType(0);
38930 SDValue N0 = N->getOperand(0);
38931 SDValue N1 = N->getOperand(1);
38932 SDLoc DL(N);
38933
38934 if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
38935 return SDValue();
38936
38937 SDValue N00 = N0.getOperand(0);
38938 SDValue N10 = N1.getOperand(0);
38939 EVT N00Type = N00.getValueType();
38940 EVT N10Type = N10.getValueType();
38941
38942 // Ensure that both types are the same and are legal scalar fp types.
38943 if (N00Type != N10Type ||
38944 !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
38945 (Subtarget.hasSSE2() && N00Type == MVT::f64)))
38946 return SDValue();
38947
38948 unsigned FPOpcode;
38949 switch (N->getOpcode()) {
38950 default: llvm_unreachable("Unexpected input node for FP logic conversion")::llvm::llvm_unreachable_internal("Unexpected input node for FP logic conversion"
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 38950)
;
38951 case ISD::AND: FPOpcode = X86ISD::FAND; break;
38952 case ISD::OR: FPOpcode = X86ISD::FOR; break;
38953 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
38954 }
38955
38956 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
38957 return DAG.getBitcast(VT, FPLogic);
38958}
38959
38960/// If this is a zero/all-bits result that is bitwise-anded with a low bits
38961/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
38962/// with a shift-right to eliminate loading the vector constant mask value.
38963static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
38964 const X86Subtarget &Subtarget) {
38965 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
38966 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
38967 EVT VT0 = Op0.getValueType();
38968 EVT VT1 = Op1.getValueType();
38969
38970 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
38971 return SDValue();
38972
38973 APInt SplatVal;
38974 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
38975 !SplatVal.isMask())
38976 return SDValue();
38977
38978 // Don't prevent creation of ANDN.
38979 if (isBitwiseNot(Op0))
38980 return SDValue();
38981
38982 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
38983 return SDValue();
38984
38985 unsigned EltBitWidth = VT0.getScalarSizeInBits();
38986 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
38987 return SDValue();
38988
38989 SDLoc DL(N);
38990 unsigned ShiftVal = SplatVal.countTrailingOnes();
38991 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
38992 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
38993 return DAG.getBitcast(N->getValueType(0), Shift);
38994}
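
A small standalone sketch of why the mask can be replaced by a shift here: when every lane is known to be all-zeros or all-ones (ComputeNumSignBits equals the element width), ANDing with a low-bits mask of K ones gives the same result as a logical right shift by (width - K). This is a 32-bit scalar model, not code from the file.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Lanes[] = {0u, 0xFFFFFFFFu}; // all-zeros or all-ones lanes
  const unsigned Ks[] = {1u, 4u, 8u, 16u};    // mask lengths (a low-bits splat mask)
  for (uint32_t V : Lanes)
    for (unsigned K : Ks) {
      uint32_t Mask = (1u << K) - 1;
      assert((V & Mask) == (V >> (32 - K))); // AND with mask == logical shift right
    }
  return 0;
}
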
38995
38996// Get the index node from the lowered DAG of a GEP IR instruction with one
38997// indexing dimension.
38998static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
38999 if (Ld->isIndexed())
39000 return SDValue();
39001
39002 SDValue Base = Ld->getBasePtr();
39003
39004 if (Base.getOpcode() != ISD::ADD)
39005 return SDValue();
39006
39007 SDValue ShiftedIndex = Base.getOperand(0);
39008
39009 if (ShiftedIndex.getOpcode() != ISD::SHL)
39010 return SDValue();
39011
39012 return ShiftedIndex.getOperand(0);
39013
39014}
39015
39016static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
39017 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
39018 switch (VT.getSizeInBits()) {
39019 default: return false;
39020 case 64: return Subtarget.is64Bit() ? true : false;
39021 case 32: return true;
39022 }
39023 }
39024 return false;
39025}
39026
39027 // This function recognizes cases where the X86 bzhi instruction can replace
39028 // an 'and-load' sequence.
39029 // When an integer value is loaded from an array of constants defined as
39030 // follows:
39031 //
39032 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
39033 //
39034 // and the result is then bitwise-anded with another input, the sequence is
39035 // equivalent to performing bzhi (zero high bits) on that input with the
39036 // same index as the load.
39037static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
39038 const X86Subtarget &Subtarget) {
39039 MVT VT = Node->getSimpleValueType(0);
39040 SDLoc dl(Node);
39041
39042 // Check if subtarget has BZHI instruction for the node's type
39043 if (!hasBZHI(Subtarget, VT))
39044 return SDValue();
39045
39046 // Try matching the pattern for both operands.
39047 for (unsigned i = 0; i < 2; i++) {
39048 SDValue N = Node->getOperand(i);
39049 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
39050
39051 // continue if the operand is not a load instruction
39052 if (!Ld)
39053 return SDValue();
39054
39055 const Value *MemOp = Ld->getMemOperand()->getValue();
39056
39057 if (!MemOp)
39058 return SDValue();
39059
39060 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
39061 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
39062 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
39063
39064 Constant *Init = GV->getInitializer();
39065 Type *Ty = Init->getType();
39066 if (!isa<ConstantDataArray>(Init) ||
39067 !Ty->getArrayElementType()->isIntegerTy() ||
39068 Ty->getArrayElementType()->getScalarSizeInBits() !=
39069 VT.getSizeInBits() ||
39070 Ty->getArrayNumElements() >
39071 Ty->getArrayElementType()->getScalarSizeInBits())
39072 continue;
39073
39074 // Check if the array's constant elements are suitable to our case.
39075 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
39076 bool ConstantsMatch = true;
39077 for (uint64_t j = 0; j < ArrayElementCount; j++) {
39078 ConstantInt *Elem =
39079 dyn_cast<ConstantInt>(Init->getAggregateElement(j));
39080 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
39081 ConstantsMatch = false;
39082 break;
39083 }
39084 }
39085 if (!ConstantsMatch)
39086 continue;
39087
39088 // Do the transformation (For 32-bit type):
39089 // -> (and (load arr[idx]), inp)
39090 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
39091 // that will be replaced with one bzhi instruction.
39092 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
39093 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
39094
39095 // Get the Node which indexes into the array.
39096 SDValue Index = getIndexFromUnindexedLoad(Ld);
39097 if (!Index)
39098 return SDValue();
39099 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
39100
39101 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
39102 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
39103
39104 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
39105 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
39106
39107 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
39108 }
39109 }
39110 }
39111 }
39112 return SDValue();
39113}
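
A standalone scalar model of the equivalence the BZHI combine exploits, for the 32-bit case (plain C++, illustrative only; index 0 is skipped here solely because a right shift by 32 would be undefined in C++):

#include <cassert>
#include <cstdint>

int main() {
  // arr[j] = 2^j - 1, the constant-table shape the combine above requires.
  uint32_t Arr[32];
  for (unsigned J = 0; J < 32; ++J)
    Arr[J] = (1u << J) - 1; // 0x0, 0x1, 0x3, 0x7, ...
  const uint32_t X = 0xDEADBEEFu;
  // Loading arr[idx] and ANDing equals masking X down to its low idx bits,
  // i.e. the value BZHI would produce.
  for (unsigned Idx = 1; Idx < 32; ++Idx)
    assert((Arr[Idx] & X) == (X & (0xFFFFFFFFu >> (32 - Idx))));
  return 0;
}
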
39114
39115// Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
39116// Turn it into series of XORs and a setnp.
39117static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
39118 const X86Subtarget &Subtarget) {
39119 EVT VT = N->getValueType(0);
39120
39121 // We only support 64-bit and 32-bit. 64-bit requires special handling
39122 // unless the 64-bit popcnt instruction is legal.
39123 if (VT != MVT::i32 && VT != MVT::i64)
39124 return SDValue();
39125
39126 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39127 if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT))
39128 return SDValue();
39129
39130 SDValue N0 = N->getOperand(0);
39131 SDValue N1 = N->getOperand(1);
39132
39133 // LHS needs to be a single use CTPOP.
39134 if (N0.getOpcode() != ISD::CTPOP || !N0.hasOneUse())
39135 return SDValue();
39136
39137 // RHS needs to be 1.
39138 if (!isOneConstant(N1))
39139 return SDValue();
39140
39141 SDLoc DL(N);
39142 SDValue X = N0.getOperand(0);
39143
39144 // If this is 64-bit, it's always best to xor the two 32-bit pieces together
39145 // even if we have popcnt.
39146 if (VT == MVT::i64) {
39147 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
39148 DAG.getNode(ISD::SRL, DL, VT, X,
39149 DAG.getConstant(32, DL, MVT::i8)));
39150 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
39151 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
39152 // Generate a 32-bit parity idiom. This will bring us back here if we need
39153 // to expand it too.
39154 SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32,
39155 DAG.getNode(ISD::CTPOP, DL, MVT::i32, X),
39156 DAG.getConstant(1, DL, MVT::i32));
39157 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Parity);
39158 }
39159   assert(VT == MVT::i32 && "Unexpected VT!");
39160
39161 // Xor the high and low 16-bits together using a 32-bit operation.
39162 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, VT, X,
39163 DAG.getConstant(16, DL, MVT::i8));
39164 X = DAG.getNode(ISD::XOR, DL, VT, X, Hi16);
39165
39166 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
39167 // This should allow an h-reg to be used to save a shift.
39168 // FIXME: We only get an h-reg in 32-bit mode.
39169 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
39170 DAG.getNode(ISD::SRL, DL, VT, X,
39171 DAG.getConstant(8, DL, MVT::i8)));
39172 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
39173 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
39174 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
39175
39176 // Copy the inverse of the parity flag into a register with setcc.
39177 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
39178 // Zero extend to original type.
39179 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp);
39180}
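
The XOR-folding in the parity combine above rests on the fact that XOR-ing the halves of a value preserves its parity. A minimal standalone check (plain C++ with a loop-based reference parity rather than any LLVM helper; illustrative only):

#include <cassert>
#include <cstdint>

// Bit-by-bit parity, used only as a reference here.
static unsigned parityRef(uint64_t V) {
  unsigned P = 0;
  for (; V; V >>= 1)
    P ^= (unsigned)(V & 1);
  return P;
}

int main() {
  const uint64_t Inputs[] = {0x0123456789ABCDEFull, 0xFFFFFFFFFFFFFFFFull, 0x1ull, 0x0ull};
  for (uint64_t X : Inputs) {
    uint32_t Folded32 = (uint32_t)(X >> 32) ^ (uint32_t)X;               // 64 -> 32
    uint16_t Folded16 = (uint16_t)(Folded32 >> 16) ^ (uint16_t)Folded32; // 32 -> 16
    uint8_t Folded8 = (uint8_t)(Folded16 >> 8) ^ (uint8_t)Folded16;      // 16 -> 8
    assert(parityRef(X) == parityRef(Folded32));
    assert(parityRef(X) == parityRef(Folded8));
  }
  return 0;
}
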
39181
39182
39183 // Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
39184 // where C is a mask containing the same number of bits as the setcc and
39185 // where the setcc will freely zero the upper bits of the k-register. We can
39186 // replace the undef in the concat with 0s and remove the AND. This mainly
39187 // helps with v2i1/v4i1 setcc being cast to scalar.
39188static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
39189 const X86Subtarget &Subtarget) {
39190   assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
39191
39192 EVT VT = N->getValueType(0);
39193
39194 // Make sure this is an AND with constant. We will check the value of the
39195 // constant later.
39196 if (!isa<ConstantSDNode>(N->getOperand(1)))
39197 return SDValue();
39198
39199 // This is implied by the ConstantSDNode.
39200   assert(!VT.isVector() && "Expected scalar VT!");
39201
39202 if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
39203 !N->getOperand(0).hasOneUse() ||
39204 !N->getOperand(0).getOperand(0).hasOneUse())
39205 return SDValue();
39206
39207 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39208 SDValue Src = N->getOperand(0).getOperand(0);
39209 EVT SrcVT = Src.getValueType();
39210 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
39211 !TLI.isTypeLegal(SrcVT))
39212 return SDValue();
39213
39214 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
39215 return SDValue();
39216
39217 // We only care about the first subvector of the concat, we expect the
39218 // other subvectors to be ignored due to the AND if we make the change.
39219 SDValue SubVec = Src.getOperand(0);
39220 EVT SubVecVT = SubVec.getValueType();
39221
39222 // First subvector should be a setcc with a legal result type. The RHS of the
39223 // AND should be a mask with this many bits.
39224 if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
39225 !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
39226 return SDValue();
39227
39228 EVT SetccVT = SubVec.getOperand(0).getValueType();
39229 if (!TLI.isTypeLegal(SetccVT) ||
39230 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
39231 return SDValue();
39232
39233 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
39234 return SDValue();
39235
39236 // We passed all the checks. Rebuild the concat_vectors with zeroes
39237 // and cast it back to VT.
39238 SDLoc dl(N);
39239 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
39240 DAG.getConstant(0, dl, SubVecVT));
39241 Ops[0] = SubVec;
39242 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
39243 Ops);
39244 return DAG.getBitcast(VT, Concat);
39245}
39246
39247static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
39248 TargetLowering::DAGCombinerInfo &DCI,
39249 const X86Subtarget &Subtarget) {
39250 EVT VT = N->getValueType(0);
39251
39252 // If this is SSE1 only convert to FAND to avoid scalarization.
39253 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
39254 return DAG.getBitcast(
39255 MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
39256 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
39257 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
39258 }
39259
39260 // Use a 32-bit and+zext if upper bits known zero.
39261 if (VT == MVT::i64 && Subtarget.is64Bit() &&
39262 !isa<ConstantSDNode>(N->getOperand(1))) {
39263 APInt HiMask = APInt::getHighBitsSet(64, 32);
39264 if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
39265 DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
39266 SDLoc dl(N);
39267 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
39268 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
39269 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
39270 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
39271 }
39272 }
39273
39274 // This must be done before legalization has expanded the ctpop.
39275 if (SDValue V = combineParity(N, DAG, Subtarget))
39276 return V;
39277
39278 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
39279 // TODO: Support multiple SrcOps.
39280 if (VT == MVT::i1) {
39281 SmallVector<SDValue, 2> SrcOps;
39282 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
39283 SrcOps.size() == 1) {
39284 SDLoc dl(N);
39285 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39286 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
39287 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
39288 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
39289 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
39290 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
39291 if (Mask) {
39292 APInt AllBits = APInt::getAllOnesValue(NumElts);
39293 return DAG.getSetCC(dl, MVT::i1, Mask,
39294 DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ);
39295 }
39296 }
39297 }
39298
39299 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
39300 return V;
39301
39302 if (DCI.isBeforeLegalizeOps())
39303 return SDValue();
39304
39305 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
39306 return R;
39307
39308 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
39309 return FPLogic;
39310
39311 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
39312 return R;
39313
39314 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
39315 return ShiftRight;
39316
39317 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
39318 return R;
39319
39320 // Attempt to recursively combine a bitmask AND with shuffles.
39321 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
39322 SDValue Op(N, 0);
39323 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
39324 return Res;
39325 }
39326
39327 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
39328 if ((VT.getScalarSizeInBits() % 8) == 0 &&
39329 N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
39330 isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
39331 SDValue BitMask = N->getOperand(1);
39332 SDValue SrcVec = N->getOperand(0).getOperand(0);
39333 EVT SrcVecVT = SrcVec.getValueType();
39334
39335 // Check that the constant bitmask masks whole bytes.
39336 APInt UndefElts;
39337 SmallVector<APInt, 64> EltBits;
39338 if (VT == SrcVecVT.getScalarType() &&
39339 N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
39340 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
39341 llvm::all_of(EltBits, [](APInt M) {
39342 return M.isNullValue() || M.isAllOnesValue();
39343 })) {
39344 unsigned NumElts = SrcVecVT.getVectorNumElements();
39345 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
39346 unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
39347
39348 // Create a root shuffle mask from the byte mask and the extracted index.
39349 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
39350 for (unsigned i = 0; i != Scale; ++i) {
39351 if (UndefElts[i])
39352 continue;
39353 int VecIdx = Scale * Idx + i;
39354 ShuffleMask[VecIdx] =
39355 EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
39356 }
39357
39358 if (SDValue Shuffle = combineX86ShufflesRecursively(
39359 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
39360 /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
39361 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
39362 N->getOperand(0).getOperand(1));
39363 }
39364 }
39365
39366 return SDValue();
39367}
39368
39369// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
39370static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
39371 const X86Subtarget &Subtarget) {
39372   assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
39373
39374 MVT VT = N->getSimpleValueType(0);
39375 if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
39376 return SDValue();
39377
39378 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
39379 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
39380 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
39381 return SDValue();
39382
39383 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
39384 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
39385 bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
39386 Subtarget.hasVLX();
39387 if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
39388 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
39389 return SDValue();
39390
39391 // Attempt to extract constant byte masks.
39392 APInt UndefElts0, UndefElts1;
39393 SmallVector<APInt, 32> EltBits0, EltBits1;
39394 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
39395 false, false))
39396 return SDValue();
39397 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
39398 false, false))
39399 return SDValue();
39400
39401 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
39402 // TODO - add UNDEF elts support.
39403 if (UndefElts0[i] || UndefElts1[i])
39404 return SDValue();
39405 if (EltBits0[i] != ~EltBits1[i])
39406 return SDValue();
39407 }
39408
39409 SDLoc DL(N);
39410 SDValue X = N->getOperand(0);
39411 SDValue Y =
39412 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
39413 DAG.getBitcast(VT, N1.getOperand(0)));
39414 return DAG.getNode(ISD::OR, DL, VT, X, Y);
39415}
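
The canonicalization above treats OR(AND(X,C),AND(Y,~C)) as a per-bit select. A tiny standalone example with concrete constants (scalar C++, not from the file):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t X = 0x12345678u, Y = 0x9ABCDEF0u, C = 0x00FF00FFu;
  // Bits of the result come from X where C has a 1 and from Y where C has a 0.
  uint32_t Blend = (X & C) | (Y & ~C);
  assert(Blend == 0x9A34DE78u);
  // Rewriting (Y & ~C) as an ANDN-style (~C & Y) lets both halves share the
  // single constant C, which is the point of the canonicalization.
  assert(((X & C) | (~C & Y)) == Blend);
  return 0;
}
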
39416
39417// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
39418static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
39419 if (N->getOpcode() != ISD::OR)
39420 return false;
39421
39422 SDValue N0 = N->getOperand(0);
39423 SDValue N1 = N->getOperand(1);
39424
39425 // Canonicalize AND to LHS.
39426 if (N1.getOpcode() == ISD::AND)
39427 std::swap(N0, N1);
39428
39429 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
39430 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
39431 return false;
39432
39433 Mask = N1.getOperand(0);
39434 X = N1.getOperand(1);
39435
39436 // Check to see if the mask appeared in both the AND and ANDNP.
39437 if (N0.getOperand(0) == Mask)
39438 Y = N0.getOperand(1);
39439 else if (N0.getOperand(1) == Mask)
39440 Y = N0.getOperand(0);
39441 else
39442 return false;
39443
39444 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
39445 // ANDNP combine allows other combines to happen that prevent matching.
39446 return true;
39447}
39448
39449// Try to match:
39450// (or (and (M, (sub 0, X)), (pandn M, X)))
39451// which is a special case of vselect:
39452// (vselect M, (sub 0, X), X)
39453// Per:
39454// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
39455// We know that, if fNegate is 0 or 1:
39456// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
39457//
39458// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
39459// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
39460// ( M ? -X : X) == ((X ^ M ) + (M & 1))
39461// This lets us transform our vselect to:
39462// (add (xor X, M), (and M, 1))
39463// And further to:
39464// (sub (xor X, M), M)
39465static SDValue combineLogicBlendIntoConditionalNegate(
39466 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
39467 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
39468 EVT MaskVT = Mask.getValueType();
39469   assert(MaskVT.isInteger() &&
39470          DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
39471          "Mask must be zero/all-bits");
39472
39473 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
39474 return SDValue();
39475 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
39476 return SDValue();
39477
39478 auto IsNegV = [](SDNode *N, SDValue V) {
39479 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
39480 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
39481 };
39482
39483 SDValue V;
39484 if (IsNegV(Y.getNode(), X))
39485 V = X;
39486 else if (IsNegV(X.getNode(), Y))
39487 V = Y;
39488 else
39489 return SDValue();
39490
39491 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
39492 SDValue SubOp2 = Mask;
39493
39494 // If the negate was on the false side of the select, then
39495 // the operands of the SUB need to be swapped. PR 27251.
39496 // This is because the pattern being matched above is
39497 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
39498 // but if the pattern matched was
39499 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
39500 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
39501 // pattern also needs to be a negation of the replacement pattern above.
39502 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
39503 // sub accomplishes the negation of the replacement pattern.
39504 if (V == Y)
39505 std::swap(SubOp1, SubOp2);
39506
39507 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
39508 return DAG.getBitcast(VT, Res);
39509}
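
A standalone check of the conditional-negate identity quoted in the comment above, for a lane-sized mask that is either all-zeros or all-ones. Unsigned 32-bit arithmetic is used so the two's-complement wraparound is well defined; this is an illustration, not code from the file.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Xs[] = {0u, 1u, 0x80000000u, 0xDEADBEEFu};
  const uint32_t Masks[] = {0u, 0xFFFFFFFFu}; // all-zeros or all-ones lane
  for (uint32_t X : Xs)
    for (uint32_t M : Masks) {
      uint32_t Selected = M ? (0u - X) : X; // (M ? -X : X)
      assert(((X ^ M) - M) == Selected);    // == (sub (xor X, M), M)
    }
  return 0;
}
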
39510
39511// Try to fold:
39512// (or (and (m, y), (pandn m, x)))
39513// into:
39514// (vselect m, x, y)
39515// As a special case, try to fold:
39516// (or (and (m, (sub 0, x)), (pandn m, x)))
39517// into:
39518// (sub (xor X, M), M)
39519static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
39520 const X86Subtarget &Subtarget) {
39521   assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
39522
39523 EVT VT = N->getValueType(0);
39524 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
39525 (VT.is256BitVector() && Subtarget.hasInt256())))
39526 return SDValue();
39527
39528 SDValue X, Y, Mask;
39529 if (!matchLogicBlend(N, X, Y, Mask))
39530 return SDValue();
39531
39532 // Validate that X, Y, and Mask are bitcasts, and see through them.
39533 Mask = peekThroughBitcasts(Mask);
39534 X = peekThroughBitcasts(X);
39535 Y = peekThroughBitcasts(Y);
39536
39537 EVT MaskVT = Mask.getValueType();
39538 unsigned EltBits = MaskVT.getScalarSizeInBits();
39539
39540 // TODO: Attempt to handle floating point cases as well?
39541 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
39542 return SDValue();
39543
39544 SDLoc DL(N);
39545
39546 // Attempt to combine to conditional negate: (sub (xor X, M), M)
39547 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
39548 DAG, Subtarget))
39549 return Res;
39550
39551 // PBLENDVB is only available on SSE 4.1.
39552 if (!Subtarget.hasSSE41())
39553 return SDValue();
39554
39555 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
39556
39557 X = DAG.getBitcast(BlendVT, X);
39558 Y = DAG.getBitcast(BlendVT, Y);
39559 Mask = DAG.getBitcast(BlendVT, Mask);
39560 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
39561 return DAG.getBitcast(VT, Mask);
39562}
39563
39564// Helper function for combineOrCmpEqZeroToCtlzSrl
39565// Transforms:
39566// seteq(cmp x, 0)
39567// into:
39568// srl(ctlz x), log2(bitsize(x))
39569// Input pattern is checked by caller.
39570static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
39571 SelectionDAG &DAG) {
39572 SDValue Cmp = Op.getOperand(1);
39573 EVT VT = Cmp.getOperand(0).getValueType();
39574 unsigned Log2b = Log2_32(VT.getSizeInBits());
39575 SDLoc dl(Op);
39576 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
39577 // The result of the shift is true or false, and on X86, the 32-bit
39578 // encoding of shr and lzcnt is more desirable.
39579 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
39580 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
39581 DAG.getConstant(Log2b, dl, MVT::i8));
39582 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
39583}
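
A standalone scalar model of the lowering above for the 32-bit case: with an LZCNT-style count-leading-zeros that returns 32 for a zero input, "x == 0" is exactly "clz(x) >> 5", because only clz(0) == 32 has bit 5 set. Illustrative C++ only, with a hand-rolled clz so the zero case stays well defined without compiler builtins.

#include <cassert>
#include <cstdint>

// Counts leading zero bits; returns 32 when V == 0, matching LZCNT.
static unsigned clz32(uint32_t V) {
  unsigned N = 0;
  for (uint32_t Bit = 0x80000000u; Bit != 0 && (V & Bit) == 0; Bit >>= 1)
    ++N;
  return N;
}

int main() {
  const uint32_t Inputs[] = {0u, 1u, 0x00FF0000u, 0x80000000u, 0xFFFFFFFFu};
  for (uint32_t X : Inputs)
    assert((unsigned)(X == 0) == (clz32(X) >> 5)); // x == 0  <=>  clz(x) >> log2(32)
  return 0;
}
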
39584
39585// Try to transform:
39586// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
39587// into:
39588// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
39589// Will also attempt to match more generic cases, eg:
39590// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
39591// Only applies if the target supports the FastLZCNT feature.
39592static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
39593 TargetLowering::DAGCombinerInfo &DCI,
39594 const X86Subtarget &Subtarget) {
39595 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
39596 return SDValue();
39597
39598 auto isORCandidate = [](SDValue N) {
39599 return (N->getOpcode() == ISD::OR && N->hasOneUse());
39600 };
39601
39602 // Check the zero extend is extending to 32-bit or more. The code generated by
39603 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
39604 // instructions to clear the upper bits.
39605 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
39606 !isORCandidate(N->getOperand(0)))
39607 return SDValue();
39608
39609 // Check the node matches: setcc(eq, cmp 0)
39610 auto isSetCCCandidate = [](SDValue N) {
39611 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
39612 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
39613 N->getOperand(1).getOpcode() == X86ISD::CMP &&
39614 isNullConstant(N->getOperand(1).getOperand(1)) &&
39615 N->getOperand(1).getValueType().bitsGE(MVT::i32);
39616 };
39617
39618 SDNode *OR = N->getOperand(0).getNode();
39619 SDValue LHS = OR->getOperand(0);
39620 SDValue RHS = OR->getOperand(1);
39621
39622 // Save nodes matching or(or, setcc(eq, cmp 0)).
39623 SmallVector<SDNode *, 2> ORNodes;
39624 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
39625 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
39626 ORNodes.push_back(OR);
39627 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
39628 LHS = OR->getOperand(0);
39629 RHS = OR->getOperand(1);
39630 }
39631
39632 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
39633 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
39634 !isORCandidate(SDValue(OR, 0)))
39635 return SDValue();
39636
39637 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
39638 // to
39639 // or(srl(ctlz),srl(ctlz)).
39640 // The dag combiner can then fold it into:
39641 // srl(or(ctlz, ctlz)).
39642 EVT VT = OR->getValueType(0);
39643 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
39644 SDValue Ret, NewRHS;
39645 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
39646 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
39647
39648 if (!Ret)
39649 return SDValue();
39650
39651 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
39652 while (ORNodes.size() > 0) {
39653 OR = ORNodes.pop_back_val();
39654 LHS = OR->getOperand(0);
39655 RHS = OR->getOperand(1);
39656 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
39657 if (RHS->getOpcode() == ISD::OR)
39658 std::swap(LHS, RHS);
39659 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
39660 if (!NewRHS)
39661 return SDValue();
39662 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
39663 }
39664
39665 if (Ret)
39666 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
39667
39668 return Ret;
39669}
39670
39671static SDValue combineOrShiftToFunnelShift(SDNode *N, SelectionDAG &DAG,
39672 const X86Subtarget &Subtarget) {
39673   assert(N->getOpcode() == ISD::OR && "Expected ISD::OR node");
39674 SDValue N0 = N->getOperand(0);
39675 SDValue N1 = N->getOperand(1);
39676 EVT VT = N->getValueType(0);
39677 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39678
39679 if (!TLI.isOperationLegalOrCustom(ISD::FSHL, VT) ||
39680 !TLI.isOperationLegalOrCustom(ISD::FSHR, VT))
39681 return SDValue();
39682
39683 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
39684 bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
39685 unsigned Bits = VT.getScalarSizeInBits();
39686
39687 // SHLD/SHRD instructions have lower register pressure, but on some
39688 // platforms they have higher latency than the equivalent
39689 // series of shifts/or that would otherwise be generated.
39690 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
39691 // have higher latencies and we are not optimizing for size.
39692 if (!OptForSize && Subtarget.isSHLDSlow())
39693 return SDValue();
39694
39695 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
39696 std::swap(N0, N1);
39697 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
39698 return SDValue();
39699 if (!N0.hasOneUse() || !N1.hasOneUse())
39700 return SDValue();
39701
39702 EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
39703
39704 SDValue ShAmt0 = N0.getOperand(1);
39705 if (ShAmt0.getValueType() != ShiftVT)
39706 return SDValue();
39707 SDValue ShAmt1 = N1.getOperand(1);
39708 if (ShAmt1.getValueType() != ShiftVT)
39709 return SDValue();
39710
39711 // Peek through any modulo shift masks.
39712 SDValue ShMsk0;
39713 if (ShAmt0.getOpcode() == ISD::AND &&
39714 isa<ConstantSDNode>(ShAmt0.getOperand(1)) &&
39715 ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) {
39716 ShMsk0 = ShAmt0;
39717 ShAmt0 = ShAmt0.getOperand(0);
39718 }
39719 SDValue ShMsk1;
39720 if (ShAmt1.getOpcode() == ISD::AND &&
39721 isa<ConstantSDNode>(ShAmt1.getOperand(1)) &&
39722 ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) {
39723 ShMsk1 = ShAmt1;
39724 ShAmt1 = ShAmt1.getOperand(0);
39725 }
39726
39727 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
39728 ShAmt0 = ShAmt0.getOperand(0);
39729 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
39730 ShAmt1 = ShAmt1.getOperand(0);
39731
39732 SDLoc DL(N);
39733 unsigned Opc = ISD::FSHL;
39734 SDValue Op0 = N0.getOperand(0);
39735 SDValue Op1 = N1.getOperand(0);
39736 if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) {
39737 Opc = ISD::FSHR;
39738 std::swap(Op0, Op1);
39739 std::swap(ShAmt0, ShAmt1);
39740 std::swap(ShMsk0, ShMsk1);
39741 }
39742
39743 auto GetFunnelShift = [&DAG, &DL, VT, Opc, &ShiftVT](SDValue Op0, SDValue Op1,
39744 SDValue Amt) {
39745 if (Opc == ISD::FSHR)
39746 std::swap(Op0, Op1);
39747 return DAG.getNode(Opc, DL, VT, Op0, Op1,
39748 DAG.getNode(ISD::TRUNCATE, DL, ShiftVT, Amt));
39749 };
39750
39751 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C )
39752 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C )
39753 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C )
39754 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C )
39755 // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C )
39756 // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C )
39757 if (ShAmt1.getOpcode() == ISD::SUB) {
39758 SDValue Sum = ShAmt1.getOperand(0);
39759 if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) {
39760 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
39761 if (ShAmt1Op1.getOpcode() == ISD::AND &&
39762 isa<ConstantSDNode>(ShAmt1Op1.getOperand(1)) &&
39763 ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) {
39764 ShMsk1 = ShAmt1Op1;
39765 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
39766 }
39767 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
39768 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
39769 if ((SumC->getAPIntValue() == Bits ||
39770 (SumC->getAPIntValue() == 0 && ShMsk1)) &&
39771 ShAmt1Op1 == ShAmt0)
39772 return GetFunnelShift(Op0, Op1, ShAmt0);
39773 }
39774 } else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
39775 auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
39776 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
39777 return GetFunnelShift(Op0, Op1, ShAmt0);
39778 } else if (ShAmt1.getOpcode() == ISD::XOR) {
39779 SDValue Mask = ShAmt1.getOperand(1);
39780 if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
39781 unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL);
39782 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
39783 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
39784 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
39785 if (MaskC->getSExtValue() == (Bits - 1) &&
39786 (ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
39787 if (Op1.getOpcode() == InnerShift &&
39788 isa<ConstantSDNode>(Op1.getOperand(1)) &&
39789 Op1.getConstantOperandAPInt(1) == 1) {
39790 return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
39791 }
39792 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
39793 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
39794 Op1.getOperand(0) == Op1.getOperand(1)) {
39795 return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
39796 }
39797 }
39798 }
39799 }
39800
39801 return SDValue();
39802}
39803
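// A minimal scalar sketch of the 32-bit pattern combineOrShiftToFunnelShift
// matches (the helper name is illustrative only and not part of this file):
// FSHL concatenates X:Y and shifts left by C, so for C in [1, 31] it equals
// OR(SHL(X, C), SRL(Y, 32 - C)), and for C == 0 it is just X.
static unsigned illustrativeFshl32(unsigned X, unsigned Y, unsigned C) {
  C &= 31; // modulo shift mask, matched above by peeking through ISD::AND
  if (C == 0)
    return X;
  return (X << C) | (Y >> (32 - C));
}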
39804static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
39805 TargetLowering::DAGCombinerInfo &DCI,
39806 const X86Subtarget &Subtarget) {
39807 SDValue N0 = N->getOperand(0);
39808 SDValue N1 = N->getOperand(1);
39809 EVT VT = N->getValueType(0);
39810
39811 // If this is SSE1 only convert to FOR to avoid scalarization.
39812 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
39813 return DAG.getBitcast(MVT::v4i32,
39814 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
39815 DAG.getBitcast(MVT::v4f32, N0),
39816 DAG.getBitcast(MVT::v4f32, N1)));
39817 }
39818
39819 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
39820 // TODO: Support multiple SrcOps.
39821 if (VT == MVT::i1) {
39822 SmallVector<SDValue, 2> SrcOps;
39823 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) &&
39824 SrcOps.size() == 1) {
39825 SDLoc dl(N);
39826 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39827 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
39828 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
39829 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
39830 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
39831 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
39832 if (Mask) {
39833 APInt AllBits = APInt::getNullValue(NumElts);
39834 return DAG.getSetCC(dl, MVT::i1, Mask,
39835 DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE);
39836 }
39837 }
39838 }
39839
39840 if (DCI.isBeforeLegalizeOps())
39841 return SDValue();
39842
39843 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
39844 return R;
39845
39846 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
39847 return FPLogic;
39848
39849 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
39850 return R;
39851
39852 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
39853 return R;
39854
39855 if (SDValue R = combineOrShiftToFunnelShift(N, DAG, Subtarget))
39856 return R;
39857
39858 // Attempt to recursively combine an OR of shuffles.
39859 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
39860 SDValue Op(N, 0);
39861 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
39862 return Res;
39863 }
39864
39865 return SDValue();
39866}
39867
39868/// Try to turn tests against the signbit in the form of:
39869/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
39870/// into:
39871/// SETGT(X, -1)
39872static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
39873 // This is only worth doing if the output type is i8 or i1.
39874 EVT ResultType = N->getValueType(0);
39875 if (ResultType != MVT::i8 && ResultType != MVT::i1)
39876 return SDValue();
39877
39878 SDValue N0 = N->getOperand(0);
39879 SDValue N1 = N->getOperand(1);
39880
39881 // We should be performing an xor against a truncated shift.
39882 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
39883 return SDValue();
39884
39885 // Make sure we are performing an xor against one.
39886 if (!isOneConstant(N1))
39887 return SDValue();
39888
39889 // SetCC on x86 zero extends so only act on this if it's a logical shift.
39890 SDValue Shift = N0.getOperand(0);
39891 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
39892 return SDValue();
39893
39894 // Make sure we are truncating from one of i16, i32 or i64.
39895 EVT ShiftTy = Shift.getValueType();
39896 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
39897 return SDValue();
39898
39899 // Make sure the shift amount extracts the sign bit.
39900 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
39901 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
39902 return SDValue();
39903
39904 // Create a greater-than comparison against -1.
39905 // N.B. Using SETGE against 0 works but we want a canonical-looking
39906 // comparison; using SETGT matches up with what TranslateX86CC does.
39907 SDLoc DL(N);
39908 SDValue ShiftOp = Shift.getOperand(0);
39909 EVT ShiftOpTy = ShiftOp.getValueType();
39910 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39911 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
39912 *DAG.getContext(), ResultType);
39913 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
39914 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
39915 if (SetCCResultType != ResultType)
39916 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
39917 return Cond;
39918}
39919
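// A minimal scalar sketch of the equivalence behind foldXorTruncShiftIntoCmp
// (the helper name is illustrative only and not part of this file): inverting
// the extracted sign bit of X is the same as testing X > -1.
static bool illustrativeSignBitTestsAgree(int X) {
  bool ViaShift = ((static_cast<unsigned>(X) >> 31) ^ 1u) != 0; // xor(trunc(srl(X, 31)), 1)
  bool ViaSetGT = X > -1;                                       // setgt(X, -1)
  return ViaShift == ViaSetGT; // holds for every 32-bit X
}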
39920/// Turn vector tests of the signbit in the form of:
39921/// xor (sra X, elt_size(X)-1), -1
39922/// into:
39923/// pcmpgt X, -1
39924///
39925/// This should be called before type legalization because the pattern may not
39926/// persist after that.
39927static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
39928 const X86Subtarget &Subtarget) {
39929 EVT VT = N->getValueType(0);
39930 if (!VT.isSimple())
39931 return SDValue();
39932
39933 switch (VT.getSimpleVT().SimpleTy) {
39934 default: return SDValue();
39935 case MVT::v16i8:
39936 case MVT::v8i16:
39937 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
39938 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
39939 case MVT::v32i8:
39940 case MVT::v16i16:
39941 case MVT::v8i32:
39942 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
39943 }
39944
39945 // There must be a shift right algebraic before the xor, and the xor must be a
39946 // 'not' operation.
39947 SDValue Shift = N->getOperand(0);
39948 SDValue Ones = N->getOperand(1);
39949 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
39950 !ISD::isBuildVectorAllOnes(Ones.getNode()))
39951 return SDValue();
39952
39953 // The shift should be smearing the sign bit across each vector element.
39954 auto *ShiftAmt =
39955 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
39956 if (!ShiftAmt ||
39957 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
39958 return SDValue();
39959
39960 // Create a greater-than comparison against -1. We don't use the more obvious
39961 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
39962 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
39963}
39964
39965/// Detect patterns of truncation with unsigned saturation:
39966///
39967/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
39968/// Return the source value x to be truncated or SDValue() if the pattern was
39969/// not matched.
39970///
39971/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
39972/// where C1 >= 0 and C2 is unsigned max of destination type.
39973///
39974/// (truncate (smax (smin (x, C2), C1)) to dest_type)
39975/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
39976///
39977/// These two patterns are equivalent to:
39978/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
39979/// So return the smax(x, C1) value to be truncated or SDValue() if the
39980/// pattern was not matched.
39981static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
39982 const SDLoc &DL) {
39983 EVT InVT = In.getValueType();
39984
39985 // Saturation with truncation. We truncate from InVT to VT.
39986 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
39987 "Unexpected types for truncate operation");
39988
39989 // Match min/max and return limit value as a parameter.
39990 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
39991 if (V.getOpcode() == Opcode &&
39992 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
39993 return V.getOperand(0);
39994 return SDValue();
39995 };
39996
39997 APInt C1, C2;
39998 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
39999 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
40000 // to the element size of the destination type.
40001 if (C2.isMask(VT.getScalarSizeInBits()))
40002 return UMin;
40003
40004 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
40005 if (MatchMinMax(SMin, ISD::SMAX, C1))
40006 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
40007 return SMin;
40008
40009 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
40010 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
40011 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
40012 C2.uge(C1)) {
40013 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
40014 }
40015
40016 return SDValue();
40017}
40018
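// A minimal scalar sketch of pattern 1 above for an i32 -> i8 truncation (the
// helper name is illustrative only and not part of this file): the umin clamp
// against the unsigned max of the destination type makes the truncation
// saturate instead of wrap.
static unsigned char illustrativeUSatTrunc8(unsigned X) {
  unsigned Clamped = X < 255u ? X : 255u;     // umin(x, unsigned_max_of_dest_type)
  return static_cast<unsigned char>(Clamped); // truncate to dest_type
}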
40019/// Detect patterns of truncation with signed saturation:
40020/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
40021/// signed_max_of_dest_type)) to dest_type)
40022/// or:
40023/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
40024/// signed_min_of_dest_type)) to dest_type).
40025/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
40026/// Return the source value to be truncated or SDValue() if the pattern was not
40027/// matched.
40028static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
40029 unsigned NumDstBits = VT.getScalarSizeInBits();
40030 unsigned NumSrcBits = In.getScalarValueSizeInBits();
40031 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
40032
40033 auto MatchMinMax = [](SDValue V, unsigned Opcode,
40034 const APInt &Limit) -> SDValue {
40035 APInt C;
40036 if (V.getOpcode() == Opcode &&
40037 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
40038 return V.getOperand(0);
40039 return SDValue();
40040 };
40041
40042 APInt SignedMax, SignedMin;
40043 if (MatchPackUS) {
40044 SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
40045 SignedMin = APInt(NumSrcBits, 0);
40046 } else {
40047 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
40048 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
40049 }
40050
40051 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
40052 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
40053 return SMax;
40054
40055 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
40056 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
40057 return SMin;
40058
40059 return SDValue();
40060}
40061
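// A minimal scalar sketch of the smin/smax clamp detected above for an
// i32 -> i8 signed saturating truncation (the helper name is illustrative
// only and not part of this file).
static signed char illustrativeSSatTrunc8(int X) {
  int Clamped = X < 127 ? X : 127;           // smin(x, signed_max_of_dest_type)
  Clamped = Clamped > -128 ? Clamped : -128; // smax(..., signed_min_of_dest_type)
  return static_cast<signed char>(Clamped);  // truncate to dest_type
}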
40062static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
40063 SelectionDAG &DAG,
40064 const X86Subtarget &Subtarget) {
40065 if (!Subtarget.hasSSE2() || !VT.isVector())
40066 return SDValue();
40067
40068 EVT SVT = VT.getVectorElementType();
40069 EVT InVT = In.getValueType();
40070 EVT InSVT = InVT.getVectorElementType();
40071
40072 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
40073 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
40074 // and concatenate at the same time. Then we can use a final vpmovuswb to
40075 // clip to 0-255.
40076 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
40077 InVT == MVT::v16i32 && VT == MVT::v16i8) {
40078 if (auto USatVal = detectSSatPattern(In, VT, true)) {
40079 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
40080 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
40081 DL, DAG, Subtarget);
40082 assert(Mid && "Failed to pack!");
40083 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
40084 }
40085 }
40086
40087 // vXi32 truncate instructions are available with AVX512F.
40088 // vXi16 truncate instructions are only available with AVX512BW.
40089 // For 256-bit or smaller vectors, we require VLX.
40090 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
40091 // If the result type is 256 bits or larger and we have disabled 512-bit
40092 // registers, we should go ahead and use the pack instructions if possible.
40093 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
40094 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
40095 (InVT.getSizeInBits() > 128) &&
40096 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
40097 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
40098
40099 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
40100 VT.getSizeInBits() >= 64 &&
40101 (SVT == MVT::i8 || SVT == MVT::i16) &&
40102 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
40103 if (auto USatVal = detectSSatPattern(In, VT, true)) {
40104 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
40105 // Only do this when the result is at least 64 bits or we'll leave
40106 // dangling PACKSSDW nodes.
40107 if (SVT == MVT::i8 && InSVT == MVT::i32) {
40108 EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
40109 VT.getVectorNumElements());
40110 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
40111 DAG, Subtarget);
40112 assert(Mid && "Failed to pack!");
40113 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
40114 Subtarget);
40115 assert(V && "Failed to pack!");
40116 return V;
40117 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
40118 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
40119 Subtarget);
40120 }
40121 if (auto SSatVal = detectSSatPattern(In, VT))
40122 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
40123 Subtarget);
40124 }
40125
40126 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40127 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
40128 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
40129 unsigned TruncOpc = 0;
40130 SDValue SatVal;
40131 if (auto SSatVal = detectSSatPattern(In, VT)) {
40132 SatVal = SSatVal;
40133 TruncOpc = X86ISD::VTRUNCS;
40134 } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
40135 SatVal = USatVal;
40136 TruncOpc = X86ISD::VTRUNCUS;
40137 }
40138 if (SatVal) {
40139 unsigned ResElts = VT.getVectorNumElements();
40140 // If the input type is less than 512 bits and we don't have VLX, we need
40141 // to widen to 512 bits.
40142 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
40143 unsigned NumConcats = 512 / InVT.getSizeInBits();
40144 ResElts *= NumConcats;
40145 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
40146 ConcatOps[0] = SatVal;
40147 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
40148 NumConcats * InVT.getVectorNumElements());
40149 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
40150 }
40151 // Widen the result if it's narrower than 128 bits.
40152 if (ResElts * SVT.getSizeInBits() < 128)
40153 ResElts = 128 / SVT.getSizeInBits();
40154 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
40155 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
40156 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
40157 DAG.getIntPtrConstant(0, DL));
40158 }
40159 }
40160
40161 return SDValue();
40162}
40163
40164/// This function detects the AVG pattern between vectors of unsigned i8/i16,
40165 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
40166/// X86ISD::AVG instruction.
40167static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
40168 const X86Subtarget &Subtarget,
40169 const SDLoc &DL) {
40170 if (!VT.isVector())
40171 return SDValue();
40172 EVT InVT = In.getValueType();
40173 unsigned NumElems = VT.getVectorNumElements();
40174
40175 EVT ScalarVT = VT.getVectorElementType();
40176 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
40177 NumElems >= 2 && isPowerOf2_32(NumElems)))
40178 return SDValue();
40179
40180 // InScalarVT is the intermediate type in the AVG pattern and it should be greater
40181 // than the original input type (i8/i16).
40182 EVT InScalarVT = InVT.getVectorElementType();
40183 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
40184 return SDValue();
40185
40186 if (!Subtarget.hasSSE2())
40187 return SDValue();
40188
40189 // Detect the following pattern:
40190 //
40191 // %1 = zext <N x i8> %a to <N x i32>
40192 // %2 = zext <N x i8> %b to <N x i32>
40193 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
40194 // %4 = add nuw nsw <N x i32> %3, %2
40195 // %5 = lshr <N x i32> %4, <i32 1 x N>
40196 // %6 = trunc <N x i32> %5 to <N x i8>
40197 //
40198 // In AVX512, the last instruction can also be a trunc store.
40199 if (In.getOpcode() != ISD::SRL)
40200 return SDValue();
40201
40202 // A lambda checking whether the given SDValue is a constant vector and each element
40203 // is in the range [Min, Max].
40204 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
40205 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
40206 if (!BV || !BV->isConstant())
40207 return false;
40208 for (SDValue Op : V->ops()) {
40209 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
40210 if (!C)
40211 return false;
40212 const APInt &Val = C->getAPIntValue();
40213 if (Val.ult(Min) || Val.ugt(Max))
40214 return false;
40215 }
40216 return true;
40217 };
40218
40219 // Check if each element of the vector is right-shifted by one.
40220 auto LHS = In.getOperand(0);
40221 auto RHS = In.getOperand(1);
40222 if (!IsConstVectorInRange(RHS, 1, 1))
40223 return SDValue();
40224 if (LHS.getOpcode() != ISD::ADD)
40225 return SDValue();
40226
40227 // Detect a pattern of a + b + 1 where the order doesn't matter.
40228 SDValue Operands[3];
40229 Operands[0] = LHS.getOperand(0);
40230 Operands[1] = LHS.getOperand(1);
40231
40232 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
40233 ArrayRef<SDValue> Ops) {
40234 return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
40235 };
40236
40237 // Take care of the case when one of the operands is a constant vector whose
40238 // element is in the range [1, 256].
40239 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
40240 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
40241 Operands[0].getOperand(0).getValueType() == VT) {
40242 // The pattern is detected. Subtract one from the constant vector, then
40243 // demote it and emit X86ISD::AVG instruction.
40244 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
40245 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
40246 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
40247 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
40248 { Operands[0].getOperand(0), Operands[1] },
40249 AVGBuilder);
40250 }
40251
40252 // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
40253 // Match the or case only if it's 'add-like', i.e. it can be replaced by an add.
40254 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
40255 if (ISD::ADD == V.getOpcode()) {
40256 Op0 = V.getOperand(0);
40257 Op1 = V.getOperand(1);
40258 return true;
40259 }
40260 if (ISD::ZERO_EXTEND != V.getOpcode())
40261 return false;
40262 V = V.getOperand(0);
40263 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
40264 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
40265 return false;
40266 Op0 = V.getOperand(0);
40267 Op1 = V.getOperand(1);
40268 return true;
40269 };
40270
40271 SDValue Op0, Op1;
40272 if (FindAddLike(Operands[0], Op0, Op1))
40273 std::swap(Operands[0], Operands[1]);
40274 else if (!FindAddLike(Operands[1], Op0, Op1))
40275 return SDValue();
40276 Operands[2] = Op0;
40277 Operands[1] = Op1;
40278
40279 // Now we have three operands of two additions. Check that one of them is a
40280 // constant vector with ones, and the other two can be promoted from i8/i16.
40281 for (int i = 0; i < 3; ++i) {
40282 if (!IsConstVectorInRange(Operands[i], 1, 1))
40283 continue;
40284 std::swap(Operands[i], Operands[2]);
40285
40286 // Check if Operands[0] and Operands[1] are results of type promotion.
40287 for (int j = 0; j < 2; ++j)
40288 if (Operands[j].getValueType() != VT) {
40289 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
40290 Operands[j].getOperand(0).getValueType() != VT)
40291 return SDValue();
40292 Operands[j] = Operands[j].getOperand(0);
40293 }
40294
40295 // The pattern is detected, emit X86ISD::AVG instruction(s).
40296 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]},
40297 AVGBuilder);
40298 }
40299
40300 return SDValue();
40301}
40302
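// A minimal scalar sketch of the per-element computation detectAVGPattern
// recognizes for unsigned i8 (the helper name is illustrative only and not
// part of this file): widening before the add keeps (a + b + 1) from
// overflowing, and the truncated (a + b + 1) >> 1 is what PAVGB computes.
static unsigned char illustrativeAvgU8(unsigned char A, unsigned char B) {
  unsigned Wide = (static_cast<unsigned>(A) + B + 1) >> 1; // zext, add, add 1, lshr 1
  return static_cast<unsigned char>(Wide);                 // trunc; fits: result <= 255
}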
40303static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
40304 TargetLowering::DAGCombinerInfo &DCI,
40305 const X86Subtarget &Subtarget) {
40306 LoadSDNode *Ld = cast<LoadSDNode>(N);
40307 EVT RegVT = Ld->getValueType(0);
40308 EVT MemVT = Ld->getMemoryVT();
40309 SDLoc dl(Ld);
40310 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40311
40312 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
40313 // into two 16-byte operations. Also split non-temporal aligned loads on
40314 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
40315 ISD::LoadExtType Ext = Ld->getExtensionType();
40316 bool Fast;
40317 unsigned Alignment = Ld->getAlignment();
40318 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
40319 Ext == ISD::NON_EXTLOAD &&
40320 ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
40321 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
40322 *Ld->getMemOperand(), &Fast) &&
40323 !Fast))) {
40324 unsigned NumElems = RegVT.getVectorNumElements();
40325 if (NumElems < 2)
40326 return SDValue();
40327
40328 unsigned HalfAlign = 16;
40329 SDValue Ptr1 = Ld->getBasePtr();
40330 SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl);
40331 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
40332 NumElems / 2);
40333 SDValue Load1 =
40334 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
40335 Alignment, Ld->getMemOperand()->getFlags());
40336 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
40337 Ld->getPointerInfo().getWithOffset(HalfAlign),
40338 MinAlign(Alignment, HalfAlign),
40339 Ld->getMemOperand()->getFlags());
40340 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
40341 Load1.getValue(1), Load2.getValue(1));
40342
40343 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
40344 return DCI.CombineTo(N, NewVec, TF, true);
40345 }
40346
40347 // Bool vector load - attempt to cast to an integer, as we have good
40348 // (vXiY *ext(vXi1 bitcast(iX))) handling.
40349 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
40350 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
40351 unsigned NumElts = RegVT.getVectorNumElements();
40352 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
40353 if (TLI.isTypeLegal(IntVT)) {
40354 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
40355 Ld->getPointerInfo(), Alignment,
40356 Ld->getMemOperand()->getFlags());
40357 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
40358 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
40359 }
40360 }
40361
40362 return SDValue();
40363}
40364
40365/// If V is a build vector of boolean constants and exactly one of those
40366/// constants is true, return the operand index of that true element.
40367/// Otherwise, return -1.
40368static int getOneTrueElt(SDValue V) {
40369 // This needs to be a build vector of booleans.
40370 // TODO: Checking for the i1 type matches the IR definition for the mask,
40371 // but the mask check could be loosened to i8 or other types. That might
40372 // also require checking more than 'allOnesValue'; eg, the x86 HW
40373 // instructions only require that the MSB is set for each mask element.
40374 // The ISD::MSTORE comments/definition do not specify how the mask operand
40375 // is formatted.
40376 auto *BV = dyn_cast<BuildVectorSDNode>(V);
40377 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
40378 return -1;
40379
40380 int TrueIndex = -1;
40381 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
40382 for (unsigned i = 0; i < NumElts; ++i) {
40383 const SDValue &Op = BV->getOperand(i);
40384 if (Op.isUndef())
40385 continue;
40386 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
40387 if (!ConstNode)
40388 return -1;
40389 if (ConstNode->getAPIntValue().isAllOnesValue()) {
40390 // If we already found a one, this is too many.
40391 if (TrueIndex >= 0)
40392 return -1;
40393 TrueIndex = i;
40394 }
40395 }
40396 return TrueIndex;
40397}
40398
40399/// Given a masked memory load/store operation, return true if it has one mask
40400/// bit set. If it has one mask bit set, then also return the memory address of
40401/// the scalar element to load/store, the vector index to insert/extract that
40402/// scalar element, and the alignment for the scalar memory access.
40403static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
40404 SelectionDAG &DAG, SDValue &Addr,
40405 SDValue &Index, unsigned &Alignment) {
40406 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
40407 if (TrueMaskElt < 0)
40408 return false;
40409
40410 // Get the address of the one scalar element that is specified by the mask
40411 // using the appropriate offset from the base pointer.
40412 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
40413 Addr = MaskedOp->getBasePtr();
40414 if (TrueMaskElt != 0) {
40415 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
40416 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
40417 }
40418
40419 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
40420 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
40421 return true;
40422}
40423
40424/// If exactly one element of the mask is set for a non-extending masked load,
40425/// it is a scalar load and vector insert.
40426/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
40427/// mask have already been optimized in IR, so we don't bother with those here.
40428static SDValue
40429reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
40430 TargetLowering::DAGCombinerInfo &DCI) {
40431 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
40432 // However, some target hooks may need to be added to know when the transform
40433 // is profitable. Endianness would also have to be considered.
40434
40435 SDValue Addr, VecIndex;
40436 unsigned Alignment;
40437 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
40438 return SDValue();
40439
40440 // Load the one scalar element that is specified by the mask using the
40441 // appropriate offset from the base pointer.
40442 SDLoc DL(ML);
40443 EVT VT = ML->getValueType(0);
40444 EVT EltVT = VT.getVectorElementType();
40445 SDValue Load =
40446 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
40447 Alignment, ML->getMemOperand()->getFlags());
40448
40449 // Insert the loaded element into the appropriate place in the vector.
40450 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
40451 ML->getPassThru(), Load, VecIndex);
40452 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
40453}
40454
40455static SDValue
40456combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
40457 TargetLowering::DAGCombinerInfo &DCI) {
40458 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
40459 return SDValue();
40460
40461 SDLoc DL(ML);
40462 EVT VT = ML->getValueType(0);
40463
40464 // If we are loading the first and last elements of a vector, it is safe and
40465 // always faster to load the whole vector. Replace the masked load with a
40466 // vector load and select.
40467 unsigned NumElts = VT.getVectorNumElements();
40468 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
40469 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
40470 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
40471 if (LoadFirstElt && LoadLastElt) {
40472 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
40473 ML->getMemOperand());
40474 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
40475 ML->getPassThru());
40476 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
40477 }
40478
40479 // Convert a masked load with a constant mask into a masked load and a select.
40480 // This allows the select operation to use a faster kind of select instruction
40481 // (for example, vblendvps -> vblendps).
40482
40483 // Don't try this if the pass-through operand is already undefined. That would
40484 // cause an infinite loop because that's what we're about to create.
40485 if (ML->getPassThru().isUndef())
40486 return SDValue();
40487
40488 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
40489 return SDValue();
40490
40491 // The new masked load has an undef pass-through operand. The select uses the
40492 // original pass-through operand.
40493 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
40494 ML->getMask(), DAG.getUNDEF(VT),
40495 ML->getMemoryVT(), ML->getMemOperand(),
40496 ML->getExtensionType());
40497 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
40498 ML->getPassThru());
40499
40500 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
40501}
40502
40503static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
40504 TargetLowering::DAGCombinerInfo &DCI,
40505 const X86Subtarget &Subtarget) {
40506 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
40507
40508 // TODO: Expanding load with constant mask may be optimized as well.
40509 if (Mld->isExpandingLoad())
40510 return SDValue();
40511
40512 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
40513 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
40514 return ScalarLoad;
40515 // TODO: Do some AVX512 subsets benefit from this transform?
40516 if (!Subtarget.hasAVX512())
40517 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
40518 return Blend;
40519 }
40520
40521 return SDValue();
40522}
40523
40524/// If exactly one element of the mask is set for a non-truncating masked store,
40525/// it is a vector extract and scalar store.
40526/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
40527/// mask have already been optimized in IR, so we don't bother with those here.
40528static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
40529 SelectionDAG &DAG) {
40530 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
40531 // However, some target hooks may need to be added to know when the transform
40532 // is profitable. Endianness would also have to be considered.
40533
40534 SDValue Addr, VecIndex;
40535 unsigned Alignment;
40536 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
40537 return SDValue();
40538
40539 // Extract the one scalar element that is actually being stored.
40540 SDLoc DL(MS);
40541 EVT VT = MS->getValue().getValueType();
40542 EVT EltVT = VT.getVectorElementType();
40543 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
40544 MS->getValue(), VecIndex);
40545
40546 // Store that element at the appropriate offset from the base pointer.
40547 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
40548 Alignment, MS->getMemOperand()->getFlags());
40549}
40550
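// A minimal C-level sketch of what reduceMaskedStoreToScalarStore achieves for
// a 4-element masked store whose mask has exactly one known set bit (names are
// illustrative only and not part of this file): the masked vector store is
// replaced by one element extract and one scalar store at the right offset.
static void illustrativeOneLaneStore(int *Base, const int Vec[4], unsigned TrueIdx) {
  Base[TrueIdx] = Vec[TrueIdx]; // extract the lane, store it at Base + TrueIdx
}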
40551static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
40552 TargetLowering::DAGCombinerInfo &DCI,
40553 const X86Subtarget &Subtarget) {
40554 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
40555 if (Mst->isCompressingStore())
40556 return SDValue();
40557
40558 EVT VT = Mst->getValue().getValueType();
40559 SDLoc dl(Mst);
40560 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40561
40562 if (Mst->isTruncatingStore())
40563 return SDValue();
40564
40565 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
40566 return ScalarStore;
40567
40568 // If the mask value has been legalized to a non-boolean vector, try to
40569 // simplify ops leading up to it. We only demand the MSB of each lane.
40570 SDValue Mask = Mst->getMask();
40571 if (Mask.getScalarValueSizeInBits() != 1) {
40572 APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits()));
40573 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
40574 return SDValue(N, 0);
40575 }
40576
40577 SDValue Value = Mst->getValue();
40578 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
40579 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
40580 Mst->getMemoryVT())) {
40581 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
40582 Mst->getBasePtr(), Mask,
40583 Mst->getMemoryVT(), Mst->getMemOperand(), true);
40584 }
40585
40586 return SDValue();
40587}
40588
40589static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
40590 TargetLowering::DAGCombinerInfo &DCI,
40591 const X86Subtarget &Subtarget) {
40592 StoreSDNode *St = cast<StoreSDNode>(N);
40593 EVT StVT = St->getMemoryVT();
40594 SDLoc dl(St);
40595 unsigned Alignment = St->getAlignment();
40596 SDValue StoredVal = St->getValue();
40597 EVT VT = StoredVal.getValueType();
40598 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40599
40600 // Convert a store of vXi1 into a store of iX and a bitcast.
40601 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
40602 VT.getVectorElementType() == MVT::i1) {
40603
40604 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
40605 StoredVal = DAG.getBitcast(NewVT, StoredVal);
40606
40607 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
40608 St->getPointerInfo(), St->getAlignment(),
40609 St->getMemOperand()->getFlags());
40610 }
40611
40612 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
40613 // This will avoid a copy to k-register.
40614 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
40615 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40616 StoredVal.getOperand(0).getValueType() == MVT::i8) {
40617 return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
40618 St->getBasePtr(), St->getPointerInfo(),
40619 St->getAlignment(), St->getMemOperand()->getFlags());
40620 }
40621
40622 // Widen v2i1/v4i1 stores to v8i1.
40623 if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
40624 Subtarget.hasAVX512()) {
40625 unsigned NumConcats = 8 / VT.getVectorNumElements();
40626 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
40627 Ops[0] = StoredVal;
40628 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
40629 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
40630 St->getPointerInfo(), St->getAlignment(),
40631 St->getMemOperand()->getFlags());
40632 }
40633
40634 // Turn vXi1 stores of constants into a scalar store.
40635 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
40636 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
40637 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
40638 // If it's a v64i1 store without 64-bit support, we need two stores.
40639 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
40640 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
40641 StoredVal->ops().slice(0, 32));
40642 Lo = combinevXi1ConstantToInteger(Lo, DAG);
40643 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
40644 StoredVal->ops().slice(32, 32));
40645 Hi = combinevXi1ConstantToInteger(Hi, DAG);
40646
40647 SDValue Ptr0 = St->getBasePtr();
40648 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
40649
40650 SDValue Ch0 =
40651 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
40652 Alignment, St->getMemOperand()->getFlags());
40653 SDValue Ch1 =
40654 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
40655 St->getPointerInfo().getWithOffset(4),
40656 MinAlign(Alignment, 4U),
40657 St->getMemOperand()->getFlags());
40658 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
40659 }
40660
40661 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
40662 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
40663 St->getPointerInfo(), St->getAlignment(),
40664 St->getMemOperand()->getFlags());
40665 }
40666
40667 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
40668 // Sandy Bridge, perform two 16-byte stores.
40669 bool Fast;
40670 if (VT.is256BitVector() && StVT == VT &&
40671 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
40672 *St->getMemOperand(), &Fast) &&
40673 !Fast) {
40674 unsigned NumElems = VT.getVectorNumElements();
40675 if (NumElems < 2)
40676 return SDValue();
40677
40678 return splitVectorStore(St, DAG);
40679 }
40680
40681 // Split under-aligned vector non-temporal stores.
40682 if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) {
40683 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
40684 // vectors or the legalizer can scalarize it to use MOVNTI.
40685 if (VT.is256BitVector() || VT.is512BitVector()) {
40686 unsigned NumElems = VT.getVectorNumElements();
40687 if (NumElems < 2)
40688 return SDValue();
40689 return splitVectorStore(St, DAG);
40690 }
40691
40692 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
40693 // to use MOVNTI.
40694 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
40695 MVT NTVT = Subtarget.hasSSE4A()
40696 ? MVT::v2f64
40697 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
40698 return scalarizeVectorStore(St, NTVT, DAG);
40699 }
40700 }
40701
40702 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
40703 // supported, but avx512f is, by extending to v16i32 and truncating.
40704 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
40705 St->getValue().getOpcode() == ISD::TRUNCATE &&
40706 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
40707 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
40708 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
40709 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
40710 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
40711 MVT::v16i8, St->getMemOperand());
40712 }
40713
40714 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
40715 if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
40716 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
40717 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
40718 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
40719 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
40720 return EmitTruncSStore(IsSigned, St->getChain(),
40721 dl, StoredVal.getOperand(0), St->getBasePtr(),
40722 VT, St->getMemOperand(), DAG);
40723 }
40724
40725 // Optimize trunc store (of multiple scalars) to shuffle and store.
40726 // First, pack all of the elements in one place. Next, store to memory
40727 // in fewer chunks.
40728 if (St->isTruncatingStore() && VT.isVector()) {
40729 // Check if we can detect an AVG pattern from the truncation. If yes,
40730 // replace the trunc store by a normal store with the result of X86ISD::AVG
40731 // instruction.
40732 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
40733 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
40734 Subtarget, dl))
40735 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
40736 St->getPointerInfo(), St->getAlignment(),
40737 St->getMemOperand()->getFlags());
40738
40739 if (TLI.isTruncStoreLegal(VT, StVT)) {
40740 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
40741 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
40742 dl, Val, St->getBasePtr(),
40743 St->getMemoryVT(), St->getMemOperand(), DAG);
40744 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
40745 DAG, dl))
40746 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
40747 dl, Val, St->getBasePtr(),
40748 St->getMemoryVT(), St->getMemOperand(), DAG);
40749 }
40750
40751 return SDValue();
40752 }
40753
40754 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
40755 // the FP state in cases where an emms may be missing.
40756 // A preferable solution to the general problem is to figure out the right
40757 // places to insert EMMS. This qualifies as a quick hack.
40758
40759 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
40760 if (VT.getSizeInBits() != 64)
40761 return SDValue();
40762
40763 const Function &F = DAG.getMachineFunction().getFunction();
40764 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
40765 bool F64IsLegal =
40766 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
40767 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
40768 isa<LoadSDNode>(St->getValue()) &&
40769 cast<LoadSDNode>(St->getValue())->isSimple() &&
40770 St->getChain().hasOneUse() && St->isSimple()) {
40771 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
40772
40773 if (!ISD::isNormalLoad(Ld))
40774 return SDValue();
40775
40776 // Avoid the transformation if there are multiple uses of the loaded value.
40777 if (!Ld->hasNUsesOfValue(1, 0))
40778 return SDValue();
40779
40780 SDLoc LdDL(Ld);
40781 SDLoc StDL(N);
40782 // Lower to a single movq load/store pair.
40783 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
40784 Ld->getBasePtr(), Ld->getMemOperand());
40785
40786 // Make sure new load is placed in same chain order.
40787 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
40788 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
40789 St->getMemOperand());
40790 }
40791
40792 // This is similar to the above case, but here we handle a scalar 64-bit
40793 // integer store that is extracted from a vector on a 32-bit target.
40794 // If we have SSE2, then we can treat it like a floating-point double
40795 // to get past legalization. The execution dependencies fixup pass will
40796 // choose the optimal machine instruction for the store if this really is
40797 // an integer or v2f32 rather than an f64.
40798 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
40799 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
40800 SDValue OldExtract = St->getOperand(1);
40801 SDValue ExtOp0 = OldExtract.getOperand(0);
40802 unsigned VecSize = ExtOp0.getValueSizeInBits();
40803 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
40804 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
40805 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
40806 BitCast, OldExtract.getOperand(1));
40807 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
40808 St->getPointerInfo(), St->getAlignment(),
40809 St->getMemOperand()->getFlags());
40810 }
40811
40812 return SDValue();
40813}
40814
40815/// Return 'true' if this vector operation is "horizontal"
40816/// and return the operands for the horizontal operation in LHS and RHS. A
40817/// horizontal operation performs the binary operation on successive elements
40818/// of its first operand, then on successive elements of its second operand,
40819/// returning the resulting values in a vector. For example, if
40820/// A = < float a0, float a1, float a2, float a3 >
40821/// and
40822/// B = < float b0, float b1, float b2, float b3 >
40823/// then the result of doing a horizontal operation on A and B is
40824/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
40825/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
40826/// A horizontal-op B, for some already available A and B, and if so then LHS is
40827/// set to A, RHS to B, and the routine returns 'true'.
40828static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
40829 const X86Subtarget &Subtarget,
40830 bool IsCommutative) {
40831 // If either operand is undef, bail out. The binop should be simplified.
40832 if (LHS.isUndef() || RHS.isUndef())
40833 return false;
40834
40835 // Look for the following pattern:
40836 // A = < float a0, float a1, float a2, float a3 >
40837 // B = < float b0, float b1, float b2, float b3 >
40838 // and
40839 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
40840 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
40841 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
40842 // which is A horizontal-op B.
40843
40844 MVT VT = LHS.getSimpleValueType();
40845 assert((VT.is128BitVector() || VT.is256BitVector()) &&
40846 "Unsupported vector type for horizontal add/sub");
40847 unsigned NumElts = VT.getVectorNumElements();
40848
40849 // TODO - can we make a general helper method that does all of this for us?
40850 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
40851 SmallVectorImpl<int> &ShuffleMask) {
40852 if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
40853 if (!Op.getOperand(0).isUndef())
40854 N0 = Op.getOperand(0);
40855 if (!Op.getOperand(1).isUndef())
40856 N1 = Op.getOperand(1);
40857 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
40858 ShuffleMask.append(Mask.begin(), Mask.end());
40859 return;
40860 }
40861 bool UseSubVector = false;
40862 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40863 Op.getOperand(0).getValueType().is256BitVector() &&
40864 llvm::isNullConstant(Op.getOperand(1))) {
40865 Op = Op.getOperand(0);
40866 UseSubVector = true;
40867 }
40868 bool IsUnary;
40869 SmallVector<SDValue, 2> SrcOps;
40870 SmallVector<int, 16> SrcShuffleMask;
40871 SDValue BC = peekThroughBitcasts(Op);
40872 if (isTargetShuffle(BC.getOpcode()) &&
40873 getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
40874 SrcOps, SrcShuffleMask, IsUnary)) {
40875 if (!UseSubVector && SrcShuffleMask.size() == NumElts &&
40876 SrcOps.size() <= 2) {
40877 N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
40878 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
40879 ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
40880 }
40881 if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) &&
40882 SrcOps.size() == 1) {
40883 N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op));
40884 N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op));
40885 ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts);
40886 ShuffleMask.append(Mask.begin(), Mask.end());
40887 }
40888 }
40889 };
40890
40891 // View LHS in the form
40892 // LHS = VECTOR_SHUFFLE A, B, LMask
40893 // If LHS is not a shuffle, then pretend it is the identity shuffle:
40894 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
40895 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
40896 SDValue A, B;
40897 SmallVector<int, 16> LMask;
40898 GetShuffle(LHS, A, B, LMask);
40899
40900 // Likewise, view RHS in the form
40901 // RHS = VECTOR_SHUFFLE C, D, RMask
40902 SDValue C, D;
40903 SmallVector<int, 16> RMask;
40904 GetShuffle(RHS, C, D, RMask);
40905
40906 // At least one of the operands should be a vector shuffle.
40907 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
40908 if (NumShuffles == 0)
40909 return false;
40910
40911 if (LMask.empty()) {
40912 A = LHS;
40913 for (unsigned i = 0; i != NumElts; ++i)
40914 LMask.push_back(i);
40915 }
40916
40917 if (RMask.empty()) {
40918 C = RHS;
40919 for (unsigned i = 0; i != NumElts; ++i)
40920 RMask.push_back(i);
40921 }
40922
40923 // If A and B occur in reverse order in RHS, then canonicalize by commuting
40924 // RHS operands and shuffle mask.
40925 if (A != C) {
40926 std::swap(C, D);
40927 ShuffleVectorSDNode::commuteMask(RMask);
40928 }
40929 // Check that the shuffles are both shuffling the same vectors.
40930 if (!(A == C && B == D))
40931 return false;
40932
40933 // LHS and RHS are now:
40934 // LHS = shuffle A, B, LMask
40935 // RHS = shuffle A, B, RMask
40936 // Check that the masks correspond to performing a horizontal operation.
40937 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
40938 // so we just repeat the inner loop if this is a 256-bit op.
40939 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
40940 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
40941 assert((NumEltsPer128BitChunk % 2 == 0) &&
40942 "Vector type should have an even number of elements in each lane");
40943 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
40944 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
40945 // Ignore undefined components.
40946 int LIdx = LMask[i + j], RIdx = RMask[i + j];
40947 if (LIdx < 0 || RIdx < 0 ||
40948 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
40949 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
40950 continue;
40951
40952 // The low half of the 128-bit result must choose from A.
40953 // The high half of the 128-bit result must choose from B,
40954 // unless B is undef. In that case, we are always choosing from A.
40955 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
40956 unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
40957
40958 // Check that successive elements are being operated on. If not, this is
40959 // not a horizontal operation.
40960 int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
40961 if (!(LIdx == Index && RIdx == Index + 1) &&
40962 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
40963 return false;
40964 }
40965 }
40966
40967 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
40968 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
40969
40970 if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))
40971 return false;
40972
40973 LHS = DAG.getBitcast(VT, LHS);
40974 RHS = DAG.getBitcast(VT, RHS);
40975 return true;
40976}
40977
40978/// Do target-specific dag combines on floating-point adds/subs.
40979static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
40980 const X86Subtarget &Subtarget) {
40981 EVT VT = N->getValueType(0);
40982 SDValue LHS = N->getOperand(0);
40983 SDValue RHS = N->getOperand(1);
40984 bool IsFadd = N->getOpcode() == ISD::FADD;
40985 auto HorizOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
40986 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
40987
40988 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
40989 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
40990 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
40991 isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd))
40992 return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
40993
40994 return SDValue();
40995}
40996
40997/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
40998/// the codegen.
40999/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
41000/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
41001/// anything that is guaranteed to be transformed by DAGCombiner.
41002static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
41003 const X86Subtarget &Subtarget,
41004 const SDLoc &DL) {
41005 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
41006 SDValue Src = N->getOperand(0);
41007 unsigned SrcOpcode = Src.getOpcode();
41008 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41009
41010 EVT VT = N->getValueType(0);
41011 EVT SrcVT = Src.getValueType();
41012
41013 auto IsFreeTruncation = [VT](SDValue Op) {
41014 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
41015
41016 // See if this has been extended from a smaller/equal size to
41017 // the truncation size, allowing a truncation to combine with the extend.
41018 unsigned Opcode = Op.getOpcode();
41019 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
41020 Opcode == ISD::ZERO_EXTEND) &&
41021 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
41022 return true;
41023
41024 // See if this is a single use constant which can be constant folded.
41025 // NOTE: We don't peek through bitcasts here because there is currently
41026 // no support for constant folding truncate+bitcast+vector_of_constants. So
41027 // we'll just end up with a truncate on both operands, which will
41028 // get turned back into (truncate (binop)) causing an infinite loop.
41029 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
41030 };
41031
41032 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
41033 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
41034 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
41035 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
41036 };
41037
41038 // Don't combine if the operation has other uses.
41039 if (!Src.hasOneUse())
41040 return SDValue();
41041
41042 // Only support vector truncation for now.
41043 // TODO: i64 scalar math would benefit as well.
41044 if (!VT.isVector())
41045 return SDValue();
41046
41047 // In most cases it's only worth pre-truncating if we're facing the cost
41048 // of a single truncation.
41049 // i.e. if one of the inputs will constant fold or the input is repeated.
41050 switch (SrcOpcode) {
41051 case ISD::AND:
41052 case ISD::XOR:
41053 case ISD::OR: {
41054 SDValue Op0 = Src.getOperand(0);
41055 SDValue Op1 = Src.getOperand(1);
41056 if (TLI.isOperationLegalOrPromote(SrcOpcode, VT) &&
41057 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
41058 return TruncateArithmetic(Op0, Op1);
41059 break;
41060 }
41061
41062 case ISD::MUL:
41063 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
41064 // better to truncate if we have the chance.
41065 if (SrcVT.getScalarType() == MVT::i64 &&
41066 TLI.isOperationLegal(SrcOpcode, VT) &&
41067 !TLI.isOperationLegal(SrcOpcode, SrcVT))
41068 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
41069 LLVM_FALLTHROUGH;
41070 case ISD::ADD: {
41071 SDValue Op0 = Src.getOperand(0);
41072 SDValue Op1 = Src.getOperand(1);
41073 if (TLI.isOperationLegal(SrcOpcode, VT) &&
41074 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
41075 return TruncateArithmetic(Op0, Op1);
41076 break;
41077 }
41078 case ISD::SUB: {
41079 // TODO: ISD::SUB - we are conservative and require both sides to be freely
41080 // truncatable to avoid interfering with combineSubToSubus.
41081 SDValue Op0 = Src.getOperand(0);
41082 SDValue Op1 = Src.getOperand(1);
41083 if (TLI.isOperationLegal(SrcOpcode, VT) &&
41084 (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
41085 return TruncateArithmetic(Op0, Op1);
41086 break;
41087 }
41088 }
41089
41090 return SDValue();
41091}
41092
41093/// Truncate using ISD::AND mask and X86ISD::PACKUS.
41094/// e.g. trunc <8 x i32> X to <8 x i16> -->
41095/// MaskX = X & 0xffff (clear high bits to prevent saturation)
41096/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
41097static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
41098 const X86Subtarget &Subtarget,
41099 SelectionDAG &DAG) {
41100 SDValue In = N->getOperand(0);
41101 EVT InVT = In.getValueType();
41102 EVT OutVT = N->getValueType(0);
41103
41104 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
41105 OutVT.getScalarSizeInBits());
41106 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
41107 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
41108}
41109
41110/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
41111static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
41112 const X86Subtarget &Subtarget,
41113 SelectionDAG &DAG) {
41114 SDValue In = N->getOperand(0);
41115 EVT InVT = In.getValueType();
41116 EVT OutVT = N->getValueType(0);
41117 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
41118 DAG.getValueType(OutVT));
41119 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
41120}
41121
41122/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
41123/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
41124/// legalization the truncation will be translated into a BUILD_VECTOR with each
41125/// element that is extracted from a vector and then truncated, and it is
41126/// difficult to do this optimization based on them.
41127static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
41128 const X86Subtarget &Subtarget) {
41129 EVT OutVT = N->getValueType(0);
41130 if (!OutVT.isVector())
41131 return SDValue();
41132
41133 SDValue In = N->getOperand(0);
41134 if (!In.getValueType().isSimple())
41135 return SDValue();
41136
41137 EVT InVT = In.getValueType();
41138 unsigned NumElems = OutVT.getVectorNumElements();
41139
41140 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
41141 // SSE2, and we need to take care of it specially.
41142 // AVX512 provides vpmovdb.
41143 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
41144 return SDValue();
41145
41146 EVT OutSVT = OutVT.getVectorElementType();
41147 EVT InSVT = InVT.getVectorElementType();
41148 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
41149 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
41150 NumElems >= 8))
41151 return SDValue();
41152
41153 // SSSE3's pshufb results in fewer instructions in the cases below.
41154 if (Subtarget.hasSSSE3() && NumElems == 8 &&
41155 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
41156 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
41157 return SDValue();
41158
41159 SDLoc DL(N);
41160 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
41161 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
41162 // truncate 2 x v4i32 to v8i16.
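// For example (illustrative, assuming a v8i32 -> v8i16 truncate): with SSE4.1
// this takes the PACKUS path below (mask off the high bits, then PACKUSDW);
// with plain SSE2 it takes the PACKSS path (sign_extend_inreg, then PACKSSDW).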
41163 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
41164 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
41165 if (InSVT == MVT::i32)
41166 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
41167
41168 return SDValue();
41169}
41170
41171/// This function transforms vector truncation of 'extended sign-bits' or
41172/// 'extended zero-bits' values.
41173/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
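/// For example (illustrative): a v8i32 comparison result is all sign bits, so
/// truncating it to v8i16 can use a single PACKSSDW; a v8i32 value that was
/// zero-extended from v8i16 can use PACKUSDW on SSE4.1 targets.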
41174static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
41175 SelectionDAG &DAG,
41176 const X86Subtarget &Subtarget) {
41177 // Requires SSE2.
41178 if (!Subtarget.hasSSE2())
41179 return SDValue();
41180
41181 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
41182 return SDValue();
41183
41184 SDValue In = N->getOperand(0);
41185 if (!In.getValueType().isSimple())
41186 return SDValue();
41187
41188 MVT VT = N->getValueType(0).getSimpleVT();
41189 MVT SVT = VT.getScalarType();
41190
41191 MVT InVT = In.getValueType().getSimpleVT();
41192 MVT InSVT = InVT.getScalarType();
41193
41194 // Check we have a truncation suited for PACKSS/PACKUS.
41195 if (!VT.is128BitVector() && !VT.is256BitVector())
41196 return SDValue();
41197 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
41198 return SDValue();
41199 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
41200 return SDValue();
41201
41202 // AVX512 has fast truncate, but if the input is already going to be split,
41203 // there's no harm in trying pack.
41204 if (Subtarget.hasAVX512() &&
41205 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
41206 InVT.is512BitVector()))
41207 return SDValue();
41208
41209 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
41210 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
41211
41212 // Use PACKUS if the input has zero-bits that extend all the way to the
41213 // packed/truncated value. e.g. masks, zext_in_reg, etc.
41214 KnownBits Known = DAG.computeKnownBits(In);
41215 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
41216 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
41217 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
41218
41219 // Use PACKSS if the input has sign-bits that extend all the way to the
41220 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
41221 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
41222 if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
41223 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
41224
41225 return SDValue();
41226}
41227
41228// Try to form a MULHU or MULHS node by looking for
41229// (trunc (srl (mul ext, ext), 16))
41230// TODO: This is X86 specific because we want to be able to handle wide types
41231// before type legalization. But we can only do it if the vector will be
41232// legalized via widening/splitting. Type legalization can't handle promotion
41233// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
41234// combiner.
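// Illustrative example of the pattern being matched (assuming v8i16 inputs):
//   (v8i16 (trunc (v8i32 (srl (v8i32 (mul (zext v8i16 X), (zext v8i16 Y))),
//                              16))))
//     --> (v8i16 (mulhu X, Y))
// and the sign-extended form maps to mulhs in the same way.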
41235static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
41236 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
41237 // First instruction should be a right shift of a multiply.
41238 if (Src.getOpcode() != ISD::SRL ||
41239 Src.getOperand(0).getOpcode() != ISD::MUL)
41240 return SDValue();
41241
41242 if (!Subtarget.hasSSE2())
41243 return SDValue();
41244
41245 // Only handle vXi16 types that are at least 128-bits unless they will be
41246 // widened.
41247 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
41248 return SDValue();
41249
41250 // Input type should be vXi32.
41251 EVT InVT = Src.getValueType();
41252 if (InVT.getVectorElementType() != MVT::i32)
41253 return SDValue();
41254
41255 // Need a shift by 16.
41256 APInt ShiftAmt;
41257 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
41258 ShiftAmt != 16)
41259 return SDValue();
41260
41261 SDValue LHS = Src.getOperand(0).getOperand(0);
41262 SDValue RHS = Src.getOperand(0).getOperand(1);
41263
41264 unsigned ExtOpc = LHS.getOpcode();
41265 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
41266 RHS.getOpcode() != ExtOpc)
41267 return SDValue();
41268
41269 // Peek through the extends.
41270 LHS = LHS.getOperand(0);
41271 RHS = RHS.getOperand(0);
41272
41273 // Ensure the input types match.
41274 if (LHS.getValueType() != VT || RHS.getValueType() != VT)
41275 return SDValue();
41276
41277 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
41278 return DAG.getNode(Opc, DL, VT, LHS, RHS);
41279}
41280
41281// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
41282// from one vector with signed bytes from another vector, adds together
41283// adjacent pairs of 16-bit products, and saturates the result before
41284// truncating to 16-bits.
41285//
41286// Which looks something like this:
41287// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
41288// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
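// In scalar terms (illustrative), each 16-bit result lane i is
//   ssat16(zext(A[2*i]) * sext(B[2*i]) + zext(A[2*i+1]) * sext(B[2*i+1]))
// where A supplies the unsigned bytes and B the signed bytes.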
41289static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
41290 const X86Subtarget &Subtarget,
41291 const SDLoc &DL) {
41292 if (!VT.isVector() || !Subtarget.hasSSSE3())
41293 return SDValue();
41294
41295 unsigned NumElems = VT.getVectorNumElements();
41296 EVT ScalarVT = VT.getVectorElementType();
41297 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
41298 return SDValue();
41299
41300 SDValue SSatVal = detectSSatPattern(In, VT);
41301 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
41302 return SDValue();
41303
41304 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
41305 // of multiplies from even/odd elements.
41306 SDValue N0 = SSatVal.getOperand(0);
41307 SDValue N1 = SSatVal.getOperand(1);
41308
41309 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
41310 return SDValue();
41311
41312 SDValue N00 = N0.getOperand(0);
41313 SDValue N01 = N0.getOperand(1);
41314 SDValue N10 = N1.getOperand(0);
41315 SDValue N11 = N1.getOperand(1);
41316
41317 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
41318 // Canonicalize zero_extend to LHS.
41319 if (N01.getOpcode() == ISD::ZERO_EXTEND)
41320 std::swap(N00, N01);
41321 if (N11.getOpcode() == ISD::ZERO_EXTEND)
41322 std::swap(N10, N11);
41323
41324 // Ensure we have a zero_extend and a sign_extend.
41325 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
41326 N01.getOpcode() != ISD::SIGN_EXTEND ||
41327 N10.getOpcode() != ISD::ZERO_EXTEND ||
41328 N11.getOpcode() != ISD::SIGN_EXTEND)
41329 return SDValue();
41330
41331 // Peek through the extends.
41332 N00 = N00.getOperand(0);
41333 N01 = N01.getOperand(0);
41334 N10 = N10.getOperand(0);
41335 N11 = N11.getOperand(0);
41336
41337 // Ensure the extend is from vXi8.
41338 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
41339 N01.getValueType().getVectorElementType() != MVT::i8 ||
41340 N10.getValueType().getVectorElementType() != MVT::i8 ||
41341 N11.getValueType().getVectorElementType() != MVT::i8)
41342 return SDValue();
41343
41344 // All inputs should be build_vectors.
41345 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
41346 N01.getOpcode() != ISD::BUILD_VECTOR ||
41347 N10.getOpcode() != ISD::BUILD_VECTOR ||
41348 N11.getOpcode() != ISD::BUILD_VECTOR)
41349 return SDValue();
41350
41351 // N00/N10 are zero extended. N01/N11 are sign extended.
41352
41353 // For each element, we need to ensure we have an odd element from one vector
41354 // multiplied by the odd element of another vector and the even element from
41355 // one of the same vectors being multiplied by the even element from the
41356 // other vector. So we need to make sure that for each element i, this
41357 // operation is being performed:
41358 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
41359 SDValue ZExtIn, SExtIn;
41360 for (unsigned i = 0; i != NumElems; ++i) {
41361 SDValue N00Elt = N00.getOperand(i);
41362 SDValue N01Elt = N01.getOperand(i);
41363 SDValue N10Elt = N10.getOperand(i);
41364 SDValue N11Elt = N11.getOperand(i);
41365 // TODO: Be more tolerant to undefs.
41366 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
41367 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
41368 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
41369 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
41370 return SDValue();
41371 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
41372 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
41373 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
41374 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
41375 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
41376 return SDValue();
41377 unsigned IdxN00 = ConstN00Elt->getZExtValue();
41378 unsigned IdxN01 = ConstN01Elt->getZExtValue();
41379 unsigned IdxN10 = ConstN10Elt->getZExtValue();
41380 unsigned IdxN11 = ConstN11Elt->getZExtValue();
41381 // Add is commutative so indices can be reordered.
41382 if (IdxN00 > IdxN10) {
41383 std::swap(IdxN00, IdxN10);
41384 std::swap(IdxN01, IdxN11);
41385 }
41386 // N0 indices must be the even element. N1 indices must be the next odd element.
41387 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
41388 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
41389 return SDValue();
41390 SDValue N00In = N00Elt.getOperand(0);
41391 SDValue N01In = N01Elt.getOperand(0);
41392 SDValue N10In = N10Elt.getOperand(0);
41393 SDValue N11In = N11Elt.getOperand(0);
41394 // The first time we find an input, capture it.
41395 if (!ZExtIn) {
41396 ZExtIn = N00In;
41397 SExtIn = N01In;
41398 }
41399 if (ZExtIn != N00In || SExtIn != N01In ||
41400 ZExtIn != N10In || SExtIn != N11In)
41401 return SDValue();
41402 }
41403
41404 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
41405 ArrayRef<SDValue> Ops) {
41406 // Shrink by adding truncate nodes and let DAGCombine fold with the
41407 // sources.
41408 EVT InVT = Ops[0].getValueType();
41409 assert(InVT.getScalarType() == MVT::i8 &&
41410 "Unexpected scalar element type");
41411 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
41412 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
41413 InVT.getVectorNumElements() / 2);
41414 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
41415 };
41416 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
41417 PMADDBuilder);
41418}
41419
41420static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
41421 const X86Subtarget &Subtarget) {
41422 EVT VT = N->getValueType(0);
41423 SDValue Src = N->getOperand(0);
41424 SDLoc DL(N);
41425
41426 // Attempt to pre-truncate inputs to arithmetic ops instead.
41427 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
41428 return V;
41429
41430 // Try to detect AVG pattern first.
41431 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
41432 return Avg;
41433
41434 // Try to detect PMADD
41435 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
41436 return PMAdd;
41437
41438 // Try to combine truncation with signed/unsigned saturation.
41439 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
41440 return Val;
41441
41442 // Try to combine PMULHUW/PMULHW for vXi16.
41443 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
41444 return V;
41445
41446 // The bitcast source is a direct mmx result.
41447 // Detect bitcasts between i32 and x86mmx.
41448 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
41449 SDValue BCSrc = Src.getOperand(0);
41450 if (BCSrc.getValueType() == MVT::x86mmx)
41451 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
41452 }
41453
41454 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
41455 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
41456 return V;
41457
41458 return combineVectorTruncation(N, DAG, Subtarget);
41459}
41460
41461static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) {
41462 EVT VT = N->getValueType(0);
41463 SDValue In = N->getOperand(0);
41464 SDLoc DL(N);
41465
41466 if (auto SSatVal = detectSSatPattern(In, VT))
41467 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
41468 if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
41469 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
41470
41471 return SDValue();
41472}
41473
41474/// Returns the negated value if the node \p N flips sign of FP value.
41475///
41476/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
41477/// or FSUB(0, x)
41478/// AVX512F does not have FXOR, so FNEG is lowered as
41479/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
41480 /// In this case we go through all bitcasts.
41481/// This also recognizes splat of a negated value and returns the splat of that
41482/// value.
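/// For example (illustrative): (f32 (bitcast (xor (bitcast x), 0x80000000)))
/// and (fsub -0.0, x) are both recognized here and return x.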
41483static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
41484 if (N->getOpcode() == ISD::FNEG)
41485 return N->getOperand(0);
41486
41487 // Don't recurse exponentially.
41488 if (Depth > SelectionDAG::MaxRecursionDepth)
41489 return SDValue();
41490
41491 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
41492
41493 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
41494 EVT VT = Op->getValueType(0);
41495
41496 // Make sure the element size doesn't change.
41497 if (VT.getScalarSizeInBits() != ScalarSize)
41498 return SDValue();
41499
41500 unsigned Opc = Op.getOpcode();
41501 switch (Opc) {
41502 case ISD::VECTOR_SHUFFLE: {
41503 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
41504 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
41505 if (!Op.getOperand(1).isUndef())
41506 return SDValue();
41507 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
41508 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
41509 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
41510 cast<ShuffleVectorSDNode>(Op)->getMask());
41511 break;
41512 }
41513 case ISD::INSERT_VECTOR_ELT: {
41514 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
41515 // -V, INDEX).
41516 SDValue InsVector = Op.getOperand(0);
41517 SDValue InsVal = Op.getOperand(1);
41518 if (!InsVector.isUndef())
41519 return SDValue();
41520 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
41521 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
41522 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
41523 NegInsVal, Op.getOperand(2));
41524 break;
41525 }
41526 case ISD::FSUB:
41527 case ISD::XOR:
41528 case X86ISD::FXOR: {
41529 SDValue Op1 = Op.getOperand(1);
41530 SDValue Op0 = Op.getOperand(0);
41531
41532 // For XOR and FXOR, we want to check if constant
41533 // bits of Op1 are sign bit masks. For FSUB, we
41534 // have to check if constant bits of Op0 are sign
41535 // bit masks and hence we swap the operands.
41536 if (Opc == ISD::FSUB)
41537 std::swap(Op0, Op1);
41538
41539 APInt UndefElts;
41540 SmallVector<APInt, 16> EltBits;
41541 // Extract constant bits and see if they are all
41542 // sign bit masks. Ignore the undef elements.
41543 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
41544 /* AllowWholeUndefs */ true,
41545 /* AllowPartialUndefs */ false)) {
41546 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
41547 if (!UndefElts[I] && !EltBits[I].isSignMask())
41548 return SDValue();
41549
41550 return peekThroughBitcasts(Op0);
41551 }
41552 }
41553 }
41554
41555 return SDValue();
41556}
41557
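// Illustrative behavior (not exhaustive): negateFMAOpcode(ISD::FMA,
// /*NegMul=*/true, /*NegAcc=*/false, /*NegRes=*/false) returns X86ISD::FNMADD,
// and negating only the result of an FNMADD returns X86ISD::FMSUB.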
41558static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
41559 bool NegRes) {
41560 if (NegMul) {
41561 switch (Opcode) {
41562 default: llvm_unreachable("Unexpected opcode");
41563 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
41564 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
41565 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
41566 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
41567 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
41568 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
41569 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
41570 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
41571 }
41572 }
41573
41574 if (NegAcc) {
41575 switch (Opcode) {
41576 default: llvm_unreachable("Unexpected opcode");
41577 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
41578 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
41579 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
41580 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
41581 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
41582 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
41583 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
41584 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
41585 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
41586 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
41587 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
41588 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
41589 }
41590 }
41591
41592 if (NegRes) {
41593 switch (Opcode) {
41594 default: llvm_unreachable("Unexpected opcode");
41595 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
41596 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
41597 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
41598 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
41599 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
41600 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
41601 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
41602 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
41603 }
41604 }
41605
41606 return Opcode;
41607}
41608
41609/// Do target-specific dag combines on floating point negations.
41610static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
41611 const X86Subtarget &Subtarget) {
41612 EVT OrigVT = N->getValueType(0);
41613 SDValue Arg = isFNEG(DAG, N);
41614 if (!Arg)
41615 return SDValue();
41616
41617 EVT VT = Arg.getValueType();
41618 EVT SVT = VT.getScalarType();
41619 SDLoc DL(N);
41620
41621 // Let legalize expand this if it isn't a legal type yet.
41622 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
41623 return SDValue();
41624
41625 // If we're negating a FMUL node on a target with FMA, then we can avoid the
41626 // use of a constant by performing (-0 - A*B) instead.
41627 // FIXME: Check rounding control flags as well once it becomes available.
41628 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
41629 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
41630 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
41631 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
41632 Arg.getOperand(1), Zero);
41633 return DAG.getBitcast(OrigVT, NewNode);
41634 }
41635
41636 // If we're negating an FMA node, then we can adjust the
41637 // instruction to include the extra negation.
41638 if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) {
41639 switch (Arg.getOpcode()) {
41640 case ISD::FMA:
41641 case X86ISD::FMSUB:
41642 case X86ISD::FNMADD:
41643 case X86ISD::FNMSUB:
41644 case X86ISD::FMADD_RND:
41645 case X86ISD::FMSUB_RND:
41646 case X86ISD::FNMADD_RND:
41647 case X86ISD::FNMSUB_RND: {
41648 // We can't handle a scalar intrinsic node here because it would only
41649 // invert one element and not the whole vector. But we could try to handle
41650 // a negation of the lower element only.
41651 unsigned NewOpcode = negateFMAOpcode(Arg.getOpcode(), false, false, true);
41652 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, Arg->ops()));
41653 }
41654 }
41655 }
41656
41657 return SDValue();
41658}
41659
41660char X86TargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG,
41661 bool LegalOperations,
41662 bool ForCodeSize,
41663 unsigned Depth) const {
41664 // fneg patterns are removable even if they have multiple uses.
41665 if (isFNEG(DAG, Op.getNode(), Depth))
41666 return 2;
41667
41668 // Don't recurse exponentially.
41669 if (Depth > SelectionDAG::MaxRecursionDepth)
41670 return 0;
41671
41672 EVT VT = Op.getValueType();
41673 EVT SVT = VT.getScalarType();
41674 switch (Op.getOpcode()) {
41675 case ISD::FMA:
41676 case X86ISD::FMSUB:
41677 case X86ISD::FNMADD:
41678 case X86ISD::FNMSUB:
41679 case X86ISD::FMADD_RND:
41680 case X86ISD::FMSUB_RND:
41681 case X86ISD::FNMADD_RND:
41682 case X86ISD::FNMSUB_RND: {
41683 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
41684 !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations)
41685 break;
41686
41687 // This is always negatible for free but we might be able to remove some
41688 // extra operand negations as well.
41689 for (int i = 0; i != 3; ++i) {
41690 char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations,
41691 ForCodeSize, Depth + 1);
41692 if (V == 2)
41693 return V;
41694 }
41695 return 1;
41696 }
41697 }
41698
41699 return TargetLowering::isNegatibleForFree(Op, DAG, LegalOperations,
41700 ForCodeSize, Depth);
41701}
41702
41703SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
41704 bool LegalOperations,
41705 bool ForCodeSize,
41706 unsigned Depth) const {
41707 // fneg patterns are removable even if they have multiple uses.
41708 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth))
41709 return DAG.getBitcast(Op.getValueType(), Arg);
41710
41711 EVT VT = Op.getValueType();
41712 EVT SVT = VT.getScalarType();
41713 unsigned Opc = Op.getOpcode();
41714 switch (Opc) {
41715 case ISD::FMA:
41716 case X86ISD::FMSUB:
41717 case X86ISD::FNMADD:
41718 case X86ISD::FNMSUB:
41719 case X86ISD::FMADD_RND:
41720 case X86ISD::FMSUB_RND:
41721 case X86ISD::FNMADD_RND:
41722 case X86ISD::FNMSUB_RND: {
41723 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
41724 !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations)
41725 break;
41726
41727 // This is always negatible for free but we might be able to remove some
41728 // extra operand negations as well.
41729 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
41730 for (int i = 0; i != 3; ++i) {
41731 char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations,
41732 ForCodeSize, Depth + 1);
41733 if (V == 2)
41734 NewOps[i] = getNegatedExpression(Op.getOperand(i), DAG, LegalOperations,
41735 ForCodeSize, Depth + 1);
41736 }
41737
41738 bool NegA = !!NewOps[0];
41739 bool NegB = !!NewOps[1];
41740 bool NegC = !!NewOps[2];
41741 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
41742
41743 // Fill in the non-negated ops with the original values.
41744 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
41745 if (!NewOps[i])
41746 NewOps[i] = Op.getOperand(i);
41747 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
41748 }
41749 }
41750
41751 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
41752 ForCodeSize, Depth);
41753}
41754
41755static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
41756 const X86Subtarget &Subtarget) {
41757 MVT VT = N->getSimpleValueType(0);
41758 // If we have integer vector types available, use the integer opcodes.
41759 if (!VT.isVector() || !Subtarget.hasSSE2())
41760 return SDValue();
41761
41762 SDLoc dl(N);
41763
41764 unsigned IntBits = VT.getScalarSizeInBits();
41765 MVT IntSVT = MVT::getIntegerVT(IntBits);
41766 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
41767
41768 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
41769 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
41770 unsigned IntOpcode;
41771 switch (N->getOpcode()) {
41772 default: llvm_unreachable("Unexpected FP logic op");
41773 case X86ISD::FOR: IntOpcode = ISD::OR; break;
41774 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
41775 case X86ISD::FAND: IntOpcode = ISD::AND; break;
41776 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
41777 }
41778 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
41779 return DAG.getBitcast(VT, IntOp);
41780}
41781
41782
41783/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
41784static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
41785 if (N->getOpcode() != ISD::XOR)
41786 return SDValue();
41787
41788 SDValue LHS = N->getOperand(0);
41789 auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
41790 if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
41791 return SDValue();
41792
41793 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
41794 X86::CondCode(LHS->getConstantOperandVal(0)));
41795 SDLoc DL(N);
41796 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
41797}
41798
41799static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
41800 TargetLowering::DAGCombinerInfo &DCI,
41801 const X86Subtarget &Subtarget) {
41802 // If this is SSE1 only convert to FXOR to avoid scalarization.
41803 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
41804 N->getValueType(0) == MVT::v4i32) {
41805 return DAG.getBitcast(
41806 MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
41807 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
41808 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
41809 }
41810
41811 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
41812 return Cmp;
41813
41814 if (DCI.isBeforeLegalizeOps())
41815 return SDValue();
41816
41817 if (SDValue SetCC = foldXor1SetCC(N, DAG))
41818 return SetCC;
41819
41820 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
41821 return RV;
41822
41823 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
41824 return FPLogic;
41825
41826 return combineFneg(N, DAG, Subtarget);
41827}
41828
41829static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
41830 TargetLowering::DAGCombinerInfo &DCI,
41831 const X86Subtarget &Subtarget) {
41832 SDValue Op0 = N->getOperand(0);
41833 SDValue Op1 = N->getOperand(1);
41834 EVT VT = N->getValueType(0);
41835 unsigned NumBits = VT.getSizeInBits();
41836
41837 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41838
41839 // TODO - Constant Folding.
41840 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
41841 // Reduce Cst1 to the bottom 16-bits.
41842 // NOTE: SimplifyDemandedBits won't do this for constants.
41843 const APInt &Val1 = Cst1->getAPIntValue();
41844 APInt MaskedVal1 = Val1 & 0xFFFF;
41845 if (MaskedVal1 != Val1)
41846 return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0,
41847 DAG.getConstant(MaskedVal1, SDLoc(N), VT));
41848 }
41849
41850 // Only bottom 16-bits of the control bits are required.
41851 APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16));
41852 if (TLI.SimplifyDemandedBits(Op1, DemandedMask, DCI))
41853 return SDValue(N, 0);
41854
41855 return SDValue();
41856}
41857
41858static bool isNullFPScalarOrVectorConst(SDValue V) {
41859 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
41860}
41861
41862/// If a value is a scalar FP zero or a vector FP zero (potentially including
41863/// undefined elements), return a zero constant that may be used to fold away
41864/// that value. In the case of a vector, the returned constant will not contain
41865/// undefined elements even if the input parameter does. This makes it suitable
41866/// to be used as a replacement operand with operations (eg, bitwise-and) where
41867/// an undef should not propagate.
41868static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
41869 const X86Subtarget &Subtarget) {
41870 if (!isNullFPScalarOrVectorConst(V))
41871 return SDValue();
41872
41873 if (V.getValueType().isVector())
41874 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
41875
41876 return V;
41877}
41878
41879static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
41880 const X86Subtarget &Subtarget) {
41881 SDValue N0 = N->getOperand(0);
41882 SDValue N1 = N->getOperand(1);
41883 EVT VT = N->getValueType(0);
41884 SDLoc DL(N);
41885
41886 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
41887 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
41888 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
41889 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
41890 return SDValue();
41891
41892 auto isAllOnesConstantFP = [](SDValue V) {
41893 if (V.getSimpleValueType().isVector())
41894 return ISD::isBuildVectorAllOnes(V.getNode());
41895 auto *C = dyn_cast<ConstantFPSDNode>(V);
41896 return C && C->getConstantFPValue()->isAllOnesValue();
41897 };
41898
41899 // fand (fxor X, -1), Y --> fandn X, Y
41900 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
41901 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
41902
41903 // fand X, (fxor Y, -1) --> fandn Y, X
41904 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
41905 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
41906
41907 return SDValue();
41908}
41909
41910/// Do target-specific dag combines on X86ISD::FAND nodes.
41911static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
41912 const X86Subtarget &Subtarget) {
41913 // FAND(0.0, x) -> 0.0
41914 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
41915 return V;
41916
41917 // FAND(x, 0.0) -> 0.0
41918 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
41919 return V;
41920
41921 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
41922 return V;
41923
41924 return lowerX86FPLogicOp(N, DAG, Subtarget);
41925}
41926
41927/// Do target-specific dag combines on X86ISD::FANDN nodes.
41928static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
41929 const X86Subtarget &Subtarget) {
41930 // FANDN(0.0, x) -> x
41931 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
41932 return N->getOperand(1);
41933
41934 // FANDN(x, 0.0) -> 0.0
41935 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
41936 return V;
41937
41938 return lowerX86FPLogicOp(N, DAG, Subtarget);
41939}
41940
41941/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
41942static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
41943 const X86Subtarget &Subtarget) {
41944 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
41945
41946 // F[X]OR(0.0, x) -> x
41947 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
41948 return N->getOperand(1);
41949
41950 // F[X]OR(x, 0.0) -> x
41951 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
41952 return N->getOperand(0);
41953
41954 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
41955 return NewVal;
41956
41957 return lowerX86FPLogicOp(N, DAG, Subtarget);
41958}
41959
41960/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
41961static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
41962 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
41963
41964 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
41965 if (!DAG.getTarget().Options.NoNaNsFPMath ||
41966 !DAG.getTarget().Options.NoSignedZerosFPMath)
41967 return SDValue();
41968
41969 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
41970 // into FMAXC and FMINC, which are commutative operations.
41971 unsigned NewOp = 0;
41972 switch (N->getOpcode()) {
41973 default: llvm_unreachable("unknown opcode");
41974 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
41975 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
41976 }
41977
41978 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
41979 N->getOperand(0), N->getOperand(1));
41980}
41981
41982static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
41983 const X86Subtarget &Subtarget) {
41984 if (Subtarget.useSoftFloat())
41985 return SDValue();
41986
41987 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41988
41989 EVT VT = N->getValueType(0);
41990 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
41991 (Subtarget.hasSSE2() && VT == MVT::f64) ||
41992 (VT.isVector() && TLI.isTypeLegal(VT))))
41993 return SDValue();
41994
41995 SDValue Op0 = N->getOperand(0);
41996 SDValue Op1 = N->getOperand(1);
41997 SDLoc DL(N);
41998 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
41999
42000 // If we don't have to respect NaN inputs, this is a direct translation to x86
42001 // min/max instructions.
42002 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
42003 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
42004
42005 // If one of the operands is known non-NaN use the native min/max instructions
42006 // with the non-NaN input as second operand.
42007 if (DAG.isKnownNeverNaN(Op1))
42008 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
42009 if (DAG.isKnownNeverNaN(Op0))
42010 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
42011
42012 // If we have to respect NaN inputs, this takes at least 3 instructions.
42013 // Favor a library call when operating on a scalar and minimizing code size.
42014 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
42015 return SDValue();
42016
42017 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
42018 VT);
42019
42020 // There are 4 possibilities involving NaN inputs, and these are the required
42021 // outputs:
42022 // Op1
42023 // Num NaN
42024 // ----------------
42025 // Num | Max | Op0 |
42026 // Op0 ----------------
42027 // NaN | Op1 | NaN |
42028 // ----------------
42029 //
42030 // The SSE FP max/min instructions were not designed for this case, but rather
42031 // to implement:
42032 // Min = Op1 < Op0 ? Op1 : Op0
42033 // Max = Op1 > Op0 ? Op1 : Op0
42034 //
42035 // So they always return Op0 if either input is a NaN. However, we can still
42036 // use those instructions for fmaxnum by selecting away a NaN input.
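  // In pseudo-DAG form (illustrative), for fmaxnum(Op0, Op1) this emits:
  //   select (setuo Op0, Op0), Op1, (fmax Op1, Op0)
  // so a NaN Op0 selects Op1, and the quiet max handles the remaining cases.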
42037
42038 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
42039 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
42040 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
42041
42042 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
42043 // are NaN, the NaN value of Op1 is the result.
42044 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
42045}
42046
42047static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
42048 TargetLowering::DAGCombinerInfo &DCI) {
42049 EVT VT = N->getValueType(0);
42050 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42051
42052 APInt KnownUndef, KnownZero;
42053 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
42054 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
42055 KnownZero, DCI))
42056 return SDValue(N, 0);
42057
42058 // Convert a full vector load into vzload when not all bits are needed.
42059 SDValue In = N->getOperand(0);
42060 MVT InVT = In.getSimpleValueType();
42061 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
42062 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
42063 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
42064 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
42065 // Unless the load is volatile or atomic.
42066 if (LN->isSimple()) {
42067 SDLoc dl(N);
42068 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
42069 MVT MemVT = MVT::getIntegerVT(NumBits);
42070 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
42071 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
42072 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42073 SDValue VZLoad =
42074 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
42075 LN->getPointerInfo(),
42076 LN->getAlignment(),
42077 LN->getMemOperand()->getFlags());
42078 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
42079 DAG.getBitcast(InVT, VZLoad));
42080 DCI.CombineTo(N, Convert);
42081 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42082 return SDValue(N, 0);
42083 }
42084 }
42085
42086 return SDValue();
42087}
42088
42089static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
42090 TargetLowering::DAGCombinerInfo &DCI) {
42091 EVT VT = N->getValueType(0);
42092
42093 // Convert a full vector load into vzload when not all bits are needed.
42094 SDValue In = N->getOperand(0);
42095 MVT InVT = In.getSimpleValueType();
42096 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
42097 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
42098 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
42099 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
42100 // Unless the load is volatile or atomic.
42101 if (LN->isSimple()) {
42102 SDLoc dl(N);
42103 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
42104 MVT MemVT = MVT::getFloatingPointVT(NumBits);
42105 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
42106 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
42107 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42108 SDValue VZLoad =
42109 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
42110 LN->getPointerInfo(),
42111 LN->getAlignment(),
42112 LN->getMemOperand()->getFlags());
42113 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
42114 DAG.getBitcast(InVT, VZLoad));
42115 DCI.CombineTo(N, Convert);
42116 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42117 return SDValue(N, 0);
42118 }
42119 }
42120
42121 return SDValue();
42122}
42123
42124/// Do target-specific dag combines on X86ISD::ANDNP nodes.
42125static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
42126 TargetLowering::DAGCombinerInfo &DCI,
42127 const X86Subtarget &Subtarget) {
42128 MVT VT = N->getSimpleValueType(0);
42129
42130 // ANDNP(0, x) -> x
42131 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
42132 return N->getOperand(1);
42133
42134 // ANDNP(x, 0) -> 0
42135 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
42136 return DAG.getConstant(0, SDLoc(N), VT);
42137
42138 // Turn ANDNP back to AND if input is inverted.
42139 if (SDValue Not = IsNOT(N->getOperand(0), DAG))
42140 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
42141 N->getOperand(1));
42142
42143 // Attempt to recursively combine a bitmask ANDNP with shuffles.
42144 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
42145 SDValue Op(N, 0);
42146 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
42147 return Res;
42148 }
42149
42150 return SDValue();
42151}
42152
42153static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
42154 TargetLowering::DAGCombinerInfo &DCI) {
42155 SDValue N0 = N->getOperand(0);
42156 SDValue N1 = N->getOperand(1);
42157
42158 // BT ignores high bits in the bit index operand.
42159 unsigned BitWidth = N1.getValueSizeInBits();
42160 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
42161 if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
42162 return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);
42163
42164 return SDValue();
42165}
42166
42167// Try to combine sext_in_reg of a cmov of constants by extending the constants.
42168static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
42169 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
42170
42171 EVT DstVT = N->getValueType(0);
42172
42173 SDValue N0 = N->getOperand(0);
42174 SDValue N1 = N->getOperand(1);
42175 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
42176
42177 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
42178 return SDValue();
42179
42180 // Look through single use any_extends / truncs.
42181 SDValue IntermediateBitwidthOp;
42182 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
42183 N0.hasOneUse()) {
42184 IntermediateBitwidthOp = N0;
42185 N0 = N0.getOperand(0);
42186 }
42187
42188 // See if we have a single use cmov.
42189 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
42190 return SDValue();
42191
42192 SDValue CMovOp0 = N0.getOperand(0);
42193 SDValue CMovOp1 = N0.getOperand(1);
42194
42195 // Make sure both operands are constants.
42196 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
42197 !isa<ConstantSDNode>(CMovOp1.getNode()))
42198 return SDValue();
42199
42200 SDLoc DL(N);
42201
42202 // If we looked through an any_extend/trunc above, apply it to the constants too.
42203 if (IntermediateBitwidthOp) {
42204 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
42205 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
42206 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
42207 }
42208
42209 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
42210 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
42211
42212 EVT CMovVT = DstVT;
42213 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
42214 if (DstVT == MVT::i16) {
42215 CMovVT = MVT::i32;
42216 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
42217 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
42218 }
42219
42220 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
42221 N0.getOperand(2), N0.getOperand(3));
42222
42223 if (CMovVT != DstVT)
42224 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
42225
42226 return CMov;
42227}
42228
42229static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
42230 const X86Subtarget &Subtarget) {
42231 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
42232
42233 if (SDValue V = combineSextInRegCmov(N, DAG))
42234 return V;
42235
42236 EVT VT = N->getValueType(0);
42237 SDValue N0 = N->getOperand(0);
42238 SDValue N1 = N->getOperand(1);
42239 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
42240 SDLoc dl(N);
42241
42242 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
42243 // both SSE and AVX2 since there is no sign-extended shift right
42244 // operation on a vector with 64-bit elements.
42245 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
42246 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
42247 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
42248 N0.getOpcode() == ISD::SIGN_EXTEND)) {
42249 SDValue N00 = N0.getOperand(0);
42250
42251 // EXTLOAD has a better solution on AVX2,
42252 // it may be replaced with X86ISD::VSEXT node.
42253 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
42254 if (!ISD::isNormalLoad(N00.getNode()))
42255 return SDValue();
42256
42257 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
42258 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
42259 N00, N1);
42260 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
42261 }
42262 }
42263 return SDValue();
42264}
42265
42266/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
42267/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
42268/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
42269/// opportunities to combine math ops, use an LEA, or use a complex addressing
42270/// mode. This can eliminate extend, add, and shift instructions.
42271static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
42272 const X86Subtarget &Subtarget) {
42273 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
42274 Ext->getOpcode() != ISD::ZERO_EXTEND)
42275 return SDValue();
42276
42277 // TODO: This should be valid for other integer types.
42278 EVT VT = Ext->getValueType(0);
42279 if (VT != MVT::i64)
42280 return SDValue();
42281
42282 SDValue Add = Ext->getOperand(0);
42283 if (Add.getOpcode() != ISD::ADD)
42284 return SDValue();
42285
42286 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
42287 bool NSW = Add->getFlags().hasNoSignedWrap();
42288 bool NUW = Add->getFlags().hasNoUnsignedWrap();
42289
42290 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
42291 // into the 'zext'.
42292 if ((Sext && !NSW) || (!Sext && !NUW))
42293 return SDValue();
42294
42295 // Having a constant operand to the 'add' ensures that we are not increasing
42296 // the instruction count because the constant is extended for free below.
42297 // A constant operand can also become the displacement field of an LEA.
42298 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
42299 if (!AddOp1)
42300 return SDValue();
42301
42302 // Don't make the 'add' bigger if there's no hope of combining it with some
42303 // other 'add' or 'shl' instruction.
42304 // TODO: It may be profitable to generate simpler LEA instructions in place
42305 // of single 'add' instructions, but the cost model for selecting an LEA
42306 // currently has a high threshold.
42307 bool HasLEAPotential = false;
42308 for (auto *User : Ext->uses()) {
42309 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
42310 HasLEAPotential = true;
42311 break;
42312 }
42313 }
42314 if (!HasLEAPotential)
42315 return SDValue();
42316
42317 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
42318 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
42319 SDValue AddOp0 = Add.getOperand(0);
42320 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
42321 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
42322
42323 // The wider add is guaranteed to not wrap because both operands are
42324 // sign-extended.
42325 SDNodeFlags Flags;
42326 Flags.setNoSignedWrap(NSW);
42327 Flags.setNoUnsignedWrap(NUW);
42328 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
42329}
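// Illustrative sketch (hypothetical code, not taken from LLVM): a C++ pattern
// that this combine can act on. The signed add cannot wrap, so the sign
// extension used for the 64-bit address computation can be hoisted ahead of
// the add and folded into an LEA or a complex addressing mode.
static long long loadAtOffset(const long long *Base, int Index) {
  // 'Index + 3' is an 'add nsw' in IR (signed overflow is undefined), and the
  // array subscript sign-extends the result to 64 bits for the address.
  return Base[Index + 3];
}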
42330
42331// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
42332// operands and the result of CMOV is not used anywhere else - promote CMOV
42333// itself instead of promoting its result. This could be beneficial, because:
42334 // 1) X86TargetLowering::EmitLoweredSelect can later merge two (or more)
42335 // pseudo-CMOVs only when they appear one after another, and getting rid
42336 // of the result-extension code after the CMOV helps with that.
42337// 2) Promotion of constant CMOV arguments is free, hence the
42338// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
42339 // 3) The 16-bit CMOV encoding is 4 bytes and the 32-bit CMOV is 3 bytes, so
42340 // this promotion is also good in terms of code size.
42341 // (A 64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
42342 // promotion.)
42343static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
42344 SDValue CMovN = Extend->getOperand(0);
42345 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
42346 return SDValue();
42347
42348 EVT TargetVT = Extend->getValueType(0);
42349 unsigned ExtendOpcode = Extend->getOpcode();
42350 SDLoc DL(Extend);
42351
42352 EVT VT = CMovN.getValueType();
42353 SDValue CMovOp0 = CMovN.getOperand(0);
42354 SDValue CMovOp1 = CMovN.getOperand(1);
42355
42356 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
42357 !isa<ConstantSDNode>(CMovOp1.getNode()))
42358 return SDValue();
42359
42360 // Only extend to i32 or i64.
42361 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
42362 return SDValue();
42363
42364 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
42365 // are free.
42366 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
42367 return SDValue();
42368
42369 // If this is a zero extend to i64, we should only extend to i32 and use a free
42370 // zero extend to finish.
42371 EVT ExtendVT = TargetVT;
42372 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
42373 ExtendVT = MVT::i32;
42374
42375 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
42376 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
42377
42378 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
42379 CMovN.getOperand(2), CMovN.getOperand(3));
42380
42381 // Finish extending if needed.
42382 if (ExtendVT != TargetVT)
42383 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
42384
42385 return Res;
42386}
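// Illustrative sketch (hypothetical code, not taken from LLVM): a rough
// source-level analogue of the CMOV promotion above. Depending on how the
// middle end canonicalizes the select, the DAG may see a narrow CMOV of two
// constants whose result is immediately widened; promoting the constants and
// the CMOV makes the extension disappear and avoids the longer 16-bit CMOV
// encoding.
static int pickSmallConstant(short X) {
  short T = (X < 0) ? 7 : 42; // may become a 16-bit CMOV of constants
  return T;                   // result is sign-extended to i32
}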
42387
42388// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
42389// This is more or less the reverse of combineBitcastvxi1.
42390static SDValue
42391combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
42392 TargetLowering::DAGCombinerInfo &DCI,
42393 const X86Subtarget &Subtarget) {
42394 unsigned Opcode = N->getOpcode();
42395 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
42396 Opcode != ISD::ANY_EXTEND)
42397 return SDValue();
42398 if (!DCI.isBeforeLegalizeOps())
42399 return SDValue();
42400 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
42401 return SDValue();
42402
42403 SDValue N0 = N->getOperand(0);
42404 EVT VT = N->getValueType(0);
42405 EVT SVT = VT.getScalarType();
42406 EVT InSVT = N0.getValueType().getScalarType();
42407 unsigned EltSizeInBits = SVT.getSizeInBits();
42408
42409 // We must be extending a bool vector (bit-cast from a scalar integer) to
42410 // legal integer types.
42411 if (!VT.isVector())
42412 return SDValue();
42413 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
42414 return SDValue();
42415 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
42416 return SDValue();
42417
42418 SDValue N00 = N0.getOperand(0);
42419 EVT SclVT = N0.getOperand(0).getValueType();
42420 if (!SclVT.isScalarInteger())
42421 return SDValue();
42422
42423 SDLoc DL(N);
42424 SDValue Vec;
42425 SmallVector<int, 32> ShuffleMask;
42426 unsigned NumElts = VT.getVectorNumElements();
42427 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
42428
42429 // Broadcast the scalar integer to the vector elements.
42430 if (NumElts > EltSizeInBits) {
42431 // If the scalar integer is greater than the vector element size, then we
42432 // must split it down into sub-sections for broadcasting. For example:
42433 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
42434 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
42435 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
42436 unsigned Scale = NumElts / EltSizeInBits;
42437 EVT BroadcastVT =
42438 EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
42439 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
42440 Vec = DAG.getBitcast(VT, Vec);
42441
42442 for (unsigned i = 0; i != Scale; ++i)
42443 ShuffleMask.append(EltSizeInBits, i);
42444 } else {
42445 // For a smaller scalar integer, we can simply any-extend it to the vector
42446 // element size (we don't care about the upper bits) and broadcast it to all
42447 // elements.
42448 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
42449 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42450 ShuffleMask.append(NumElts, 0);
42451 }
42452 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
42453
42454 // Now, mask the relevant bit in each element.
42455 SmallVector<SDValue, 32> Bits;
42456 for (unsigned i = 0; i != NumElts; ++i) {
42457 int BitIdx = (i % EltSizeInBits);
42458 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
42459 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
42460 }
42461 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
42462 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
42463
42464 // Compare against the bitmask and extend the result.
42465 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
42466 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
42467 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
42468
42469 // For SEXT, this is now done, otherwise shift the result down for
42470 // zero-extension.
42471 if (Opcode == ISD::SIGN_EXTEND)
42472 return Vec;
42473 return DAG.getNode(ISD::SRL, DL, VT, Vec,
42474 DAG.getConstant(EltSizeInBits - 1, DL, VT));
42475}
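// Illustrative sketch (hypothetical code, not taken from LLVM): a scalar
// model of the per-lane computation built above for sext(vXi1 bitcast(iX)),
// in the simple case where the scalar fits in one vector element. Each lane
// receives the broadcast scalar, keeps only its own bit with a per-lane mask,
// and compares against that bit to produce 0 or all-ones.
static void expandMaskModel(unsigned Mask, int Out[16]) {
  for (int I = 0; I != 16; ++I) {
    unsigned Bit = 1u << I;                  // per-lane bit mask
    Out[I] = ((Mask & Bit) == Bit) ? -1 : 0; // setcc eq, then sign-extend
  }
}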
42476
42477// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
42478// result type.
42479static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
42480 const X86Subtarget &Subtarget) {
42481 SDValue N0 = N->getOperand(0);
42482 EVT VT = N->getValueType(0);
42483 SDLoc dl(N);
42484
42485 // Only do this combine with AVX512 for vector extends.
42486 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
42487 return SDValue();
42488
42489 // Only combine legal element types.
42490 EVT SVT = VT.getVectorElementType();
42491 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
42492 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
42493 return SDValue();
42494
42495 // We can only do this if the vector size is 256 bits or less.
42496 unsigned Size = VT.getSizeInBits();
42497 if (Size > 256)
42498 return SDValue();
42499
42500 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
42501 // those are the only integer compares we have.
42502 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
42503 if (ISD::isUnsignedIntSetCC(CC))
42504 return SDValue();
42505
42506 // Only do this combine if the extension will be fully consumed by the setcc.
42507 EVT N00VT = N0.getOperand(0).getValueType();
42508 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
42509 if (Size != MatchingVecType.getSizeInBits())
42510 return SDValue();
42511
42512 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
42513
42514 if (N->getOpcode() == ISD::ZERO_EXTEND)
42515 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType());
42516
42517 return Res;
42518}
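// Illustrative sketch (hypothetical code, not taken from LLVM): a loop whose
// vectorized form contains (sext (setcc ...)) with a 256-bit or smaller
// result. With AVX512 the combine above emits the compare directly in the
// wide result type instead of producing a mask register plus a separate
// extend.
static void greaterThanMask(const int *A, const int *B, int *Out) {
  for (int I = 0; I != 8; ++I)
    Out[I] = (A[I] > B[I]) ? -1 : 0; // sext(setcc sgt) per lane
}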
42519
42520static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
42521 TargetLowering::DAGCombinerInfo &DCI,
42522 const X86Subtarget &Subtarget) {
42523 SDValue N0 = N->getOperand(0);
42524 EVT VT = N->getValueType(0);
42525 EVT InVT = N0.getValueType();
42526 SDLoc DL(N);
42527
42528 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
42529 return NewCMov;
42530
42531 if (!DCI.isBeforeLegalizeOps())
42532 return SDValue();
42533
42534 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
42535 return V;
42536
42537 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
42538 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
42539 // Inverting and sign-extending a boolean is the same as zero-extending and
42540 // subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
42541 // efficiently lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
42542 // sext (xor Bool, -1) --> sub (zext Bool), 1
42543 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
42544 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
42545 }
42546
42547 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
42548 return V;
42549
42550 if (VT.isVector())
42551 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
42552 return R;
42553
42554 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
42555 return NewAdd;
42556
42557 return SDValue();
42558}
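// Illustrative sketch (hypothetical code, not taken from LLVM): a function
// that may be expressed as sext(xor Bool, -1) in the DAG. The combine above
// rewrites it to sub(zext Bool, 1), which lowers to a DEC or an LEA rather
// than a NOT followed by a sign extension.
static long long maskFromInvertedBool(bool B) {
  return B ? 0 : -1; // equivalent to sign-extending !B
}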
42559
42560static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
42561 TargetLowering::DAGCombinerInfo &DCI,
42562 const X86Subtarget &Subtarget) {
42563 SDLoc dl(N);
42564 EVT VT = N->getValueType(0);
42565
42566 // Let legalize expand this if it isn't a legal type yet.
42567 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42568 if (!TLI.isTypeLegal(VT))
42569 return SDValue();
42570
42571 EVT ScalarVT = VT.getScalarType();
42572 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
42573 return SDValue();
42574
42575 SDValue A = N->getOperand(0);
42576 SDValue B = N->getOperand(1);
42577 SDValue C = N->getOperand(2);
42578
42579 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
42580 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
42581 bool LegalOperations = !DCI.isBeforeLegalizeOps();
42582 if (TLI.isNegatibleForFree(V, DAG, LegalOperations, CodeSize) == 2) {
42583 V = TLI.getNegatedExpression(V, DAG, LegalOperations, CodeSize);
42584 return true;
42585 }
42586 // Look through extract_vector_elts. If it comes from an FNEG, create a
42587 // new extract from the FNEG input.
42588 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
42589 isNullConstant(V.getOperand(1))) {
42590 SDValue Vec = V.getOperand(0);
42591 if (TLI.isNegatibleForFree(Vec, DAG, LegalOperations, CodeSize) == 2) {
42592 SDValue NegVal =
42593 TLI.getNegatedExpression(Vec, DAG, LegalOperations, CodeSize);
42594 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
42595 NegVal, V.getOperand(1));
42596 return true;
42597 }
42598 }
42599
42600 return false;
42601 };
42602
42603 // Do not convert the passthru input of scalar intrinsics.
42604 // FIXME: We could allow negations of the lower element only.
42605 bool NegA = invertIfNegative(A);
42606 bool NegB = invertIfNegative(B);
42607 bool NegC = invertIfNegative(C);
42608
42609 if (!NegA && !NegB && !NegC)
42610 return SDValue();
42611
42612 unsigned NewOpcode =
42613 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
42614
42615 if (N->getNumOperands() == 4)
42616 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
42617 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
42618}
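// Illustrative sketch (hypothetical code, not taken from LLVM): a scalar use
// of the FNEG folding above. When an FMA instruction set is available, the
// negation of the addend is absorbed into the opcode (FMADD becomes FMSUB),
// so no separate sign-flip instruction is needed.
static float fusedMulSub(float A, float B, float C) {
  return __builtin_fmaf(A, B, -C); // fma(A, B, -C)
}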
42619
42620// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
42621// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
42622static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
42623 TargetLowering::DAGCombinerInfo &DCI) {
42624 SDLoc dl(N);
42625 EVT VT = N->getValueType(0);
42626 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42627 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
42628 bool LegalOperations = !DCI.isBeforeLegalizeOps();
42629
42630 SDValue N2 = N->getOperand(2);
42631 if (TLI.isNegatibleForFree(N2, DAG, LegalOperations, CodeSize) != 2)
42632 return SDValue();
42633
42634 SDValue NegN2 = TLI.getNegatedExpression(N2, DAG, LegalOperations, CodeSize);
42635 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
42636
42637 if (N->getNumOperands() == 4)
42638 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
42639 NegN2, N->getOperand(3));
42640 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
42641 NegN2);
42642}
42643
42644static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
42645 TargetLowering::DAGCombinerInfo &DCI,
42646 const X86Subtarget &Subtarget) {
42647 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
42648 // (and (i32 x86isd::setcc_carry), 1)
42649 // This eliminates the zext. This transformation is necessary because
42650 // ISD::SETCC is always legalized to i8.
42651 SDLoc dl(N);
42652 SDValue N0 = N->getOperand(0);
42653 EVT VT = N->getValueType(0);
42654
42655 if (N0.getOpcode() == ISD::AND &&
42656 N0.hasOneUse() &&
42657 N0.getOperand(0).hasOneUse()) {
42658 SDValue N00 = N0.getOperand(0);
42659 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
42660 if (!isOneConstant(N0.getOperand(1)))
42661 return SDValue();
42662 return DAG.getNode(ISD::AND, dl, VT,
42663 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
42664 N00.getOperand(0), N00.getOperand(1)),
42665 DAG.getConstant(1, dl, VT));
42666 }
42667 }
42668
42669 if (N0.getOpcode() == ISD::TRUNCATE &&
42670 N0.hasOneUse() &&
42671 N0.getOperand(0).hasOneUse()) {
42672 SDValue N00 = N0.getOperand(0);
42673 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
42674 return DAG.getNode(ISD::AND, dl, VT,
42675 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
42676 N00.getOperand(0), N00.getOperand(1)),
42677 DAG.getConstant(1, dl, VT));
42678 }
42679 }
42680
42681 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
42682 return NewCMov;
42683
42684 if (DCI.isBeforeLegalizeOps())
42685 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
42686 return V;
42687
42688 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
42689 return V;
42690
42691 if (VT.isVector())
42692 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
42693 return R;
42694
42695 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
42696 return NewAdd;
42697
42698 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
42699 return R;
42700
42701 // TODO: Combine with any target/faux shuffle.
42702 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
42703 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
42704 SDValue N00 = N0.getOperand(0);
42705 SDValue N01 = N0.getOperand(1);
42706 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
42707 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
42708 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
42709 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
42710 return concatSubVectors(N00, N01, DAG, dl);
42711 }
42712 }
42713
42714 return SDValue();
42715}
42716
42717/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
42718/// recognizable memcmp expansion.
42719static bool isOrXorXorTree(SDValue X, bool Root = true) {
42720 if (X.getOpcode() == ISD::OR)
42721 return isOrXorXorTree(X.getOperand(0), false) &&
42722 isOrXorXorTree(X.getOperand(1), false);
42723 if (Root)
42724 return false;
42725 return X.getOpcode() == ISD::XOR;
42726}
42727
42728/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
42729/// expansion.
42730template<typename F>
42731static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
42732 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
42733 SDValue Op0 = X.getOperand(0);
42734 SDValue Op1 = X.getOperand(1);
42735 if (X.getOpcode() == ISD::OR) {
42736 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
42737 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
42738 if (VecVT != CmpVT)
42739 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
42740 if (HasPT)
42741 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
42742 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
42743 } else if (X.getOpcode() == ISD::XOR) {
42744 SDValue A = SToV(Op0);
42745 SDValue B = SToV(Op1);
42746 if (VecVT != CmpVT)
42747 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
42748 if (HasPT)
42749 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
42750 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
42751 }
42752 llvm_unreachable("Impossible");
42753}
42754
42755/// Try to map a 128-bit or larger integer comparison to vector instructions
42756/// before type legalization splits it up into chunks.
42757static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
42758 const X86Subtarget &Subtarget) {
42759 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
42760 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
42761
42762 // We're looking for an oversized integer equality comparison.
42763 SDValue X = SetCC->getOperand(0);
42764 SDValue Y = SetCC->getOperand(1);
42765 EVT OpVT = X.getValueType();
42766 unsigned OpSize = OpVT.getSizeInBits();
42767 if (!OpVT.isScalarInteger() || OpSize < 128)
42768 return SDValue();
42769
42770 // Ignore a comparison with zero because that gets special treatment in
42771 // EmitTest(). But make an exception for the special case of a pair of
42772 // logically-combined vector-sized operands compared to zero. This pattern may
42773 // be generated by the memcmp expansion pass with oversized integer compares
42774 // (see PR33325).
42775 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
42776 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
42777 return SDValue();
42778
42779 // Don't perform this combine if constructing the vector will be expensive.
42780 auto IsVectorBitCastCheap = [](SDValue X) {
42781 X = peekThroughBitcasts(X);
42782 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
42783 X.getOpcode() == ISD::LOAD;
42784 };
42785 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
42786 !IsOrXorXorTreeCCZero)
42787 return SDValue();
42788
42789 EVT VT = SetCC->getValueType(0);
42790 SDLoc DL(SetCC);
42791 bool HasAVX = Subtarget.hasAVX();
42792
42793 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
42794 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
42795 // Otherwise use PCMPEQ (plus AND) and mask testing.
42796 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
42797 (OpSize == 256 && HasAVX) ||
42798 (OpSize == 512 && Subtarget.useAVX512Regs())) {
42799 bool HasPT = Subtarget.hasSSE41();
42800
42801 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
42802 // vector registers are essentially free. (Technically, widening registers
42803 // prevents load folding, but the tradeoff is worth it.)
42804 bool PreferKOT = Subtarget.preferMaskRegisters();
42805 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
42806
42807 EVT VecVT = MVT::v16i8;
42808 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
42809 if (OpSize == 256) {
42810 VecVT = MVT::v32i8;
42811 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
42812 }
42813 EVT CastVT = VecVT;
42814 bool NeedsAVX512FCast = false;
42815 if (OpSize == 512 || NeedZExt) {
42816 if (Subtarget.hasBWI()) {
42817 VecVT = MVT::v64i8;
42818 CmpVT = MVT::v64i1;
42819 if (OpSize == 512)
42820 CastVT = VecVT;
42821 } else {
42822 VecVT = MVT::v16i32;
42823 CmpVT = MVT::v16i1;
42824 CastVT = OpSize == 512 ? VecVT :
42825 OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
42826 NeedsAVX512FCast = true;
42827 }
42828 }
42829
42830 auto ScalarToVector = [&](SDValue X) -> SDValue {
42831 bool TmpZext = false;
42832 EVT TmpCastVT = CastVT;
42833 if (X.getOpcode() == ISD::ZERO_EXTEND) {
42834 SDValue OrigX = X.getOperand(0);
42835 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
42836 if (OrigSize < OpSize) {
42837 if (OrigSize == 128) {
42838 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
42839 X = OrigX;
42840 TmpZext = true;
42841 } else if (OrigSize == 256) {
42842 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
42843 X = OrigX;
42844 TmpZext = true;
42845 }
42846 }
42847 }
42848 X = DAG.getBitcast(TmpCastVT, X);
42849 if (!NeedZExt && !TmpZext)
42850 return X;
42851 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42852 MVT VecIdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
42853 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
42854 DAG.getConstant(0, DL, VecVT), X,
42855 DAG.getConstant(0, DL, VecIdxVT));
42856 };
42857
42858 SDValue Cmp;
42859 if (IsOrXorXorTreeCCZero) {
42860 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
42861 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
42862 // Use 2 vector equality compares and 'and' the results before doing a
42863 // MOVMSK.
42864 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
42865 } else {
42866 SDValue VecX = ScalarToVector(X);
42867 SDValue VecY = ScalarToVector(Y);
42868 if (VecVT != CmpVT) {
42869 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
42870 } else if (HasPT) {
42871 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
42872 } else {
42873 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
42874 }
42875 }
42876 // AVX512 should emit a setcc that will lower to kortest.
42877 if (VecVT != CmpVT) {
42878 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
42879 CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
42880 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
42881 DAG.getConstant(0, DL, KRegVT), CC);
42882 }
42883 if (HasPT) {
42884 SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
42885 Cmp);
42886 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
42887 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
42888 SDValue SetCC = getSETCC(X86CC, PT, DL, DAG);
42889 return DAG.getNode(ISD::TRUNCATE, DL, VT, SetCC.getValue(0));
42890 }
42891 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
42892 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
42893 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
42894 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
42895 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
42896 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
42897 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
42898 MVT::i32);
42899 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
42900 }
42901
42902 return SDValue();
42903}
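// Illustrative sketch (hypothetical code, not taken from LLVM): a 16-byte
// equality test. The memcmp expansion pass turns this into an i128 setcc
// (possibly as an or-of-xors tree), which the combine above maps to
// PCMPEQB+PMOVMSKB, XOR+PTEST, or PCMPNEQ+KORTEST instead of splitting it
// into two 64-bit scalar compares.
static bool keysEqual(const char *A, const char *B) {
  return __builtin_memcmp(A, B, 16) == 0;
}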
42904
42905static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
42906 const X86Subtarget &Subtarget) {
42907 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
42908 SDValue LHS = N->getOperand(0);
42909 SDValue RHS = N->getOperand(1);
42910 EVT VT = N->getValueType(0);
42911 EVT OpVT = LHS.getValueType();
42912 SDLoc DL(N);
42913
42914 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
42915 // 0-x == y --> x+y == 0
42916 // 0-x != y --> x+y != 0
42917 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
42918 LHS.hasOneUse()) {
42919 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
42920 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
42921 }
42922 // x == 0-y --> x+y == 0
42923 // x != 0-y --> x+y != 0
42924 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
42925 RHS.hasOneUse()) {
42926 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
42927 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
42928 }
42929
42930 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
42931 return V;
42932 }
42933
42934 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
42935 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
42936 // Put build_vectors on the right.
42937 if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
42938 std::swap(LHS, RHS);
42939 CC = ISD::getSetCCSwappedOperands(CC);
42940 }
42941
42942 bool IsSEXT0 =
42943 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
42944 (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
42945 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
42946
42947 if (IsSEXT0 && IsVZero1) {
42948 assert(VT == LHS.getOperand(0).getValueType() &&
42949        "Unexpected operand type");
42950 if (CC == ISD::SETGT)
42951 return DAG.getConstant(0, DL, VT);
42952 if (CC == ISD::SETLE)
42953 return DAG.getConstant(1, DL, VT);
42954 if (CC == ISD::SETEQ || CC == ISD::SETGE)
42955 return DAG.getNOT(DL, LHS.getOperand(0), VT);
42956
42957 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
42958        "Unexpected condition code!");
42959 return LHS.getOperand(0);
42960 }
42961 }
42962
42963 // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
42964 // pre-promote its result type since vXi1 vectors don't get promoted
42965 // during type legalization.
42966 // NOTE: The element count check is to ignore operand types that need to
42967 // go through type promotion to a 128-bit vector.
42968 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
42969 VT.getVectorElementType() == MVT::i1 &&
42970 (OpVT.getVectorElementType() == MVT::i8 ||
42971 OpVT.getVectorElementType() == MVT::i16)) {
42972 SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
42973 N->getOperand(2));
42974 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
42975 }
42976
42977 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
42978 // to avoid scalarization via legalization because v4i32 is not a legal type.
42979 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
42980 LHS.getValueType() == MVT::v4f32)
42981 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
42982
42983 return SDValue();
42984}
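// Illustrative sketch (hypothetical code, not taken from LLVM): a compare
// against a negated value. The fold above rewrites '0 - X == Y' as
// 'X + Y == 0', so the comparison can reuse the flags produced by the ADD.
static bool isNegationOf(int X, int Y) {
  return -X == Y;
}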
42985
42986static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
42987 TargetLowering::DAGCombinerInfo &DCI,
42988 const X86Subtarget &Subtarget) {
42989 SDValue Src = N->getOperand(0);
42990 MVT SrcVT = Src.getSimpleValueType();
42991 MVT VT = N->getSimpleValueType(0);
42992 unsigned NumBits = VT.getScalarSizeInBits();
42993 unsigned NumElts = SrcVT.getVectorNumElements();
42994
42995 // Perform constant folding.
42996 if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
42997 assert(VT == MVT::i32 && "Unexpected result type");
42998 APInt Imm(32, 0);
42999 for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
43000 if (!Src.getOperand(Idx).isUndef() &&
43001 Src.getConstantOperandAPInt(Idx).isNegative())
43002 Imm.setBit(Idx);
43003 }
43004 return DAG.getConstant(Imm, SDLoc(N), VT);
43005 }
43006
43007 // Look through int->fp bitcasts that don't change the element width.
43008 unsigned EltWidth = SrcVT.getScalarSizeInBits();
43009 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
43010 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
43011 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
43012
43013 // Fold movmsk(not(x)) -> not(movmsk) to improve folding of movmsk results
43014 // with scalar comparisons.
43015 if (SDValue NotSrc = IsNOT(Src, DAG)) {
43016 SDLoc DL(N);
43017 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
43018 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
43019 return DAG.getNode(ISD::XOR, DL, VT,
43020 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
43021 DAG.getConstant(NotMask, DL, VT));
43022 }
43023
43024 // Simplify the inputs.
43025 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43026 APInt DemandedMask(APInt::getAllOnesValue(NumBits));
43027 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
43028 return SDValue(N, 0);
43029
43030 return SDValue();
43031}
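// Illustrative sketch (hypothetical code, not taken from LLVM): an
// intrinsics-level producer of an X86ISD::MOVMSK node. The combine above
// constant-folds MOVMSK of constant vectors, looks through int->fp bitcasts
// of the same element width, and turns movmsk(not(x)) into movmsk(x) XOR'd
// with an all-lanes constant, so the inversion happens on the scalar result.
#include <emmintrin.h>
static int byteEqualityMask(__m128i A, __m128i B) {
  return _mm_movemask_epi8(_mm_cmpeq_epi8(A, B));
}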
43032
43033static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
43034 TargetLowering::DAGCombinerInfo &DCI) {
43035 // With vector masks we only demand the upper bit of the mask.
43036 SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
43037 if (Mask.getScalarValueSizeInBits() != 1) {
43038 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43039 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
43040 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
43041 return SDValue(N, 0);
43042 }
43043
43044 return SDValue();
43045}
43046
43047static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
43048 TargetLowering::DAGCombinerInfo &DCI) {
43049 SDLoc DL(N);
43050 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
43051 SDValue Chain = GorS->getChain();
43052 SDValue Index = GorS->getIndex();
43053 SDValue Mask = GorS->getMask();
43054 SDValue Base = GorS->getBasePtr();
43055 SDValue Scale = GorS->getScale();
43056
43057 if (DCI.isBeforeLegalize()) {
43058 unsigned IndexWidth = Index.getScalarValueSizeInBits();
43059
43060 // Shrink constant indices if they are larger than 32 bits.
43061 // Only do this before legalize types since v2i64 could become v2i32.
43062 // FIXME: We could check that the type is legal if we're after legalize
43063 // types, but then we would need to construct test cases where that happens.
43064 // FIXME: We could support more than just constant vectors, but we need to be
43065 // careful with costing. A truncate that can be optimized out would be fine.
43066 // Otherwise we might only want to create a truncate if it avoids a split.
43067 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
43068 if (BV->isConstant() && IndexWidth > 32 &&
43069 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
43070 unsigned NumElts = Index.getValueType().getVectorNumElements();
43071 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
43072 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
43073 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
43074 SDValue Ops[] = { Chain, Gather->getPassThru(),
43075 Mask, Base, Index, Scale } ;
43076 return DAG.getMaskedGather(Gather->getVTList(),
43077 Gather->getMemoryVT(), DL, Ops,
43078 Gather->getMemOperand(),
43079 Gather->getIndexType());
43080 }
43081 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
43082 SDValue Ops[] = { Chain, Scatter->getValue(),
43083 Mask, Base, Index, Scale };
43084 return DAG.getMaskedScatter(Scatter->getVTList(),
43085 Scatter->getMemoryVT(), DL,
43086 Ops, Scatter->getMemOperand(),
43087 Scatter->getIndexType());
43088 }
43089 }
43090
43091 // Shrink sign/zero extends whose source is 32 bits or smaller and whose
43092 // result is wider than 32 bits, if there are sufficient sign bits. Only do
43093 // this before legalize types to avoid creating illegal types in truncate.
43094 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
43095 Index.getOpcode() == ISD::ZERO_EXTEND) &&
43096 IndexWidth > 32 &&
43097 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
43098 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
43099 unsigned NumElts = Index.getValueType().getVectorNumElements();
43100 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
43101 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
43102 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
43103 SDValue Ops[] = { Chain, Gather->getPassThru(),
43104 Mask, Base, Index, Scale } ;
43105 return DAG.getMaskedGather(Gather->getVTList(),
43106 Gather->getMemoryVT(), DL, Ops,
43107 Gather->getMemOperand(),
43108 Gather->getIndexType());
43109 }
43110 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
43111 SDValue Ops[] = { Chain, Scatter->getValue(),
43112 Mask, Base, Index, Scale };
43113 return DAG.getMaskedScatter(Scatter->getVTList(),
43114 Scatter->getMemoryVT(), DL,
43115 Ops, Scatter->getMemOperand(),
43116 Scatter->getIndexType());
43117 }
43118 }
43119
43120 if (DCI.isBeforeLegalizeOps()) {
43121 unsigned IndexWidth = Index.getScalarValueSizeInBits();
43122
43123 // Make sure the index is either i32 or i64
43124 if (IndexWidth != 32 && IndexWidth != 64) {
43125 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
43126 EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
43127 Index.getValueType().getVectorNumElements());
43128 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
43129 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
43130 SDValue Ops[] = { Chain, Gather->getPassThru(),
43131 Mask, Base, Index, Scale } ;
43132 return DAG.getMaskedGather(Gather->getVTList(),
43133 Gather->getMemoryVT(), DL, Ops,
43134 Gather->getMemOperand(),
43135 Gather->getIndexType());
43136 }
43137 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
43138 SDValue Ops[] = { Chain, Scatter->getValue(),
43139 Mask, Base, Index, Scale };
43140 return DAG.getMaskedScatter(Scatter->getVTList(),
43141 Scatter->getMemoryVT(), DL,
43142 Ops, Scatter->getMemOperand(),
43143 Scatter->getIndexType());
43144 }
43145 }
43146
43147 // With vector masks we only demand the upper bit of the mask.
43148 if (Mask.getScalarValueSizeInBits() != 1) {
43149 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43150 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
43151 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
43152 return SDValue(N, 0);
43153 }
43154
43155 return SDValue();
43156}
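// Illustrative sketch (hypothetical code, not taken from LLVM): a loop that a
// vectorizer may lower to a masked gather. The combine above shrinks index
// vectors that provably fit in 32 bits (constants, or narrow sign/zero
// extends) and legalizes unusual index widths to i32 or i64 before isel.
static void gatherAccumulate(float *Out, const float *Table,
                             const int *Idx, int N) {
  for (int I = 0; I < N; ++I)
    Out[I] += Table[Idx[I]];
}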
43157
43158// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
43159static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
43160 const X86Subtarget &Subtarget) {
43161 SDLoc DL(N);
43162 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
43163 SDValue EFLAGS = N->getOperand(1);
43164
43165 // Try to simplify the EFLAGS and condition code operands.
43166 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
43167 return getSETCC(CC, Flags, DL, DAG);
43168
43169 return SDValue();
43170}
43171
43172/// Optimize branch condition evaluation.
43173static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
43174 const X86Subtarget &Subtarget) {
43175 SDLoc DL(N);
43176 SDValue EFLAGS = N->getOperand(3);
43177 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
43178
43179 // Try to simplify the EFLAGS and condition code operands.
43180 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
43181 // RAUW them under us.
43182 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
43183 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
43184 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
43185 N->getOperand(1), Cond, Flags);
43186 }
43187
43188 return SDValue();
43189}
43190
43191static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
43192 SelectionDAG &DAG) {
43193 // Take advantage of vector comparisons producing 0 or -1 in each lane to
43194 // optimize away operation when it's from a constant.
43195 //
43196 // The general transformation is:
43197 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
43198 // AND(VECTOR_CMP(x,y), constant2)
43199 // constant2 = UNARYOP(constant)
43200
43201 // Early exit if this isn't a vector operation, the operand of the
43202 // unary operation isn't a bitwise AND, or if the sizes of the operations
43203 // aren't the same.
43204 EVT VT = N->getValueType(0);
43205 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
43206 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
43207 VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
43208 return SDValue();
43209
43210 // Now check that the other operand of the AND is a constant. We could
43211 // make the transformation for non-constant splats as well, but it's unclear
43212 // that it would be a benefit, as it would not eliminate any operations, just
43213 // perform one more step in scalar code before moving to the vector unit.
43214 if (auto *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(0).getOperand(1))) {
43215 // Bail out if the vector isn't a constant.
43216 if (!BV->isConstant())
43217 return SDValue();
43218
43219 // Everything checks out. Build up the new and improved node.
43220 SDLoc DL(N);
43221 EVT IntVT = BV->getValueType(0);
43222 // Create a new constant of the appropriate type for the transformed
43223 // DAG.
43224 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
43225 // The AND node needs bitcasts to/from an integer vector type around it.
43226 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
43227 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
43228 N->getOperand(0)->getOperand(0), MaskConst);
43229 SDValue Res = DAG.getBitcast(VT, NewAnd);
43230 return Res;
43231 }
43232
43233 return SDValue();
43234}
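// Illustrative sketch (hypothetical code, not taken from LLVM): a loop whose
// vectorized form can contain UNARYOP(AND(VECTOR_CMP(x,y), constant)) with
// the unary op being an int->fp conversion. Folding the conversion into the
// constant leaves just the compare and the AND on the vector side.
static void compareToFloat(const int *A, const int *B, float *Out) {
  for (int I = 0; I != 4; ++I)
    Out[I] = (float)((A[I] < B[I]) ? 1 : 0);
}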
43235
43236/// If we are converting a value to floating-point, try to replace scalar
43237/// truncate of an extracted vector element with a bitcast. This tries to keep
43238/// the sequence on XMM registers rather than moving between vector and GPRs.
43239static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
43240 // TODO: This is currently only used by combineSIntToFP, but it is generalized
43241 // to allow being called by any similar cast opcode.
43242 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
43243 SDValue Trunc = N->getOperand(0);
43244 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
43245 return SDValue();
43246
43247 SDValue ExtElt = Trunc.getOperand(0);
43248 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
43249 !isNullConstant(ExtElt.getOperand(1)))
43250 return SDValue();
43251
43252 EVT TruncVT = Trunc.getValueType();
43253 EVT SrcVT = ExtElt.getValueType();
43254 unsigned DestWidth = TruncVT.getSizeInBits();
43255 unsigned SrcWidth = SrcVT.getSizeInBits();
43256 if (SrcWidth % DestWidth != 0)
43257 return SDValue();
43258
43259 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
43260 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
43261 unsigned VecWidth = SrcVecVT.getSizeInBits();
43262 unsigned NumElts = VecWidth / DestWidth;
43263 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
43264 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
43265 SDLoc DL(N);
43266 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
43267 BitcastVec, ExtElt.getOperand(1));
43268 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
43269}
43270
43271static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
43272 const X86Subtarget &Subtarget) {
43273 SDValue Op0 = N->getOperand(0);
43274 EVT VT = N->getValueType(0);
43275 EVT InVT = Op0.getValueType();
43276
43277 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
43278 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
43279 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
43280 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
43281 SDLoc dl(N);
43282 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
43283 InVT.getVectorNumElements());
43284 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
43285
43286 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
43287 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
43288 }
43289
43290 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
43291 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
43292 // the optimization here.
43293 if (DAG.SignBitIsZero(Op0))
43294 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
43295
43296 return SDValue();
43297}
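// Illustrative sketch (hypothetical code, not taken from LLVM): a conversion
// of narrow unsigned data that hits the UINT_TO_FP rewrite above. The inputs
// are zero-extended to i32 first, after which the signed conversion is safe
// because the values are known to be non-negative.
static void bytesToFloat(const unsigned char *In, float *Out) {
  for (int I = 0; I != 8; ++I)
    Out[I] = (float)In[I];
}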
43298
43299static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
43300 TargetLowering::DAGCombinerInfo &DCI,
43301 const X86Subtarget &Subtarget) {
43302 // First try to optimize away the conversion entirely when it's
43303 // conditionally from a constant. Vectors only.
43304 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
43305 return Res;
43306
43307 // Now move on to more general possibilities.
43308 SDValue Op0 = N->getOperand(0);
43309 EVT VT = N->getValueType(0);
43310 EVT InVT = Op0.getValueType();
43311
43312 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
43313 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
43314 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
43315 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
43316 SDLoc dl(N);
43317 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
43318 InVT.getVectorNumElements());
43319 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
43320 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
43321 }
43322
43323 // Without AVX512DQ we only support i64 to float scalar conversion. For both
43324 // vectors and scalars, see if we know that the upper bits are all the sign
43325 // bit, in which case we can truncate the input to i32 and convert from that.
43326 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
43327 unsigned BitWidth = InVT.getScalarSizeInBits();
43328 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
43329 if (NumSignBits >= (BitWidth - 31)) {
43330 EVT TruncVT = MVT::i32;
43331 if (InVT.isVector())
43332 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
43333 InVT.getVectorNumElements());
43334 SDLoc dl(N);
43335 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
43336 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
43337 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
43338 }
43339 // If we're after legalize and the type is v2i32 we need to shuffle and
43340 // use CVTSI2P.
43341 assert(InVT == MVT::v2i64 && "Unexpected VT!");
43342 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
43343 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
43344 { 0, 2, -1, -1 });
43345 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
43346 }
43347 }
43348
43349 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
43350 // a 32-bit target where SSE doesn't support i64->FP operations.
43351 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
43352 Op0.getOpcode() == ISD::LOAD) {
43353 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
43354 EVT LdVT = Ld->getValueType(0);
43355
43356 // This transformation is not supported if the result type is f16 or f128.
43357 if (VT == MVT::f16 || VT == MVT::f128)
43358 return SDValue();
43359
43360 // If we have AVX512DQ we can use packed conversion instructions unless
43361 // the VT is f80.
43362 if (Subtarget.hasDQI() && VT != MVT::f80)
43363 return SDValue();
43364
43365 if (Ld->isSimple() && !VT.isVector() &&
43366 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
43367 !Subtarget.is64Bit() && LdVT == MVT::i64) {
43368 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
43369 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
43370 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
43371 return FILDChain;
43372 }
43373 }
43374
43375 if (SDValue V = combineToFPTruncExtElt(N, DAG))
43376 return V;
43377
43378 return SDValue();
43379}
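// Illustrative sketch (hypothetical code, not taken from LLVM): an i64-to-
// double conversion where all upper bits are copies of the sign bit. The
// combine above truncates such inputs to i32 and converts from there, which
// every SSE2 target supports for both scalars and vectors.
static double widenThenConvert(int X) {
  long long Wide = X;  // sign-extended, so at least 33 sign bits
  return (double)Wide; // sint_to_fp(i64) -> sint_to_fp(trunc to i32)
}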
43380
43381static bool needCarryOrOverflowFlag(SDValue Flags) {
43382 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
43383
43384 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
43385 UI != UE; ++UI) {
43386 SDNode *User = *UI;
43387
43388 X86::CondCode CC;
43389 switch (User->getOpcode()) {
43390 default:
43391 // Be conservative.
43392 return true;
43393 case X86ISD::SETCC:
43394 case X86ISD::SETCC_CARRY:
43395 CC = (X86::CondCode)User->getConstantOperandVal(0);
43396 break;
43397 case X86ISD::BRCOND:
43398 CC = (X86::CondCode)User->getConstantOperandVal(2);
43399 break;
43400 case X86ISD::CMOV:
43401 CC = (X86::CondCode)User->getConstantOperandVal(2);
43402 break;
43403 }
43404
43405 switch (CC) {
43406 default: break;
43407 case X86::COND_A: case X86::COND_AE:
43408 case X86::COND_B: case X86::COND_BE:
43409 case X86::COND_O: case X86::COND_NO:
43410 case X86::COND_G: case X86::COND_GE:
43411 case X86::COND_L: case X86::COND_LE:
43412 return true;
43413 }
43414 }
43415
43416 return false;
43417}
43418
43419static bool onlyZeroFlagUsed(SDValue Flags) {
43420 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
43421
43422 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
43423 UI != UE; ++UI) {
43424 SDNode *User = *UI;
43425
43426 unsigned CCOpNo;
43427 switch (User->getOpcode()) {
43428 default:
43429 // Be conservative.
43430 return false;
43431 case X86ISD::SETCC: CCOpNo = 0; break;
43432 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
43433 case X86ISD::BRCOND: CCOpNo = 2; break;
43434 case X86ISD::CMOV: CCOpNo = 2; break;
43435 }
43436
43437 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
43438 if (CC != X86::COND_E && CC != X86::COND_NE)
43439 return false;
43440 }
43441
43442 return true;
43443}
43444
43445static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
43446 // Only handle test patterns.
43447 if (!isNullConstant(N->getOperand(1)))
43448 return SDValue();
43449
43450 // If we have a CMP of a truncated binop, see if we can make a smaller binop
43451 // and use its flags directly.
43452 // TODO: Maybe we should try promoting compares that only use the zero flag
43453 // first if we can prove the upper bits with computeKnownBits?
43454 SDLoc dl(N);
43455 SDValue Op = N->getOperand(0);
43456 EVT VT = Op.getValueType();
43457
43458 // If we have a constant logical shift that's only used in a comparison
43459 // against zero, turn it into an equivalent AND. This allows turning it into
43460 // a TEST instruction later.
43461 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
43462 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
43463 onlyZeroFlagUsed(SDValue(N, 0))) {
43464 unsigned BitWidth = VT.getSizeInBits();
43465 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
43466 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
43467 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
43468 APInt Mask = Op.getOpcode() == ISD::SRL
43469 ? APInt::getHighBitsSet(BitWidth, MaskBits)
43470 : APInt::getLowBitsSet(BitWidth, MaskBits);
43471 if (Mask.isSignedIntN(32)) {
43472 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
43473 DAG.getConstant(Mask, dl, VT));
43474 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
43475 DAG.getConstant(0, dl, VT));
43476 }
43477 }
43478 }
43479
43480 // Look for a truncate with a single use.
43481 if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse())
43482 return SDValue();
43483
43484 Op = Op.getOperand(0);
43485
43486 // Arithmetic op can only have one use.
43487 if (!Op.hasOneUse())
43488 return SDValue();
43489
43490 unsigned NewOpc;
43491 switch (Op.getOpcode()) {
43492 default: return SDValue();
43493 case ISD::AND:
43494 // Skip AND with a constant. We have special handling for AND with an
43495 // immediate during isel to generate TEST instructions.
43496 if (isa<ConstantSDNode>(Op.getOperand(1)))
43497 return SDValue();
43498 NewOpc = X86ISD::AND;
43499 break;
43500 case ISD::OR: NewOpc = X86ISD::OR; break;
43501 case ISD::XOR: NewOpc = X86ISD::XOR; break;
43502 case ISD::ADD:
43503 // If the carry or overflow flag is used, we can't truncate.
43504 if (needCarryOrOverflowFlag(SDValue(N, 0)))
43505 return SDValue();
43506 NewOpc = X86ISD::ADD;
43507 break;
43508 case ISD::SUB:
43509 // If the carry or overflow flag is used, we can't truncate.
43510 if (needCarryOrOverflowFlag(SDValue(N, 0)))
43511 return SDValue();
43512 NewOpc = X86ISD::SUB;
43513 break;
43514 }
43515
43516 // We found an op we can narrow. Truncate its inputs.
43517 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
43518 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
43519
43520 // Use a X86 specific opcode to avoid DAG combine messing with it.
43521 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
43522 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
43523
43524 // For AND, keep a CMP so that we can match the test pattern.
43525 if (NewOpc == X86ISD::AND)
43526 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
43527 DAG.getConstant(0, dl, VT));
43528
43529 // Return the flags.
43530 return Op.getValue(1);
43531}
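// Illustrative sketch (hypothetical code, not taken from LLVM): a test of a
// truncated 64-bit subtraction. Because only the zero flag of the low 32 bits
// is observed, the combine above performs the SUB at 32 bits and uses its
// flags directly instead of subtracting at 64 bits and then truncating.
static bool lowHalvesDiffer(long long A, long long B) {
  return (int)(A - B) != 0;
}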
43532
43533static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
43534 TargetLowering::DAGCombinerInfo &DCI) {
43535 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
43536        "Expected X86ISD::ADD or X86ISD::SUB");
43537
43538 SDLoc DL(N);
43539 SDValue LHS = N->getOperand(0);
43540 SDValue RHS = N->getOperand(1);
43541 MVT VT = LHS.getSimpleValueType();
43542 unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
43543
43544 // If we don't use the flag result, simplify back to a generic ADD/SUB.
43545 if (!N->hasAnyUseOfValue(1)) {
43546 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
43547 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
43548 }
43549
43550 // Fold any similar generic ADD/SUB opcodes to reuse this node.
43551 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
43552 SDValue Ops[] = {N0, N1};
43553 SDVTList VTs = DAG.getVTList(N->getValueType(0));
43554 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
43555 SDValue Op(N, 0);
43556 if (Negate)
43557 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
43558 DCI.CombineTo(GenericAddSub, Op);
43559 }
43560 };
43561 MatchGeneric(LHS, RHS, false);
43562 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
43563
43564 return SDValue();
43565}
43566
43567static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
43568 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
43569 MVT VT = N->getSimpleValueType(0);
43570 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
43571 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
43572 N->getOperand(0), N->getOperand(1),
43573 Flags);
43574 }
43575
43576 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
43577 // iff the flag result is dead.
43578 SDValue Op0 = N->getOperand(0);
43579 SDValue Op1 = N->getOperand(1);
43580 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
43581 !N->hasAnyUseOfValue(1))
43582 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
43583 Op0.getOperand(1), N->getOperand(2));
43584
43585 return SDValue();
43586}
43587
43588// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
43589static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
43590 TargetLowering::DAGCombinerInfo &DCI) {
43591 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
43592 // the result is either zero or one (depending on the input carry bit).
43593 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
43594 if (X86::isZeroNode(N->getOperand(0)) &&
43595 X86::isZeroNode(N->getOperand(1)) &&
43596 // We don't have a good way to replace an EFLAGS use, so only do this
43597 // while the EFLAGS result is dead.
43598 SDValue(N, 1).use_empty()) {
43599 SDLoc DL(N);
43600 EVT VT = N->getValueType(0);
43601 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
43602 SDValue Res1 =
43603 DAG.getNode(ISD::AND, DL, VT,
43604 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
43605 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
43606 N->getOperand(2)),
43607 DAG.getConstant(1, DL, VT));
43608 return DCI.CombineTo(N, Res1, CarryOut);
43609 }
43610
43611 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
43612 MVT VT = N->getSimpleValueType(0);
43613 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
43614 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
43615 N->getOperand(0), N->getOperand(1),
43616 Flags);
43617 }
43618
43619 return SDValue();
43620}
43621
43622/// If this is an add or subtract where one operand is produced by a cmp+setcc,
43623/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
43624/// with CMP+{ADC, SBB}.
43625static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
43626 bool IsSub = N->getOpcode() == ISD::SUB;
43627 SDValue X = N->getOperand(0);
43628 SDValue Y = N->getOperand(1);
43629
43630 // If this is an add, canonicalize a zext operand to the RHS.
43631 // TODO: Incomplete? What if both sides are zexts?
43632 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
43633 Y.getOpcode() != ISD::ZERO_EXTEND)
43634 std::swap(X, Y);
43635
43636 // Look through a one-use zext.
43637 bool PeekedThroughZext = false;
43638 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
43639 Y = Y.getOperand(0);
43640 PeekedThroughZext = true;
43641 }
43642
43643 // If this is an add, canonicalize a setcc operand to the RHS.
43644 // TODO: Incomplete? What if both sides are setcc?
43645 // TODO: Should we allow peeking through a zext of the other operand?
43646 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
43647 Y.getOpcode() != X86ISD::SETCC)
43648 std::swap(X, Y);
43649
43650 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
43651 return SDValue();
43652
43653 SDLoc DL(N);
43654 EVT VT = N->getValueType(0);
43655 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
43656
43657 // If X is -1 or 0, then we have an opportunity to avoid constants required in
43658 // the general case below.
43659 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
43660 if (ConstantX) {
43661 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
43662 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
43663 // This is a complicated way to get -1 or 0 from the carry flag:
43664 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
43665 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
43666 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
43667 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
43668 Y.getOperand(1));
43669 }
43670
43671 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
43672 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
43673 SDValue EFLAGS = Y->getOperand(1);
43674 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
43675 EFLAGS.getValueType().isInteger() &&
43676 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
43677 // Swap the operands of a SUB, and we have the same pattern as above.
43678 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
43679 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
43680 SDValue NewSub = DAG.getNode(
43681 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
43682 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
43683 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
43684 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
43685 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
43686 NewEFLAGS);
43687 }
43688 }
43689 }
43690
43691 if (CC == X86::COND_B) {
43692 // X + SETB Z --> adc X, 0
43693 // X - SETB Z --> sbb X, 0
43694 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
43695 DAG.getVTList(VT, MVT::i32), X,
43696 DAG.getConstant(0, DL, VT), Y.getOperand(1));
43697 }
43698
43699 if (CC == X86::COND_A) {
43700 SDValue EFLAGS = Y->getOperand(1);
43701 // Try to convert COND_A into COND_B in an attempt to facilitate
43702 // materializing "setb reg".
43703 //
43704 // Do not flip "e > c", where "c" is a constant, because the Cmp
43705 // instruction cannot take an immediate as its first operand.
43706 //
43707 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
43708 EFLAGS.getValueType().isInteger() &&
43709 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
43710 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
43711 EFLAGS.getNode()->getVTList(),
43712 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
43713 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
43714 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
43715 DAG.getVTList(VT, MVT::i32), X,
43716 DAG.getConstant(0, DL, VT), NewEFLAGS);
43717 }
43718 }
43719
43720 if (CC != X86::COND_E && CC != X86::COND_NE)
43721 return SDValue();
43722
43723 SDValue Cmp = Y.getOperand(1);
43724 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
43725 !X86::isZeroNode(Cmp.getOperand(1)) ||
43726 !Cmp.getOperand(0).getValueType().isInteger())
43727 return SDValue();
43728
43729 SDValue Z = Cmp.getOperand(0);
43730 EVT ZVT = Z.getValueType();
43731
43732 // If X is -1 or 0, then we have an opportunity to avoid constants required in
43733 // the general case below.
43734 if (ConstantX) {
43735 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
43736 // fake operands:
43737 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
43738 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
43739 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
43740 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
43741 SDValue Zero = DAG.getConstant(0, DL, ZVT);
43742 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
43743 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
43744 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
43745 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
43746 SDValue(Neg.getNode(), 1));
43747 }
43748
43749 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
43750 // with fake operands:
43751 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
43752 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
43753 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
43754 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
43755 SDValue One = DAG.getConstant(1, DL, ZVT);
43756 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
43757 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
43758 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cmp1);
43759 }
43760 }
43761
43762 // (cmp Z, 1) sets the carry flag if Z is 0.
43763 SDValue One = DAG.getConstant(1, DL, ZVT);
43764 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
43765
43766 // Add the flags type for ADC/SBB nodes.
43767 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
43768
43769 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
43770 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
43771 if (CC == X86::COND_NE)
43772 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
43773 DAG.getConstant(-1ULL, DL, VT), Cmp1);
43774
43775 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
43776 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
43777 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
43778 DAG.getConstant(0, DL, VT), Cmp1);
43779}
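// Illustrative sketch (hypothetical example): combineAddOrSubToADCOrSBB
// rewrites an add/sub of a setcc so the flags are consumed directly, e.g.:
//   (add X, (X86ISD::SETCC COND_B, (X86ISD::CMP A, B)))
//     --> (X86ISD::ADC X, 0, (X86ISD::CMP A, B))
// so scalar code like "x + (a < b)" lowers to CMP + ADC rather than
// CMP + SETB + ADD, and the subtract form uses SBB in the same way.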
43780
43781static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
43782 const X86Subtarget &Subtarget) {
43783 if (!Subtarget.hasSSE2())
43784 return SDValue();
43785
43786 EVT VT = N->getValueType(0);
43787
43788 // Do not use PMADD if this is not a vector type or the vector has fewer
43789 // than 8 elements.
43790 if (!VT.isVector() || VT.getVectorNumElements() < 8)
43791 return SDValue();
43792
43793 SDValue Op0 = N->getOperand(0);
43794 SDValue Op1 = N->getOperand(1);
43795
43796 auto UsePMADDWD = [&](SDValue Op) {
43797 ShrinkMode Mode;
43798 return Op.getOpcode() == ISD::MUL &&
43799 canReduceVMulWidth(Op.getNode(), DAG, Mode) &&
43800 Mode != ShrinkMode::MULU16 &&
43801 (!Subtarget.hasSSE41() ||
43802 (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
43803 Op->isOnlyUserOf(Op.getOperand(1).getNode())));
43804 };
43805
43806 SDValue MulOp, OtherOp;
43807 if (UsePMADDWD(Op0)) {
43808 MulOp = Op0;
43809 OtherOp = Op1;
43810 } else if (UsePMADDWD(Op1)) {
43811 MulOp = Op1;
43812 OtherOp = Op0;
43813 } else
43814 return SDValue();
43815
43816 SDLoc DL(N);
43817 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
43818 VT.getVectorNumElements());
43819 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
43820 VT.getVectorNumElements() / 2);
43821
43822 // Shrink the operands of mul.
43823 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
43824 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
43825
43826 // Madd vector size is half of the original vector size
43827 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43828 ArrayRef<SDValue> Ops) {
43829 MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
43830 return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
43831 };
43832 SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
43833 PMADDWDBuilder);
43834 // Fill the rest of the output with 0
43835 SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType());
43836 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
43837
43838 // Preserve the reduction flag on the ADD. We may need to revisit for the
43839 // other operand.
43840 SDNodeFlags Flags;
43841 Flags.setVectorReduction(true);
43842 return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags);
43843}
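// Illustrative sketch (hypothetical C source): combineLoopMAddPattern
// typically fires on a vectorized dot-product reduction whose factors fit in
// 16 bits, e.g.:
//   int sum = 0;
//   for (int i = 0; i != n; ++i)
//     sum += (int)a16[i] * (int)b16[i];
// The widened i32 multiply is truncated back to i16 operands and emitted as
// X86ISD::VPMADDWD (PMADDWD), which multiplies adjacent i16 pairs and sums
// them into i32 lanes; the half-width result is concatenated with zeros and
// added into the reduction vector, as done above.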
43844
43845static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
43846 const X86Subtarget &Subtarget) {
43847 if (!Subtarget.hasSSE2())
43848 return SDValue();
43849
43850 SDLoc DL(N);
43851 EVT VT = N->getValueType(0);
43852
43853 // TODO: There's nothing special about i32, any integer type above i16 should
43854 // work just as well.
43855 if (!VT.isVector() || !VT.isSimple() ||
43856 !(VT.getVectorElementType() == MVT::i32))
43857 return SDValue();
43858
43859 unsigned RegSize = 128;
43860 if (Subtarget.useBWIRegs())
43861 RegSize = 512;
43862 else if (Subtarget.hasAVX())
43863 RegSize = 256;
43864
43865 // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
43866 // TODO: We should be able to handle larger vectors by splitting them before
43867 // feeding them into several SADs, and then reducing over those.
43868 if (VT.getSizeInBits() / 4 > RegSize)
43869 return SDValue();
43870
43871 // We know N is a reduction add. To match SAD, we need one of the operands to
43872 // be an ABS.
43873 SDValue AbsOp = N->getOperand(0);
43874 SDValue OtherOp = N->getOperand(1);
43875 if (AbsOp.getOpcode() != ISD::ABS)
43876 std::swap(AbsOp, OtherOp);
43877 if (AbsOp.getOpcode() != ISD::ABS)
43878 return SDValue();
43879
43880 // Check whether we have an abs-diff pattern feeding into the select.
43881 SDValue SadOp0, SadOp1;
43882 if(!detectZextAbsDiff(AbsOp, SadOp0, SadOp1))
43883 return SDValue();
43884
43885 // SAD pattern detected. Now build a SAD instruction and an addition for
43886 // reduction. Note that the number of elements of the result of SAD is less
43887 // than the number of elements of its input. Therefore, we can only update
43888 // part of the elements in the reduction vector.
43889 SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget);
43890
43891 // The output of PSADBW is a vector of i64.
43892 // We need to turn the vector of i64 into a vector of i32.
43893 // If the reduction vector is at least as wide as the psadbw result, just
43894 // bitcast. If it's narrower which can only occur for v2i32, bits 127:16 of
43895 // the PSADBW will be zero. If we promote/narrow vectors, truncate the v2i64
43896 // result to v2i32 which will be removed by type legalization. If we widen
43897 // narrow vectors then we bitcast to v4i32 and extract v2i32.
43898 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
43899 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
43900
43901 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
43902 // Fill the upper elements with zero to match the add width.
43903 assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs");
43904 unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits();
43905 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, DL, ResVT));
43906 Ops[0] = Sad;
43907 Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
43908 } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) {
43909 Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad,
43910 DAG.getIntPtrConstant(0, DL));
43911 }
43912
43913 // Preserve the reduction flag on the ADD. We may need to revisit for the
43914 // other operand.
43915 SDNodeFlags Flags;
43916 Flags.setVectorReduction(true);
43917 return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags);
43918}
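// Illustrative sketch (hypothetical C source): combineLoopSADPattern matches
// a sum-of-absolute-differences reduction over byte inputs, e.g.:
//   unsigned sum = 0;
//   for (int i = 0; i != n; ++i)
//     sum += abs((int)a8[i] - (int)b8[i]);
// The zext+sub+abs chain becomes X86ISD::PSADBW; its i64-lane result is
// bitcast to i32 lanes, padded with zeros or extracted to the reduction
// width, and added into the reduction vector, as done above.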
43919
43920static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
43921 const SDLoc &DL, EVT VT,
43922 const X86Subtarget &Subtarget) {
43923 // Example of pattern we try to detect:
43924 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
43925 //(add (build_vector (extract_elt t, 0),
43926 // (extract_elt t, 2),
43927 // (extract_elt t, 4),
43928 // (extract_elt t, 6)),
43929 // (build_vector (extract_elt t, 1),
43930 // (extract_elt t, 3),
43931 // (extract_elt t, 5),
43932 // (extract_elt t, 7)))
43933
43934 if (!Subtarget.hasSSE2())
43935 return SDValue();
43936
43937 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
43938 Op1.getOpcode() != ISD::BUILD_VECTOR)
43939 return SDValue();
43940
43941 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
43942 VT.getVectorNumElements() < 4 ||
43943 !isPowerOf2_32(VT.getVectorNumElements()))
43944 return SDValue();
43945
43946 // Check if one of Op0,Op1 is of the form:
43947 // (build_vector (extract_elt Mul, 0),
43948 // (extract_elt Mul, 2),
43949 // (extract_elt Mul, 4),
43950 // ...
43951 // the other is of the form:
43952 // (build_vector (extract_elt Mul, 1),
43953 // (extract_elt Mul, 3),
43954 // (extract_elt Mul, 5),
43955 // ...
43956 // and identify Mul.
43957 SDValue Mul;
43958 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
43959 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
43960 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
43961 // TODO: Be more tolerant to undefs.
43962 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
43963 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
43964 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
43965 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
43966 return SDValue();
43967 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
43968 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
43969 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
43970 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
43971 if (!Const0L || !Const1L || !Const0H || !Const1H)
43972 return SDValue();
43973 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
43974 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
43975 // Commutativity of mul allows factors of a product to reorder.
43976 if (Idx0L > Idx1L)
43977 std::swap(Idx0L, Idx1L);
43978 if (Idx0H > Idx1H)
43979 std::swap(Idx0H, Idx1H);
43980 // Commutativity of add allows pairs of factors to reorder.
43981 if (Idx0L > Idx0H) {
43982 std::swap(Idx0L, Idx0H);
43983 std::swap(Idx1L, Idx1H);
43984 }
43985 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
43986 Idx1H != 2 * i + 3)
43987 return SDValue();
43988 if (!Mul) {
43989 // First time an extract_elt's source vector is visited. Must be a MUL
43990 // with twice as many vector elements as the BUILD_VECTOR.
43991 // Both extracts must be from the same MUL.
43992 Mul = Op0L->getOperand(0);
43993 if (Mul->getOpcode() != ISD::MUL ||
43994 Mul.getValueType().getVectorNumElements() != 2 * e)
43995 return SDValue();
43996 }
43997 // Check that the extract is from the same MUL previously seen.
43998 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
43999 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
44000 return SDValue();
44001 }
44002
44003 // Check if the Mul source can be safely shrunk.
44004 ShrinkMode Mode;
44005 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
44006 Mode == ShrinkMode::MULU16)
44007 return SDValue();
44008
44009 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44010 ArrayRef<SDValue> Ops) {
44011 // Shrink by adding truncate nodes and let DAGCombine fold with the
44012 // sources.
44013 EVT InVT = Ops[0].getValueType();
44014 assert(InVT.getScalarType() == MVT::i32 &&
44015        "Unexpected scalar element type");
44016 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
44017 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
44018 InVT.getVectorNumElements() / 2);
44019 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
44020 InVT.getVectorNumElements());
44021 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
44022 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
44023 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
44024 };
44025 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
44026 { Mul.getOperand(0), Mul.getOperand(1) },
44027 PMADDBuilder);
44028}
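// Illustrative sketch (hypothetical example): for the pattern documented at
// the top of matchPMADDWD, the two BUILD_VECTORs of even/odd extracts from a
// shared sign-extended i16 multiply collapse into a single VPMADDWD:
//   (add (build_vector t[0], t[2], ...), (build_vector t[1], t[3], ...))
//     --> (X86ISD::VPMADDWD (trunc (sext x0)), (trunc (sext x1)))
// where t = (mul (sext x0), (sext x1)); the trunc/sext pairs are expected to
// fold away, leaving VPMADDWD directly on the vXi16 sources.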
44029
44030// Attempt to turn this pattern into PMADDWD.
44031// (mul (add (sext (build_vector)), (sext (build_vector))),
44032// (add (sext (build_vector)), (sext (build_vector)))
44033static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
44034 const SDLoc &DL, EVT VT,
44035 const X86Subtarget &Subtarget) {
44036 if (!Subtarget.hasSSE2())
44037 return SDValue();
44038
44039 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
44040 return SDValue();
44041
44042 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
44043 VT.getVectorNumElements() < 4 ||
44044 !isPowerOf2_32(VT.getVectorNumElements()))
44045 return SDValue();
44046
44047 SDValue N00 = N0.getOperand(0);
44048 SDValue N01 = N0.getOperand(1);
44049 SDValue N10 = N1.getOperand(0);
44050 SDValue N11 = N1.getOperand(1);
44051
44052 // All inputs need to be sign extends.
44053 // TODO: Support ZERO_EXTEND from known positive?
44054 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
44055 N01.getOpcode() != ISD::SIGN_EXTEND ||
44056 N10.getOpcode() != ISD::SIGN_EXTEND ||
44057 N11.getOpcode() != ISD::SIGN_EXTEND)
44058 return SDValue();
44059
44060 // Peek through the extends.
44061 N00 = N00.getOperand(0);
44062 N01 = N01.getOperand(0);
44063 N10 = N10.getOperand(0);
44064 N11 = N11.getOperand(0);
44065
44066 // Must be extending from vXi16.
44067 EVT InVT = N00.getValueType();
44068 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
44069 N10.getValueType() != InVT || N11.getValueType() != InVT)
44070 return SDValue();
44071
44072 // All inputs should be build_vectors.
44073 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
44074 N01.getOpcode() != ISD::BUILD_VECTOR ||
44075 N10.getOpcode() != ISD::BUILD_VECTOR ||
44076 N11.getOpcode() != ISD::BUILD_VECTOR)
44077 return SDValue();
44078
44079 // For each element, we need to ensure we have an odd element from one vector
44080 // multiplied by the odd element of another vector and the even element from
44081 // one of the same vectors being multiplied by the even element from the
44082 // other vector. So we need to make sure that, for each element i, this
44083 // operation is performed:
44084 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
44085 SDValue In0, In1;
44086 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
44087 SDValue N00Elt = N00.getOperand(i);
44088 SDValue N01Elt = N01.getOperand(i);
44089 SDValue N10Elt = N10.getOperand(i);
44090 SDValue N11Elt = N11.getOperand(i);
44091 // TODO: Be more tolerant to undefs.
44092 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
44093 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
44094 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
44095 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
44096 return SDValue();
44097 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
44098 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
44099 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
44100 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
44101 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
44102 return SDValue();
44103 unsigned IdxN00 = ConstN00Elt->getZExtValue();
44104 unsigned IdxN01 = ConstN01Elt->getZExtValue();
44105 unsigned IdxN10 = ConstN10Elt->getZExtValue();
44106 unsigned IdxN11 = ConstN11Elt->getZExtValue();
44107 // Add is commutative so indices can be reordered.
44108 if (IdxN00 > IdxN10) {
44109 std::swap(IdxN00, IdxN10);
44110 std::swap(IdxN01, IdxN11);
44111 }
44112 // N0 indices must be the even elements. N1 indices must be the next odd elements.
44113 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
44114 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
44115 return SDValue();
44116 SDValue N00In = N00Elt.getOperand(0);
44117 SDValue N01In = N01Elt.getOperand(0);
44118 SDValue N10In = N10Elt.getOperand(0);
44119 SDValue N11In = N11Elt.getOperand(0);
44120 // The first time we find an input, capture it.
44121 if (!In0) {
44122 In0 = N00In;
44123 In1 = N01In;
44124 }
44125 // Mul is commutative so the input vectors can be in any order.
44126 // Canonicalize to make the compares easier.
44127 if (In0 != N00In)
44128 std::swap(N00In, N01In);
44129 if (In0 != N10In)
44130 std::swap(N10In, N11In);
44131 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
44132 return SDValue();
44133 }
44134
44135 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44136 ArrayRef<SDValue> Ops) {
44137 // Shrink by adding truncate nodes and let DAGCombine fold with the
44138 // sources.
44139 EVT OpVT = Ops[0].getValueType();
44140 assert(OpVT.getScalarType() == MVT::i16 &&
44141        "Unexpected scalar element type");
44142 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
44143 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
44144 OpVT.getVectorNumElements() / 2);
44145 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
44146 };
44147 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
44148 PMADDBuilder);
44149}
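// Illustrative sketch (hypothetical example): matchPMADDWD_2 handles the same
// reduction when it is written per element rather than as one vector MUL,
// i.e. each i32 output lane computes
//   A[2*i] * B[2*i] + A[2*i+1] * B[2*i+1]
// from sign-extended vXi16 inputs; once the even/odd index structure checks
// out, the whole expression is emitted as X86ISD::VPMADDWD on In0 and In1.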
44150
44151static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
44152 TargetLowering::DAGCombinerInfo &DCI,
44153 const X86Subtarget &Subtarget) {
44154 const SDNodeFlags Flags = N->getFlags();
44155 if (Flags.hasVectorReduction()) {
44156 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
44157 return Sad;
44158 if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
44159 return MAdd;
44160 }
44161 EVT VT = N->getValueType(0);
44162 SDValue Op0 = N->getOperand(0);
44163 SDValue Op1 = N->getOperand(1);
44164
44165 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
44166 return MAdd;
44167 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
44168 return MAdd;
44169
44170 // Try to synthesize horizontal adds from adds of shuffles.
44171 if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
44172 VT == MVT::v8i32) &&
44173 Subtarget.hasSSSE3() &&
44174 isHorizontalBinOp(Op0, Op1, DAG, Subtarget, true)) {
44175 auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44176 ArrayRef<SDValue> Ops) {
44177 return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
44178 };
44179 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
44180 HADDBuilder);
44181 }
44182
44183 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
44184 // (sub Y, (sext (vXi1 X))).
44185 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
44186 // generic DAG combine without a legal type check, but adding this there
44187 // caused regressions.
44188 if (VT.isVector()) {
44189 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44190 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
44191 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
44192 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
44193 SDLoc DL(N);
44194 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
44195 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
44196 }
44197
44198 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
44199 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
44200 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
44201 SDLoc DL(N);
44202 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
44203 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
44204 }
44205 }
44206
44207 return combineAddOrSubToADCOrSBB(N, DAG);
44208}
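// Illustrative sketch (hypothetical example): the vXi1 rewrite in combineAdd
// relies on a sign-extended boolean being 0 or -1, so adding a zero-extended
// mask equals subtracting the sign-extended mask, e.g. for a legal v16i1 K:
//   (add (zero_extend K to v16i32), Y) --> (sub Y, (sign_extend K to v16i32))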
44209
44210static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
44211 const X86Subtarget &Subtarget) {
44212 SDValue Op0 = N->getOperand(0);
44213 SDValue Op1 = N->getOperand(1);
44214 EVT VT = N->getValueType(0);
44215
44216 if (!VT.isVector())
44217 return SDValue();
44218
44219 // PSUBUS is supported, starting from SSE2, but truncation for v8i32
44220 // is only worth it with SSSE3 (PSHUFB).
44221 EVT EltVT = VT.getVectorElementType();
44222 if (!(Subtarget.hasSSE2() && (EltVT == MVT::i8 || EltVT == MVT::i16)) &&
44223 !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
44224 !(Subtarget.useBWIRegs() && (VT == MVT::v16i32)))
44225 return SDValue();
44226
44227 SDValue SubusLHS, SubusRHS;
44228 // Try to find umax(a,b) - b or a - umin(a,b) patterns
44229 // as they may be converted to subus(a,b).
44230 // TODO: Need to add IR canonicalization for this code.
44231 if (Op0.getOpcode() == ISD::UMAX) {
44232 SubusRHS = Op1;
44233 SDValue MaxLHS = Op0.getOperand(0);
44234 SDValue MaxRHS = Op0.getOperand(1);
44235 if (MaxLHS == Op1)
44236 SubusLHS = MaxRHS;
44237 else if (MaxRHS == Op1)
44238 SubusLHS = MaxLHS;
44239 else
44240 return SDValue();
44241 } else if (Op1.getOpcode() == ISD::UMIN) {
44242 SubusLHS = Op0;
44243 SDValue MinLHS = Op1.getOperand(0);
44244 SDValue MinRHS = Op1.getOperand(1);
44245 if (MinLHS == Op0)
44246 SubusRHS = MinRHS;
44247 else if (MinRHS == Op0)
44248 SubusRHS = MinLHS;
44249 else
44250 return SDValue();
44251 } else
44252 return SDValue();
44253
44254 // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
44255 // special preprocessing in some cases.
44256 if (EltVT == MVT::i8 || EltVT == MVT::i16)
44257 return DAG.getNode(ISD::USUBSAT, SDLoc(N), VT, SubusLHS, SubusRHS);
44258
44259 assert((VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64) &&
44260        "Unexpected VT!");
44261
44262 // The special preprocessing can only be applied
44263 // if the value was zero-extended from 16 bits,
44264 // so we require the leading 16 bits to be zero for 32-bit
44265 // values, or the leading 48 bits for 64-bit values.
44266 KnownBits Known = DAG.computeKnownBits(SubusLHS);
44267 unsigned NumZeros = Known.countMinLeadingZeros();
44268 if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
44269 return SDValue();
44270
44271 EVT ExtType = SubusLHS.getValueType();
44272 EVT ShrinkedType;
44273 if (VT == MVT::v8i32 || VT == MVT::v8i64)
44274 ShrinkedType = MVT::v8i16;
44275 else
44276 ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
44277
44278 // If SubusLHS is zero-extended, truncate SubusRHS to its size:
44279 // SubusRHS = umin(0xFFF.., SubusRHS).
44280 SDValue SaturationConst =
44281 DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
44282 ShrinkedType.getScalarSizeInBits()),
44283 SDLoc(SubusLHS), ExtType);
44284 SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
44285 SaturationConst);
44286 SDValue NewSubusLHS =
44287 DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
44288 SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
44289 SDValue Psubus = DAG.getNode(ISD::USUBSAT, SDLoc(N), ShrinkedType,
44290 NewSubusLHS, NewSubusRHS);
44291
44292 // Zero-extend the result; it may be used somewhere as 32 bit.
44293 // If not, the zext and the following trunc will be shrunk away.
44294 return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
44295}
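// Illustrative sketch (hypothetical C source): combineSubToSubus recognizes
// saturating-subtract idioms on unsigned lanes such as
//   r[i] = a[i] > b[i] ? a[i] - b[i] : 0;   // max(a,b) - b, or a - min(a,b)
// and emits ISD::USUBSAT (PSUBUSB/PSUBUSW). For the wider i32/i64 element
// types it first proves via known bits that the LHS fits in the narrow type,
// clamps the RHS with UMIN, truncates both sides, subtracts with saturation,
// and zero-extends the result back, as done above.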
44296
44297static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
44298 TargetLowering::DAGCombinerInfo &DCI,
44299 const X86Subtarget &Subtarget) {
44300 SDValue Op0 = N->getOperand(0);
44301 SDValue Op1 = N->getOperand(1);
44302
44303 // X86 can't encode an immediate LHS of a sub. See if we can push the
44304 // negation into a preceding instruction.
44305 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
44306 // If the RHS of the sub is a XOR with one use and a constant, invert the
44307 // immediate. Then add one to the LHS of the sub so we can turn
44308 // X-Y -> X+~Y+1, saving one register.
44309 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
44310 isa<ConstantSDNode>(Op1.getOperand(1))) {
44311 const APInt &XorC = Op1.getConstantOperandAPInt(1);
44312 EVT VT = Op0.getValueType();
44313 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
44314 Op1.getOperand(0),
44315 DAG.getConstant(~XorC, SDLoc(Op1), VT));
44316 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
44317 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
44318 }
44319 }
44320
44321 // Try to synthesize horizontal subs from subs of shuffles.
44322 EVT VT = N->getValueType(0);
44323 if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
44324 VT == MVT::v8i32) &&
44325 Subtarget.hasSSSE3() &&
44326 isHorizontalBinOp(Op0, Op1, DAG, Subtarget, false)) {
44327 auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44328 ArrayRef<SDValue> Ops) {
44329 return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
44330 };
44331 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
44332 HSUBBuilder);
44333 }
44334
44335 // Try to create PSUBUS if SUB's argument is max/min
44336 if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
44337 return V;
44338
44339 return combineAddOrSubToADCOrSBB(N, DAG);
44340}
44341
44342static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
44343 const X86Subtarget &Subtarget) {
44344 MVT VT = N->getSimpleValueType(0);
44345 SDLoc DL(N);
44346
44347 if (N->getOperand(0) == N->getOperand(1)) {
44348 if (N->getOpcode() == X86ISD::PCMPEQ)
44349 return DAG.getConstant(-1, DL, VT);
44350 if (N->getOpcode() == X86ISD::PCMPGT)
44351 return DAG.getConstant(0, DL, VT);
44352 }
44353
44354 return SDValue();
44355}
44356
44357/// Helper that combines an array of subvector ops as if they were the operands
44358/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
44359/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
44360static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
44361 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
44362 TargetLowering::DAGCombinerInfo &DCI,
44363 const X86Subtarget &Subtarget) {
44364 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
44365
44366 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
44367 return DAG.getUNDEF(VT);
44368
44369 if (llvm::all_of(Ops, [](SDValue Op) {
44370 return ISD::isBuildVectorAllZeros(Op.getNode());
44371 }))
44372 return getZeroVector(VT, Subtarget, DAG, DL);
44373
44374 SDValue Op0 = Ops[0];
44375
44376 // Fold subvector loads into one.
44377 // If needed, look through bitcasts to get to the load.
44378 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
44379 bool Fast;
44380 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
44381 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
44382 *FirstLd->getMemOperand(), &Fast) &&
44383 Fast) {
44384 if (SDValue Ld =
44385 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
44386 return Ld;
44387 }
44388 }
44389
44390 // Repeated subvectors.
44391 if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op == Op0; })) {
44392 // If this broadcast/subv_broadcast is inserted into both halves, use a
44393 // larger broadcast/subv_broadcast.
44394 if (Op0.getOpcode() == X86ISD::VBROADCAST ||
44395 Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
44396 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
44397
44398 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
44399 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
44400 (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
44401 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
44402 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
44403 Op0.getOperand(0),
44404 DAG.getIntPtrConstant(0, DL)));
44405
44406 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
44407 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
44408 (Subtarget.hasAVX2() ||
44409 (VT.getScalarSizeInBits() >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
44410 Op0.getOperand(0).getValueType() == VT.getScalarType())
44411 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
44412 }
44413
44414 bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
44415
44416 // Repeated opcode.
44417 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
44418 // but it currently struggles with different vector widths.
44419 if (llvm::all_of(Ops, [Op0](SDValue Op) {
44420 return Op.getOpcode() == Op0.getOpcode();
44421 })) {
44422 unsigned NumOps = Ops.size();
44423 switch (Op0.getOpcode()) {
44424 case X86ISD::PSHUFHW:
44425 case X86ISD::PSHUFLW:
44426 case X86ISD::PSHUFD:
44427 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
44428 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
44429 SmallVector<SDValue, 2> Src;
44430 for (unsigned i = 0; i != NumOps; ++i)
44431 Src.push_back(Ops[i].getOperand(0));
44432 return DAG.getNode(Op0.getOpcode(), DL, VT,
44433 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
44434 Op0.getOperand(1));
44435 }
44436 LLVM_FALLTHROUGH;
44437 case X86ISD::VPERMILPI:
44438 // TODO - add support for vXf64/vXi64 shuffles.
44439 if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
44440 Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
44441 SmallVector<SDValue, 2> Src;
44442 for (unsigned i = 0; i != NumOps; ++i)
44443 Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0)));
44444 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src);
44445 Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
44446 Op0.getOperand(1));
44447 return DAG.getBitcast(VT, Res);
44448 }
44449 break;
44450 case X86ISD::PACKUS:
44451 if (NumOps == 2 && VT.is256BitVector() && Subtarget.hasInt256()) {
44452 SmallVector<SDValue, 2> LHS, RHS;
44453 for (unsigned i = 0; i != NumOps; ++i) {
44454 LHS.push_back(Ops[i].getOperand(0));
44455 RHS.push_back(Ops[i].getOperand(1));
44456 }
44457 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
44458 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
44459 NumOps * SrcVT.getVectorNumElements());
44460 return DAG.getNode(Op0.getOpcode(), DL, VT,
44461 DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
44462 DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
44463 }
44464 break;
44465 }
44466 }
44467
44468 return SDValue();
44469}
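// Illustrative sketch (hypothetical example): the repeated-opcode handling in
// combineConcatVectorOps concatenates the shuffle sources first and applies a
// single wider shuffle when both halves use the same immediate M, e.g.:
//   (concat_vectors (pshufd X, M), (pshufd Y, M))
//     --> (pshufd (concat_vectors X, Y), M)
// subject to the AVX2/Int256 checks above for 256-bit integer shuffles.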
44470
44471static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
44472 TargetLowering::DAGCombinerInfo &DCI,
44473 const X86Subtarget &Subtarget) {
44474 EVT VT = N->getValueType(0);
44475 EVT SrcVT = N->getOperand(0).getValueType();
44476 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44477
44478 // Don't do anything for i1 vectors.
44479 if (VT.getVectorElementType() == MVT::i1)
44480 return SDValue();
44481
44482 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
44483 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
44484 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
44485 DCI, Subtarget))
44486 return R;
44487 }
44488
44489 return SDValue();
44490}
44491
44492static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
44493 TargetLowering::DAGCombinerInfo &DCI,
44494 const X86Subtarget &Subtarget) {
44495 if (DCI.isBeforeLegalizeOps())
44496 return SDValue();
44497
44498 MVT OpVT = N->getSimpleValueType(0);
44499
44500 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
44501
44502 SDLoc dl(N);
44503 SDValue Vec = N->getOperand(0);
44504 SDValue SubVec = N->getOperand(1);
44505
44506 uint64_t IdxVal = N->getConstantOperandVal(2);
44507 MVT SubVecVT = SubVec.getSimpleValueType();
44508
44509 if (Vec.isUndef() && SubVec.isUndef())
44510 return DAG.getUNDEF(OpVT);
44511
44512 // Inserting undefs/zeros into zeros/undefs is a zero vector.
44513 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
44514 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
44515 return getZeroVector(OpVT, Subtarget, DAG, dl);
44516
44517 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
44518 // If we're inserting into a zero vector and then into a larger zero vector,
44519 // just insert into the larger zero vector directly.
44520 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
44521 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
44522 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
44523 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
44524 getZeroVector(OpVT, Subtarget, DAG, dl),
44525 SubVec.getOperand(1),
44526 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
44527 }
44528
44529 // If we're inserting into a zero vector and our input was extracted from an
44530 // insert into a zero vector of the same type and the extraction was at
44531 // least as large as the original insertion, just insert the original
44532 // subvector into a zero vector.
44533 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
44534 isNullConstant(SubVec.getOperand(1)) &&
44535 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
44536 SDValue Ins = SubVec.getOperand(0);
44537 if (isNullConstant(Ins.getOperand(2)) &&
44538 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
44539 Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
44540 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
44541 getZeroVector(OpVT, Subtarget, DAG, dl),
44542 Ins.getOperand(1), N->getOperand(2));
44543 }
44544 }
44545
44546 // Stop here if this is an i1 vector.
44547 if (IsI1Vector)
44548 return SDValue();
44549
44550 // If this is an insert of an extract, combine to a shuffle. Don't do this
44551 // if the insert or extract can be represented with a subregister operation.
44552 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
44553 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
44554 (IdxVal != 0 || !Vec.isUndef())) {
44555 int ExtIdxVal = SubVec.getConstantOperandVal(1);
44556 if (ExtIdxVal != 0) {
44557 int VecNumElts = OpVT.getVectorNumElements();
44558 int SubVecNumElts = SubVecVT.getVectorNumElements();
44559 SmallVector<int, 64> Mask(VecNumElts);
44560 // First create an identity shuffle mask.
44561 for (int i = 0; i != VecNumElts; ++i)
44562 Mask[i] = i;
44563 // Now insert the extracted portion.
44564 for (int i = 0; i != SubVecNumElts; ++i)
44565 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
44566
44567 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
44568 }
44569 }
44570
44571 // Match concat_vector style patterns.
44572 SmallVector<SDValue, 2> SubVectorOps;
44573 if (collectConcatOps(N, SubVectorOps)) {
44574 if (SDValue Fold =
44575 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
44576 return Fold;
44577
44578 // If we're inserting all zeros into the upper half, change this to
44579 // a concat with zero. We will match this to a move
44580 // with implicit upper bit zeroing during isel.
44581 // We do this here because we don't want combineConcatVectorOps to
44582 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
44583 if (SubVectorOps.size() == 2 &&
44584 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
44585 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
44586 getZeroVector(OpVT, Subtarget, DAG, dl),
44587 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
44588 }
44589
44590 // If this is a broadcast insert into an upper undef, use a larger broadcast.
44591 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
44592 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
44593
44594 // If this is a broadcast load inserted into an upper undef, use a larger
44595 // broadcast load.
44596 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
44597 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
44598 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
44599 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
44600 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
44601 SDValue BcastLd =
44602 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
44603 MemIntr->getMemoryVT(),
44604 MemIntr->getMemOperand());
44605 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
44606 return BcastLd;
44607 }
44608
44609 return SDValue();
44610}
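// Illustrative sketch (hypothetical example): one of the folds above widens a
// broadcast that is inserted into the undef upper half of a vector, e.g.:
//   (insert_subvector undef:v8f32, (X86ISD::VBROADCAST X):v4f32, 4)
//     --> (X86ISD::VBROADCAST X):v8f32
// and the VBROADCAST_LOAD variant rebuilds the memory node at the wider type
// while preserving its chain via ReplaceAllUsesOfValueWith.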
44611
44612/// If we are extracting a subvector of a vector select and the select condition
44613/// is composed of concatenated vectors, try to narrow the select width. This
44614/// is a common pattern for AVX1 integer code because 256-bit selects may be
44615/// legal, but there is almost no integer math/logic available for 256-bit.
44616/// This function should only be called with legal types (otherwise, the calls
44617/// to get simple value types will assert).
44618static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
44619 SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
44620 SmallVector<SDValue, 4> CatOps;
44621 if (Sel.getOpcode() != ISD::VSELECT ||
44622 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
44623 return SDValue();
44624
44625 // Note: We assume simple value types because this should only be called with
44626 // legal operations/types.
44627 // TODO: This can be extended to handle extraction to 256-bits.
44628 MVT VT = Ext->getSimpleValueType(0);
44629 if (!VT.is128BitVector())
44630 return SDValue();
44631
44632 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
44633 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
44634 return SDValue();
44635
44636 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
44637 MVT SelVT = Sel.getSimpleValueType();
44638 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
44639        "Unexpected vector type with legal operations");
44640
44641 unsigned SelElts = SelVT.getVectorNumElements();
44642 unsigned CastedElts = WideVT.getVectorNumElements();
44643 unsigned ExtIdx = cast<ConstantSDNode>(Ext->getOperand(1))->getZExtValue();
44644 if (SelElts % CastedElts == 0) {
44645 // The select has the same or more (narrower) elements than the extract
44646 // operand. The extraction index gets scaled by that factor.
44647 ExtIdx *= (SelElts / CastedElts);
44648 } else if (CastedElts % SelElts == 0) {
44649 // The select has less (wider) elements than the extract operand. Make sure
44650 // that the extraction index can be divided evenly.
44651 unsigned IndexDivisor = CastedElts / SelElts;
44652 if (ExtIdx % IndexDivisor != 0)
44653 return SDValue();
44654 ExtIdx /= IndexDivisor;
44655 } else {
44656 llvm_unreachable("Element count of simple vector types are not divisible?");
44657 }
44658
44659 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
44660 unsigned NarrowElts = SelElts / NarrowingFactor;
44661 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
44662 SDLoc DL(Ext);
44663 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
44664 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
44665 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
44666 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
44667 return DAG.getBitcast(VT, NarrowSel);
44668}
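// Illustrative sketch (hypothetical example): for AVX1, a 256-bit integer
// vselect whose condition is a concatenation can be narrowed when only a
// 128-bit subvector of its result is extracted, e.g.:
//   (extract_subvector (vselect (concat C0, C1), T, F), 0)
//     --> (vselect C0, (extract_subvector T, 0), (extract_subvector F, 0))
// so the select happens in a 128-bit type where integer ops are available.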
44669
44670static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
44671 TargetLowering::DAGCombinerInfo &DCI,
44672 const X86Subtarget &Subtarget) {
44673 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
44674 // eventually get combined/lowered into ANDNP) with a concatenated operand,
44675 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
44676 // We let generic combining take over from there to simplify the
44677 // insert/extract and 'not'.
44678 // This pattern emerges during AVX1 legalization. We handle it before lowering
44679 // to avoid complications like splitting constant vector loads.
44680
44681 // Capture the original wide type in the likely case that we need to bitcast
44682 // back to this type.
44683 if (!N->getValueType(0).isSimple())
44684 return SDValue();
44685
44686 MVT VT = N->getSimpleValueType(0);
44687 SDValue InVec = N->getOperand(0);
44688 SDValue InVecBC = peekThroughBitcasts(InVec);
44689 EVT InVecVT = InVec.getValueType();
44690 EVT InVecBCVT = InVecBC.getValueType();
44691 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44692
44693 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
44694 TLI.isTypeLegal(InVecVT) &&
44695 InVecVT.getSizeInBits() == 256 && InVecBC.getOpcode() == ISD::AND) {
44696 auto isConcatenatedNot = [] (SDValue V) {
44697 V = peekThroughBitcasts(V);
44698 if (!isBitwiseNot(V))
44699 return false;
44700 SDValue NotOp = V->getOperand(0);
44701 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
44702 };
44703 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
44704 isConcatenatedNot(InVecBC.getOperand(1))) {
44705 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
44706 SDValue Concat = split256IntArith(InVecBC, DAG);
44707 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
44708 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
44709 }
44710 }
44711
44712 if (DCI.isBeforeLegalizeOps())
44713 return SDValue();
44714
44715 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
44716 return V;
44717
44718 unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
44719
44720 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
44721 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
44722
44723 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
44724 if (VT.getScalarType() == MVT::i1)
44725 return DAG.getConstant(1, SDLoc(N), VT);
44726 return getOnesVector(VT, DAG, SDLoc(N));
44727 }
44728
44729 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
44730 return DAG.getBuildVector(
44731 VT, SDLoc(N),
44732 InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
44733
44734 // Try to move vector bitcast after extract_subv by scaling extraction index:
44735 // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
44736 // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR
44737 if (InVec != InVecBC && InVecBCVT.isVector()) {
44738 unsigned SrcNumElts = InVecBCVT.getVectorNumElements();
44739 unsigned DestNumElts = InVecVT.getVectorNumElements();
44740 if ((DestNumElts % SrcNumElts) == 0) {
44741 unsigned DestSrcRatio = DestNumElts / SrcNumElts;
44742 if ((VT.getVectorNumElements() % DestSrcRatio) == 0) {
44743 unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio;
44744 EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
44745 InVecBCVT.getScalarType(), NewExtNumElts);
44746 if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
44747 TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
44748 unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
44749 SDLoc DL(N);
44750 SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
44751 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
44752 InVecBC, NewIndex);
44753 return DAG.getBitcast(VT, NewExtract);
44754 }
44755 }
44756 }
44757 }
44758
44759 // If we are extracting from an insert into a zero vector, replace with a
44760 // smaller insert into zero if the extract covers at least the original
44761 // inserted subvector. Don't do this for i1 vectors.
44762 if (VT.getVectorElementType() != MVT::i1 &&
44763 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
44764 InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
44765 ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
44766 InVec.getOperand(1).getValueSizeInBits() <= VT.getSizeInBits()) {
44767 SDLoc DL(N);
44768 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
44769 getZeroVector(VT, Subtarget, DAG, DL),
44770 InVec.getOperand(1), InVec.getOperand(2));
44771 }
44772
44773 // If we're extracting from a broadcast then we're better off just
44774 // broadcasting to the smaller type directly, assuming this is the only use.
44775 // As it's a broadcast, we don't care about the extraction index.
44776 if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() &&
44777 InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits())
44778 return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));
44779
44780 if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) {
44781 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
44782 if (MemIntr->getMemoryVT().getSizeInBits() <= VT.getSizeInBits()) {
44783 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
44784 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
44785 SDValue BcastLd =
44786 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
44787 MemIntr->getMemoryVT(),
44788 MemIntr->getMemOperand());
44789 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
44790 return BcastLd;
44791 }
44792 }
44793
44794 // If we're extracting the lowest subvector and we're the only user,
44795 // we may be able to perform this with a smaller vector width.
44796 if (IdxVal == 0 && InVec.hasOneUse()) {
44797 unsigned InOpcode = InVec.getOpcode();
44798 if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
44799 // v2f64 CVTDQ2PD(v4i32).
44800 if (InOpcode == ISD::SINT_TO_FP &&
44801 InVec.getOperand(0).getValueType() == MVT::v4i32) {
44802 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
44803 }
44804 // v2f64 CVTUDQ2PD(v4i32).
44805 if (InOpcode == ISD::UINT_TO_FP &&
44806 InVec.getOperand(0).getValueType() == MVT::v4i32) {
44807 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
44808 }
44809 // v2f64 CVTPS2PD(v4f32).
44810 if (InOpcode == ISD::FP_EXTEND &&
44811 InVec.getOperand(0).getValueType() == MVT::v4f32) {
44812 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
44813 }
44814 }
44815 if ((InOpcode == ISD::ANY_EXTEND ||
44816 InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
44817 InOpcode == ISD::ZERO_EXTEND ||
44818 InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
44819 InOpcode == ISD::SIGN_EXTEND ||
44820 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
44821 VT.is128BitVector() &&
44822 InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
44823 unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
44824 return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0));
44825 }
44826 if (InOpcode == ISD::VSELECT &&
44827 InVec.getOperand(0).getValueType().is256BitVector() &&
44828 InVec.getOperand(1).getValueType().is256BitVector() &&
44829 InVec.getOperand(2).getValueType().is256BitVector()) {
44830 SDLoc DL(N);
44831 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
44832 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
44833 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
44834 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
44835 }
44836 }
44837
44838 return SDValue();
44839}
44840
44841static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
44842 EVT VT = N->getValueType(0);
44843 SDValue Src = N->getOperand(0);
44844 SDLoc DL(N);
44845
44846 // If this is a scalar_to_vector to v1i1 from an AND with 1, bypass the AND.
44847 // This occurs frequently in our masked scalar intrinsic code and our
44848 // floating point select lowering with AVX512.
44849 // TODO: SimplifyDemandedBits instead?
44850 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
44851 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
44852 if (C->getAPIntValue().isOneValue())
44853 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
44854 Src.getOperand(0));
44855
44856 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
44857 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44858 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
44859 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
44860 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
44861 if (C->isNullValue())
44862 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
44863 Src.getOperand(1));
44864
44865 // Reduce v2i64 to v4i32 if we don't need the upper bits.
44866 // TODO: Move to DAGCombine?
44867 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND &&
44868 Src.getValueType() == MVT::i64 && Src.hasOneUse() &&
44869 Src.getOperand(0).getScalarValueSizeInBits() <= 32)
44870 return DAG.getBitcast(
44871 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
44872 DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32)));
44873
44874 return SDValue();
44875}
44876
44877// Simplify PMULDQ and PMULUDQ operations.
44878static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
44879 TargetLowering::DAGCombinerInfo &DCI,
44880 const X86Subtarget &Subtarget) {
44881 SDValue LHS = N->getOperand(0);
44882 SDValue RHS = N->getOperand(1);
44883
44884 // Canonicalize constant to RHS.
44885 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
44886 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
44887 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
44888
44889 // Multiply by zero.
44890 // Don't return RHS as it may contain UNDEFs.
44891 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
44892 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
44893
44894 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
44895 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44896 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
44897 return SDValue(N, 0);
44898
44899 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
44900 // convert it to any_extend_invec, due to the LegalOperations check, do the
44901 // conversion directly to a vector shuffle manually. This exposes combine
44902 // opportunities missed by combineExtInVec not calling
44903 // combineX86ShufflesRecursively on SSE4.1 targets.
44904 // FIXME: This is basically a hack around several other issues related to
44905 // ANY_EXTEND_VECTOR_INREG.
44906 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
44907 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
44908 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
44909 LHS.getOperand(0).getValueType() == MVT::v4i32) {
44910 SDLoc dl(N);
44911 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
44912 LHS.getOperand(0), { 0, -1, 1, -1 });
44913 LHS = DAG.getBitcast(MVT::v2i64, LHS);
44914 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
44915 }
44916 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
44917 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
44918 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
44919 RHS.getOperand(0).getValueType() == MVT::v4i32) {
44920 SDLoc dl(N);
44921 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
44922 RHS.getOperand(0), { 0, -1, 1, -1 });
44923 RHS = DAG.getBitcast(MVT::v2i64, RHS);
44924 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
44925 }
44926
44927 return SDValue();
44928}
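// [Editorial sketch, not part of X86ISelLowering.cpp] A minimal model of the
// { 0, -1, 1, -1 } shuffle used above, assuming a plain-array view of a v4i32
// value (-1 marks an undef lane, shown as 0 here purely for illustration; the
// helper name is made up). After the bitcast to v2i64, each 64-bit lane
// carries the wanted 32-bit source element in its low half, which is the only
// part PMULDQ/PMULUDQ read.
static void modelPMulDQShuffle(const int Src[4], int Dst[4]) {
  Dst[0] = Src[0]; // lane 0 <- element 0
  Dst[1] = 0;      // undef lane
  Dst[2] = Src[1]; // lane 2 <- element 1
  Dst[3] = 0;      // undef lane
}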
44929
44930static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
44931 TargetLowering::DAGCombinerInfo &DCI,
44932 const X86Subtarget &Subtarget) {
44933 EVT VT = N->getValueType(0);
44934 SDValue In = N->getOperand(0);
44935 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44936
44937 // Try to merge vector loads and extend_inreg to an extload.
44938 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
44939 In.hasOneUse()) {
44940 auto *Ld = cast<LoadSDNode>(In);
44941 if (Ld->isSimple()) {
44942 MVT SVT = In.getSimpleValueType().getVectorElementType();
44943 ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
44944 EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT,
44945 VT.getVectorNumElements());
44946 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
44947 SDValue Load =
44948 DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
44949 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
44950 Ld->getMemOperand()->getFlags());
44951 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
44952 return Load;
44953 }
44954 }
44955 }
44956
44957 // Attempt to combine as a shuffle.
44958 // TODO: SSE41 support
44959 if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) {
44960 SDValue Op(N, 0);
44961 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
44962 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44963 return Res;
44964 }
44965
44966 return SDValue();
44967}
44968
44969static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
44970 TargetLowering::DAGCombinerInfo &DCI) {
44971 EVT VT = N->getValueType(0);
44972
44973 APInt KnownUndef, KnownZero;
44974 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44975 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
44976 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
44977 KnownZero, DCI))
44978 return SDValue(N, 0);
44979
44980 return SDValue();
44981}
44982
44983SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
44984 DAGCombinerInfo &DCI) const {
44985 SelectionDAG &DAG = DCI.DAG;
44986 switch (N->getOpcode()) {
44987 default: break;
44988 case ISD::SCALAR_TO_VECTOR:
44989 return combineScalarToVector(N, DAG);
44990 case ISD::EXTRACT_VECTOR_ELT:
44991 case X86ISD::PEXTRW:
44992 case X86ISD::PEXTRB:
44993 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
44994 case ISD::CONCAT_VECTORS:
44995 return combineConcatVectors(N, DAG, DCI, Subtarget);
44996 case ISD::INSERT_SUBVECTOR:
44997 return combineInsertSubvector(N, DAG, DCI, Subtarget);
44998 case ISD::EXTRACT_SUBVECTOR:
44999 return combineExtractSubvector(N, DAG, DCI, Subtarget);
45000 case ISD::VSELECT:
45001 case ISD::SELECT:
45002 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
45003 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
45004 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
45005 case X86ISD::CMP: return combineCMP(N, DAG);
45006 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
45007 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
45008 case X86ISD::ADD:
45009 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
45010 case X86ISD::SBB: return combineSBB(N, DAG);
45011 case X86ISD::ADC: return combineADC(N, DAG, DCI);
45012 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
45013 case ISD::SHL: return combineShiftLeft(N, DAG);
45014 case ISD::SRA: return combineShiftRightArithmetic(N, DAG);
45015 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI);
45016 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
45017 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
45018 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
45019 case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
45020 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
45021 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
45022 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
45023 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
45024 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, DCI, Subtarget);
45025 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
45026 case ISD::FADD:
45027 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
45028 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
45029 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
45030 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG);
45031 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
45032 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
45033 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
45034 case X86ISD::FXOR:
45035 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
45036 case X86ISD::FMIN:
45037 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
45038 case ISD::FMINNUM:
45039 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
45040 case X86ISD::CVTSI2P:
45041 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
45042 case X86ISD::CVTP2SI:
45043 case X86ISD::CVTP2UI:
45044 case X86ISD::CVTTP2SI:
45045 case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI);
45046 case X86ISD::BT: return combineBT(N, DAG, DCI);
45047 case ISD::ANY_EXTEND:
45048 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
45049 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
45050 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
45051 case ISD::ANY_EXTEND_VECTOR_INREG:
45052 case ISD::SIGN_EXTEND_VECTOR_INREG:
45053 case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI,
45054 Subtarget);
45055 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
45056 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
45057 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
45058 case X86ISD::PACKSS:
45059 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
45060 case X86ISD::VSHL:
45061 case X86ISD::VSRA:
45062 case X86ISD::VSRL:
45063 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
45064 case X86ISD::VSHLI:
45065 case X86ISD::VSRAI:
45066 case X86ISD::VSRLI:
45067 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
45068 case X86ISD::PINSRB:
45069 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
45070 case X86ISD::SHUFP: // Handle all target specific shuffles
45071 case X86ISD::INSERTPS:
45072 case X86ISD::EXTRQI:
45073 case X86ISD::INSERTQI:
45074 case X86ISD::PALIGNR:
45075 case X86ISD::VSHLDQ:
45076 case X86ISD::VSRLDQ:
45077 case X86ISD::BLENDI:
45078 case X86ISD::UNPCKH:
45079 case X86ISD::UNPCKL:
45080 case X86ISD::MOVHLPS:
45081 case X86ISD::MOVLHPS:
45082 case X86ISD::PSHUFB:
45083 case X86ISD::PSHUFD:
45084 case X86ISD::PSHUFHW:
45085 case X86ISD::PSHUFLW:
45086 case X86ISD::MOVSHDUP:
45087 case X86ISD::MOVSLDUP:
45088 case X86ISD::MOVDDUP:
45089 case X86ISD::MOVSS:
45090 case X86ISD::MOVSD:
45091 case X86ISD::VBROADCAST:
45092 case X86ISD::VPPERM:
45093 case X86ISD::VPERMI:
45094 case X86ISD::VPERMV:
45095 case X86ISD::VPERMV3:
45096 case X86ISD::VPERMIL2:
45097 case X86ISD::VPERMILPI:
45098 case X86ISD::VPERMILPV:
45099 case X86ISD::VPERM2X128:
45100 case X86ISD::SHUF128:
45101 case X86ISD::VZEXT_MOVL:
45102 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
45103 case X86ISD::FMADD_RND:
45104 case X86ISD::FMSUB:
45105 case X86ISD::FMSUB_RND:
45106 case X86ISD::FNMADD:
45107 case X86ISD::FNMADD_RND:
45108 case X86ISD::FNMSUB:
45109 case X86ISD::FNMSUB_RND:
45110 case ISD::FMA: return combineFMA(N, DAG, DCI, Subtarget);
45111 case X86ISD::FMADDSUB_RND:
45112 case X86ISD::FMSUBADD_RND:
45113 case X86ISD::FMADDSUB:
45114 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
45115 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
45116 case X86ISD::MGATHER:
45117 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
45118 case ISD::MGATHER:
45119 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
45120 case X86ISD::PCMPEQ:
45121 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
45122 case X86ISD::PMULDQ:
45123 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
45124 case X86ISD::KSHIFTL:
45125 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
45126 }
45127
45128 return SDValue();
45129}
45130
45131bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
45132 if (!isTypeLegal(VT))
45133 return false;
45134
45135 // There are no vXi8 shifts.
45136 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
45137 return false;
45138
45139 // TODO: Almost no 8-bit ops are desirable because they have no actual
45140 // size/speed advantages vs. 32-bit ops, but they do have a major
45141 // potential disadvantage by causing partial register stalls.
45142 //
45143 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
45144 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
45145 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
45146 // check for a constant operand to the multiply.
45147 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
45148 return false;
45149
45150 // i16 instruction encodings are longer and some i16 instructions are slow,
45151 // so those are not desirable.
45152 if (VT == MVT::i16) {
45153 switch (Opc) {
45154 default:
45155 break;
45156 case ISD::LOAD:
45157 case ISD::SIGN_EXTEND:
45158 case ISD::ZERO_EXTEND:
45159 case ISD::ANY_EXTEND:
45160 case ISD::SHL:
45161 case ISD::SRA:
45162 case ISD::SRL:
45163 case ISD::SUB:
45164 case ISD::ADD:
45165 case ISD::MUL:
45166 case ISD::AND:
45167 case ISD::OR:
45168 case ISD::XOR:
45169 return false;
45170 }
45171 }
45172
45173 // Any legal type not explicitly accounted for above here is desirable.
45174 return true;
45175}
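// [Editorial sketch, not part of X86ISelLowering.cpp] A source-level
// illustration of the i16 policy above (an assumption about typical codegen,
// not a guarantee; the helper name is made up): an add like this is normally
// promoted to a 32-bit add and truncated, since 16-bit encodings need an
// operand-size prefix and can cause partial-register stalls.
static unsigned short addU16(unsigned short A, unsigned short B) {
  return static_cast<unsigned short>(A + B); // usually selected as a 32-bit add
}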
45176
45177SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
45178 SDValue Value, SDValue Addr,
45179 SelectionDAG &DAG) const {
45180 const Module *M = DAG.getMachineFunction().getMMI().getModule();
45181 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
45182 if (IsCFProtectionSupported) {
45183 // When control-flow branch protection is enabled, we need to add a
45184 // notrack prefix to the indirect branch.
45185 // To do that we create an NT_BRIND SDNode.
45186 // Upon ISel, the pattern will convert it to a jmp with the NoTrack prefix.
45187 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
45188 }
45189
45190 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
45191}
45192
45193bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
45194 EVT VT = Op.getValueType();
45195 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
45196 isa<ConstantSDNode>(Op.getOperand(1));
45197
45198 // i16 is legal, but undesirable since i16 instruction encodings are longer
45199 // and some i16 instructions are slow.
45200 // 8-bit multiply-by-constant can usually be expanded to something cheaper
45201 // using LEA and/or other ALU ops.
45202 if (VT != MVT::i16 && !Is8BitMulByConstant)
45203 return false;
45204
45205 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
45206 if (!Op.hasOneUse())
45207 return false;
45208 SDNode *User = *Op->use_begin();
45209 if (!ISD::isNormalStore(User))
45210 return false;
45211 auto *Ld = cast<LoadSDNode>(Load);
45212 auto *St = cast<StoreSDNode>(User);
45213 return Ld->getBasePtr() == St->getBasePtr();
45214 };
45215
45216 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
45217 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
45218 return false;
45219 if (!Op.hasOneUse())
45220 return false;
45221 SDNode *User = *Op->use_begin();
45222 if (User->getOpcode() != ISD::ATOMIC_STORE)
45223 return false;
45224 auto *Ld = cast<AtomicSDNode>(Load);
45225 auto *St = cast<AtomicSDNode>(User);
45226 return Ld->getBasePtr() == St->getBasePtr();
45227 };
45228
45229 bool Commute = false;
45230 switch (Op.getOpcode()) {
45231 default: return false;
45232 case ISD::SIGN_EXTEND:
45233 case ISD::ZERO_EXTEND:
45234 case ISD::ANY_EXTEND:
45235 break;
45236 case ISD::SHL:
45237 case ISD::SRA:
45238 case ISD::SRL: {
45239 SDValue N0 = Op.getOperand(0);
45240 // Look out for (store (shl (load), x)).
45241 if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
45242 return false;
45243 break;
45244 }
45245 case ISD::ADD:
45246 case ISD::MUL:
45247 case ISD::AND:
45248 case ISD::OR:
45249 case ISD::XOR:
45250 Commute = true;
45251 LLVM_FALLTHROUGH;
45252 case ISD::SUB: {
45253 SDValue N0 = Op.getOperand(0);
45254 SDValue N1 = Op.getOperand(1);
45255 // Avoid disabling potential load folding opportunities.
45256 if (MayFoldLoad(N1) &&
45257 (!Commute || !isa<ConstantSDNode>(N0) ||
45258 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
45259 return false;
45260 if (MayFoldLoad(N0) &&
45261 ((Commute && !isa<ConstantSDNode>(N1)) ||
45262 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
45263 return false;
45264 if (IsFoldableAtomicRMW(N0, Op) ||
45265 (Commute && IsFoldableAtomicRMW(N1, Op)))
45266 return false;
45267 }
45268 }
45269
45270 PVT = MVT::i32;
45271 return true;
45272}
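// [Editorial sketch, not part of X86ISelLowering.cpp] The "(store (shl (load),
// x))" shape that the IsFoldableRMW check above protects, shown at the source
// level (an assumption about typical lowering; the helper name is made up):
// this read-modify-write shift can usually be folded into a single
// memory-operand instruction, so promoting the i16 operation to i32 would only
// break that folding.
static void shiftU16InPlace(unsigned short *P) {
  *P = static_cast<unsigned short>(*P << 3); // candidate for shlw $3, (mem)
}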
45273
45274bool X86TargetLowering::
45275 isDesirableToCombineBuildVectorToShuffleTruncate(
45276 ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
45277
45278 assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
45279        "Element count mismatch");
45280 assert(
45281     Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
45282     "Shuffle Mask expected to be legal");
45283
45284 // For 32-bit elements VPERMD is better than shuffle+truncate.
45285 // TODO: After we improve lowerBuildVector, add exception for VPERMW.
45286 if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
45287 return false;
45288
45289 if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
45290 return false;
45291
45292 return true;
45293}
45294
45295//===----------------------------------------------------------------------===//
45296// X86 Inline Assembly Support
45297//===----------------------------------------------------------------------===//
45298
45300 // Helper to match a string against a list of pieces separated by whitespace.
45300static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
45301 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
45302
45303 for (StringRef Piece : Pieces) {
45304 if (!S.startswith(Piece)) // Check if the piece matches.
45305 return false;
45306
45307 S = S.substr(Piece.size());
45308 StringRef::size_type Pos = S.find_first_not_of(" \t");
45309 if (Pos == 0) // We matched a prefix.
45310 return false;
45311
45312 S = S.substr(Pos);
45313 }
45314
45315 return S.empty();
45316}
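// [Editorial sketch, not part of X86ISelLowering.cpp] Minimal usage examples
// for matchAsm, assuming the whitespace rules above: each piece must match in
// order and be followed by whitespace or the end of the string.
static void matchAsmExamples() {
  bool A = matchAsm("bswap $0", {"bswap", "$0"});         // true
  bool B = matchAsm("  bswapl \t $0 ", {"bswapl", "$0"}); // true
  bool C = matchAsm("bswapq$0", {"bswapq", "$0"});        // false: no separator
  (void)A; (void)B; (void)C;
}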
45317
45318static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
45319
45320 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
45321 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
45322 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
45323 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
45324
45325 if (AsmPieces.size() == 3)
45326 return true;
45327 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
45328 return true;
45329 }
45330 }
45331 return false;
45332}
45333
45334bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
45335 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
45336
45337 const std::string &AsmStr = IA->getAsmString();
45338
45339 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
45340 if (!Ty || Ty->getBitWidth() % 16 != 0)
45341 return false;
45342
45343 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
45344 SmallVector<StringRef, 4> AsmPieces;
45345 SplitString(AsmStr, AsmPieces, ";\n");
45346
45347 switch (AsmPieces.size()) {
45348 default: return false;
45349 case 1:
45350 // FIXME: this should verify that we are targeting a 486 or better. If not,
45351 // we will turn this bswap into something that will be lowered to logical
45352 // ops instead of emitting the bswap asm. For now, we don't support 486 or
45353 // lower so don't worry about this.
45354 // bswap $0
45355 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
45356 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
45357 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
45358 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
45359 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
45360 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
45361 // No need to check constraints, nothing other than the equivalent of
45362 // "=r,0" would be valid here.
45363 return IntrinsicLowering::LowerToByteSwap(CI);
45364 }
45365
45366 // rorw $$8, ${0:w} --> llvm.bswap.i16
45367 if (CI->getType()->isIntegerTy(16) &&
45368 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
45369 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
45370 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
45371 AsmPieces.clear();
45372 StringRef ConstraintsStr = IA->getConstraintString();
45373 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
45374 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
45375 if (clobbersFlagRegisters(AsmPieces))
45376 return IntrinsicLowering::LowerToByteSwap(CI);
45377 }
45378 break;
45379 case 3:
45380 if (CI->getType()->isIntegerTy(32) &&
45381 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
45382 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
45383 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
45384 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
45385 AsmPieces.clear();
45386 StringRef ConstraintsStr = IA->getConstraintString();
45387 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
45388 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
45389 if (clobbersFlagRegisters(AsmPieces))
45390 return IntrinsicLowering::LowerToByteSwap(CI);
45391 }
45392
45393 if (CI->getType()->isIntegerTy(64)) {
45394 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
45395 if (Constraints.size() >= 2 &&
45396 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
45397 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
45398 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
45399 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
45400 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
45401 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
45402 return IntrinsicLowering::LowerToByteSwap(CI);
45403 }
45404 }
45405 break;
45406 }
45407 return false;
45408}
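// [Editorial sketch, not part of X86ISelLowering.cpp] An assumed example of
// the source-level inline asm that ExpandInlineAsm rewrites into an
// @llvm.bswap intrinsic: at the IR level the asm string arrives as "bswap $0"
// with the "=r,0" constraints matched above (the helper name is made up).
static unsigned bswapViaInlineAsm(unsigned X) {
  __asm__("bswap %0" : "=r"(X) : "0"(X)); // IR form: "bswap $0", "=r,0"
  return X;
}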
45409
45410static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
45411 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
45412 .Case("{@cca}", X86::COND_A)
45413 .Case("{@ccae}", X86::COND_AE)
45414 .Case("{@ccb}", X86::COND_B)
45415 .Case("{@ccbe}", X86::COND_BE)
45416 .Case("{@ccc}", X86::COND_B)
45417 .Case("{@cce}", X86::COND_E)
45418 .Case("{@ccz}", X86::COND_E)
45419 .Case("{@ccg}", X86::COND_G)
45420 .Case("{@ccge}", X86::COND_GE)
45421 .Case("{@ccl}", X86::COND_L)
45422 .Case("{@ccle}", X86::COND_LE)
45423 .Case("{@ccna}", X86::COND_BE)
45424 .Case("{@ccnae}", X86::COND_B)
45425 .Case("{@ccnb}", X86::COND_AE)
45426 .Case("{@ccnbe}", X86::COND_A)
45427 .Case("{@ccnc}", X86::COND_AE)
45428 .Case("{@ccne}", X86::COND_NE)
45429 .Case("{@ccnz}", X86::COND_NE)
45430 .Case("{@ccng}", X86::COND_LE)
45431 .Case("{@ccnge}", X86::COND_L)
45432 .Case("{@ccnl}", X86::COND_GE)
45433 .Case("{@ccnle}", X86::COND_G)
45434 .Case("{@ccno}", X86::COND_NO)
45435 .Case("{@ccnp}", X86::COND_P)
45436 .Case("{@ccns}", X86::COND_NS)
45437 .Case("{@cco}", X86::COND_O)
45438 .Case("{@ccp}", X86::COND_P)
45439 .Case("{@ccs}", X86::COND_S)
45440 .Default(X86::COND_INVALID);
45441 return Cond;
45442}
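// [Editorial sketch, not part of X86ISelLowering.cpp] Assumed source-level use
// of the flag-output constraints parsed above: Clang/GCC spell them
// "=@cc<cond>", and the constraint code reaches this hook as "{@cc<cond>}",
// so "=@ccz" maps to X86::COND_E (the helper name is made up).
static bool isZeroViaFlagOutput(unsigned X) {
  bool ZF;
  __asm__("testl %1, %1" : "=@ccz"(ZF) : "r"(X));
  return ZF; // true exactly when X == 0
}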
45443
45444/// Given a constraint letter, return the type of constraint for this target.
45445X86TargetLowering::ConstraintType
45446X86TargetLowering::getConstraintType(StringRef Constraint) const {
45447 if (Constraint.size() == 1) {
45448 switch (Constraint[0]) {
45449 case 'R':
45450 case 'q':
45451 case 'Q':
45452 case 'f':
45453 case 't':
45454 case 'u':
45455 case 'y':
45456 case 'x':
45457 case 'v':
45458 case 'Y':
45459 case 'l':
45460 case 'k': // AVX512 masking registers.
45461 return C_RegisterClass;
45462 case 'a':
45463 case 'b':
45464 case 'c':
45465 case 'd':
45466 case 'S':
45467 case 'D':
45468 case 'A':
45469 return C_Register;
45470 case 'I':
45471 case 'J':
45472 case 'K':
45473 case 'N':
45474 case 'G':
45475 case 'L':
45476 case 'M':
45477 return C_Immediate;
45478 case 'C':
45479 case 'e':
45480 case 'Z':
45481 return C_Other;
45482 default:
45483 break;
45484 }
45485 }
45486 else if (Constraint.size() == 2) {
45487 switch (Constraint[0]) {
45488 default:
45489 break;
45490 case 'Y':
45491 switch (Constraint[1]) {
45492 default:
45493 break;
45494 case 'z':
45495 case '0':
45496 return C_Register;
45497 case 'i':
45498 case 'm':
45499 case 'k':
45500 case 't':
45501 case '2':
45502 return C_RegisterClass;
45503 }
45504 }
45505 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
45506 return C_Other;
45507 return TargetLowering::getConstraintType(Constraint);
45508}
45509
45510/// Examine constraint type and operand type and determine a weight value.
45511/// This object must already have been set up with the operand type
45512/// and the current alternative constraint selected.
45513TargetLowering::ConstraintWeight
45514 X86TargetLowering::getSingleConstraintMatchWeight(
45515 AsmOperandInfo &info, const char *constraint) const {
45516 ConstraintWeight weight = CW_Invalid;
45517 Value *CallOperandVal = info.CallOperandVal;
45518 // If we don't have a value, we can't do a match,
45519 // but allow it at the lowest weight.
45520 if (!CallOperandVal)
45521 return CW_Default;
45522 Type *type = CallOperandVal->getType();
45523 // Look at the constraint type.
45524 switch (*constraint) {
45525 default:
45526 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
45527 LLVM_FALLTHROUGH;
45528 case 'R':
45529 case 'q':
45530 case 'Q':
45531 case 'a':
45532 case 'b':
45533 case 'c':
45534 case 'd':
45535 case 'S':
45536 case 'D':
45537 case 'A':
45538 if (CallOperandVal->getType()->isIntegerTy())
45539 weight = CW_SpecificReg;
45540 break;
45541 case 'f':
45542 case 't':
45543 case 'u':
45544 if (type->isFloatingPointTy())
45545 weight = CW_SpecificReg;
45546 break;
45547 case 'y':
45548 if (type->isX86_MMXTy() && Subtarget.hasMMX())
45549 weight = CW_SpecificReg;
45550 break;
45551 case 'Y': {
45552 unsigned Size = StringRef(constraint).size();
45553 // Pick 'i' as the next char, as 'Y' and 'Yi' are synonymous when matching 'Y'.
45554 char NextChar = Size == 2 ? constraint[1] : 'i';
45555 if (Size > 2)
45556 break;
45557 switch (NextChar) {
45558 default:
45559 return CW_Invalid;
45560 // XMM0
45561 case 'z':
45562 case '0':
45563 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
45564 return CW_SpecificReg;
45565 return CW_Invalid;
45566 // Conditional OpMask regs (AVX512)
45567 case 'k':
45568 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
45569 return CW_Register;
45570 return CW_Invalid;
45571 // Any MMX reg
45572 case 'm':
45573 if (type->isX86_MMXTy() && Subtarget.hasMMX())
45574 return weight;
45575 return CW_Invalid;
45576 // Any SSE reg when ISA >= SSE2, same as 'Y'
45577 case 'i':
45578 case 't':
45579 case '2':
45580 if (!Subtarget.hasSSE2())
45581 return CW_Invalid;
45582 break;
45583 }
45584 // Fall through (handle "Y" constraint).
45585 LLVM_FALLTHROUGH;
45586 }
45587 case 'v':
45588 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
45589 weight = CW_Register;
45590 LLVM_FALLTHROUGH;
45591 case 'x':
45592 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
45593 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
45594 weight = CW_Register;
45595 break;
45596 case 'k':
45597 // Enable conditional vector operations using %k<#> registers.
45598 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
45599 weight = CW_Register;
45600 break;
45601 case 'I':
45602 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
45603 if (C->getZExtValue() <= 31)
45604 weight = CW_Constant;
45605 }
45606 break;
45607 case 'J':
45608 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
45609 if (C->getZExtValue() <= 63)
45610 weight = CW_Constant;
45611 }
45612 break;
45613 case 'K':
45614 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
45615 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
45616 weight = CW_Constant;
45617 }
45618 break;
45619 case 'L':
45620 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
45621 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
45622 weight = CW_Constant;
45623 }
45624 break;
45625 case 'M':
45626 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
45627 if (C->getZExtValue() <= 3)
45628 weight = CW_Constant;
45629 }
45630 break;
45631 case 'N':
45632 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
45633 if (C->getZExtValue() <= 0xff)
45634 weight = CW_Constant;
45635 }
45636 break;
45637 case 'G':
45638 case 'C':
45639 if (isa<ConstantFP>(CallOperandVal)) {
45640 weight = CW_Constant;
45641 }
45642 break;
45643 case 'e':
45644 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
45645 if ((C->getSExtValue() >= -0x80000000LL) &&
45646 (C->getSExtValue() <= 0x7fffffffLL))
45647 weight = CW_Constant;
45648 }
45649 break;
45650 case 'Z':
45651 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
45652 if (C->getZExtValue() <= 0xffffffff)
45653 weight = CW_Constant;
45654 }
45655 break;
45656 }
45657 return weight;
45658}
45659
45660/// Try to replace an X constraint, which matches anything, with another that
45661/// has more specific requirements based on the type of the corresponding
45662/// operand.
45663const char *X86TargetLowering::
45664LowerXConstraint(EVT ConstraintVT) const {
45665 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
45666 // 'f' like normal targets.
45667 if (ConstraintVT.isFloatingPoint()) {
45668 if (Subtarget.hasSSE2())
45669 return "Y";
45670 if (Subtarget.hasSSE1())
45671 return "x";
45672 }
45673
45674 return TargetLowering::LowerXConstraint(ConstraintVT);
45675}
45676
45677// Lower @cc targets via setcc.
45678SDValue X86TargetLowering::LowerAsmOutputForConstraint(
45679 SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo,
45680 SelectionDAG &DAG) const {
45681 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
45682 if (Cond == X86::COND_INVALID)
45683 return SDValue();
45684 // Check that return type is valid.
45685 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
45686 OpInfo.ConstraintVT.getSizeInBits() < 8)
45687 report_fatal_error("Flag output operand is of invalid type");
45688
45689 // Get EFLAGS register. Only update chain when copyfrom is glued.
45690 if (Flag.getNode()) {
45691 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
45692 Chain = Flag.getValue(1);
45693 } else
45694 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
45695 // Extract CC code.
45696 SDValue CC = getSETCC(Cond, Flag, DL, DAG);
45697 // Extend to 32-bits
45698 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
45699
45700 return Result;
45701}
45702
45703/// Lower the specified operand into the Ops vector.
45704/// If it is invalid, don't add anything to Ops.
45705void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
45706 std::string &Constraint,
45707 std::vector<SDValue>&Ops,
45708 SelectionDAG &DAG) const {
45709 SDValue Result;
45710
45711 // Only support length 1 constraints for now.
45712 if (Constraint.length() > 1) return;
45713
45714 char ConstraintLetter = Constraint[0];
45715 switch (ConstraintLetter) {
45716 default: break;
45717 case 'I':
45718 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
45719 if (C->getZExtValue() <= 31) {
45720 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
45721 Op.getValueType());
45722 break;
45723 }
45724 }
45725 return;
45726 case 'J':
45727 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
45728 if (C->getZExtValue() <= 63) {
45729 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
45730 Op.getValueType());
45731 break;
45732 }
45733 }
45734 return;
45735 case 'K':
45736 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
45737 if (isInt<8>(C->getSExtValue())) {
45738 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
45739 Op.getValueType());
45740 break;
45741 }
45742 }
45743 return;
45744 case 'L':
45745 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
45746 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
45747 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
45748 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
45749 Op.getValueType());
45750 break;
45751 }
45752 }
45753 return;
45754 case 'M':
45755 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
45756 if (C->getZExtValue() <= 3) {
45757 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
45758 Op.getValueType());
45759 break;
45760 }
45761 }
45762 return;
45763 case 'N':
45764 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
45765 if (C->getZExtValue() <= 255) {
45766 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
45767 Op.getValueType());
45768 break;
45769 }
45770 }
45771 return;
45772 case 'O':
45773 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
45774 if (C->getZExtValue() <= 127) {
45775 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
45776 Op.getValueType());
45777 break;
45778 }
45779 }
45780 return;
45781 case 'e': {
45782 // 32-bit signed value
45783 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
45784 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
45785 C->getSExtValue())) {
45786 // Widen to 64 bits here to get it sign extended.
45787 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
45788 break;
45789 }
45790 // FIXME gcc accepts some relocatable values here too, but only in certain
45791 // memory models; it's complicated.
45792 }
45793 return;
45794 }
45795 case 'Z': {
45796 // 32-bit unsigned value
45797 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
45798 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
45799 C->getZExtValue())) {
45800 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
45801 Op.getValueType());
45802 break;
45803 }
45804 }
45805 // FIXME gcc accepts some relocatable values here too, but only in certain
45806 // memory models; it's complicated.
45807 return;
45808 }
45809 case 'i': {
45810 // Literal immediates are always ok.
45811 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
45812 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
45813 BooleanContent BCont = getBooleanContents(MVT::i64);
45814 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
45815 : ISD::SIGN_EXTEND;
45816 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
45817 : CST->getSExtValue();
45818 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
45819 break;
45820 }
45821
45822 // In any sort of PIC mode addresses need to be computed at runtime by
45823 // adding in a register or some sort of table lookup. These can't
45824 // be used as immediates.
45825 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
45826 return;
45827
45828 // If we are in non-pic codegen mode, we allow the address of a global (with
45829 // an optional displacement) to be used with 'i'.
45830 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
45831 // If we require an extra load to get this address, as in PIC mode, we
45832 // can't accept it.
45833 if (isGlobalStubReference(
45834 Subtarget.classifyGlobalReference(GA->getGlobal())))
45835 return;
45836 break;
45837 }
45838 }
45839
45840 if (Result.getNode()) {
45841 Ops.push_back(Result);
45842 return;
45843 }
45844 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
45845}
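// [Editorial sketch, not part of X86ISelLowering.cpp] An assumed example of
// the immediate-range constraints handled above (the helper name is made up):
// "I" only admits constants in [0, 31], so the 5 below becomes a target
// constant, while an out-of-range value would not be accepted for 'I'.
static unsigned rotateLeftByFive(unsigned X) {
  __asm__("roll %1, %0" : "+r"(X) : "I"(5));
  return X;
}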
45846
45847/// Check if \p RC is a general purpose register class.
45848/// I.e., GR* or one of their variant.
45849static bool isGRClass(const TargetRegisterClass &RC) {
45850 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
45851 RC.hasSuperClassEq(&X86::GR16RegClass) ||
45852 RC.hasSuperClassEq(&X86::GR32RegClass) ||
45853 RC.hasSuperClassEq(&X86::GR64RegClass) ||
45854 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
45855}
45856
45857/// Check if \p RC is a vector register class.
45858/// I.e., FR* / VR* or one of their variant.
45859static bool isFRClass(const TargetRegisterClass &RC) {
45860 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
45861 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
45862 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
45863 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
45864 RC.hasSuperClassEq(&X86::VR512RegClass);
45865}
45866
45867/// Check if \p RC is a mask register class.
45868/// I.e., VK* or one of their variant.
45869static bool isVKClass(const TargetRegisterClass &RC) {
45870 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
45871 RC.hasSuperClassEq(&X86::VK2RegClass) ||
45872 RC.hasSuperClassEq(&X86::VK4RegClass) ||
45873 RC.hasSuperClassEq(&X86::VK8RegClass) ||
45874 RC.hasSuperClassEq(&X86::VK16RegClass) ||
45875 RC.hasSuperClassEq(&X86::VK32RegClass) ||
45876 RC.hasSuperClassEq(&X86::VK64RegClass);
45877}
45878
45879std::pair<unsigned, const TargetRegisterClass *>
45880X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
45881 StringRef Constraint,
45882 MVT VT) const {
45883 // First, see if this is a constraint that directly corresponds to an LLVM
45884 // register class.
45885 if (Constraint.size() == 1) {
45886 // GCC Constraint Letters
45887 switch (Constraint[0]) {
45888 default: break;
45889 // 'A' means [ER]AX + [ER]DX.
45890 case 'A':
45891 if (Subtarget.is64Bit())
45892 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
45893 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
45894        "Expecting 64, 32 or 16 bit subtarget");
45895 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
45896
45897 // TODO: Slight differences here in allocation order and leaving
45898 // RIP in the class. Do they matter any more here than they do
45899 // in the normal allocation?
45900 case 'k':
45901 if (Subtarget.hasAVX512()) {
45902 if (VT == MVT::i1)
45903 return std::make_pair(0U, &X86::VK1RegClass);
45904 if (VT == MVT::i8)
45905 return std::make_pair(0U, &X86::VK8RegClass);
45906 if (VT == MVT::i16)
45907 return std::make_pair(0U, &X86::VK16RegClass);
45908 }
45909 if (Subtarget.hasBWI()) {
45910 if (VT == MVT::i32)
45911 return std::make_pair(0U, &X86::VK32RegClass);
45912 if (VT == MVT::i64)
45913 return std::make_pair(0U, &X86::VK64RegClass);
45914 }
45915 break;
45916 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
45917 if (Subtarget.is64Bit()) {
45918 if (VT == MVT::i32 || VT == MVT::f32)
45919 return std::make_pair(0U, &X86::GR32RegClass);
45920 if (VT == MVT::i16)
45921 return std::make_pair(0U, &X86::GR16RegClass);
45922 if (VT == MVT::i8 || VT == MVT::i1)
45923 return std::make_pair(0U, &X86::GR8RegClass);
45924 if (VT == MVT::i64 || VT == MVT::f64)
45925 return std::make_pair(0U, &X86::GR64RegClass);
45926 break;
45927 }
45928 LLVM_FALLTHROUGH;
45929 // 32-bit fallthrough
45930 case 'Q': // Q_REGS
45931 if (VT == MVT::i32 || VT == MVT::f32)
45932 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
45933 if (VT == MVT::i16)
45934 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
45935 if (VT == MVT::i8 || VT == MVT::i1)
45936 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
45937 if (VT == MVT::i64)
45938 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
45939 break;
45940 case 'r': // GENERAL_REGS
45941 case 'l': // INDEX_REGS
45942 if (VT == MVT::i8 || VT == MVT::i1)
45943 return std::make_pair(0U, &X86::GR8RegClass);
45944 if (VT == MVT::i16)
45945 return std::make_pair(0U, &X86::GR16RegClass);
45946 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
45947 return std::make_pair(0U, &X86::GR32RegClass);
45948 return std::make_pair(0U, &X86::GR64RegClass);
45949 case 'R': // LEGACY_REGS
45950 if (VT == MVT::i8 || VT == MVT::i1)
45951 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
45952 if (VT == MVT::i16)
45953 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
45954 if (VT == MVT::i32 || !Subtarget.is64Bit())
45955 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
45956 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
45957 case 'f': // FP Stack registers.
45958 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
45959 // value to the correct fpstack register class.
45960 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
45961 return std::make_pair(0U, &X86::RFP32RegClass);
45962 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
45963 return std::make_pair(0U, &X86::RFP64RegClass);
45964 return std::make_pair(0U, &X86::RFP80RegClass);
45965 case 'y': // MMX_REGS if MMX allowed.
45966 if (!Subtarget.hasMMX()) break;
45967 return std::make_pair(0U, &X86::VR64RegClass);
45968 case 'Y': // SSE_REGS if SSE2 allowed
45969 if (!Subtarget.hasSSE2()) break;
45970 LLVM_FALLTHROUGH;
45971 case 'v':
45972 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
45973 if (!Subtarget.hasSSE1()) break;
45974 bool VConstraint = (Constraint[0] == 'v');
45975
45976 switch (VT.SimpleTy) {
45977 default: break;
45978 // Scalar SSE types.
45979 case MVT::f32:
45980 case MVT::i32:
45981 if (VConstraint && Subtarget.hasVLX())
45982 return std::make_pair(0U, &X86::FR32XRegClass);
45983 return std::make_pair(0U, &X86::FR32RegClass);
45984 case MVT::f64:
45985 case MVT::i64:
45986 if (VConstraint && Subtarget.hasVLX())
45987 return std::make_pair(0U, &X86::FR64XRegClass);
45988 return std::make_pair(0U, &X86::FR64RegClass);
45989 // TODO: Handle i128 in FR128RegClass after it is tested well.
45990 // Vector types and fp128.
45991 case MVT::f128:
45992 case MVT::v16i8:
45993 case MVT::v8i16:
45994 case MVT::v4i32:
45995 case MVT::v2i64:
45996 case MVT::v4f32:
45997 case MVT::v2f64:
45998 if (VConstraint && Subtarget.hasVLX())
45999 return std::make_pair(0U, &X86::VR128XRegClass);
46000 return std::make_pair(0U, &X86::VR128RegClass);
46001 // AVX types.
46002 case MVT::v32i8:
46003 case MVT::v16i16:
46004 case MVT::v8i32:
46005 case MVT::v4i64:
46006 case MVT::v8f32:
46007 case MVT::v4f64:
46008 if (VConstraint && Subtarget.hasVLX())
46009 return std::make_pair(0U, &X86::VR256XRegClass);
46010 if (Subtarget.hasAVX())
46011 return std::make_pair(0U, &X86::VR256RegClass);
46012 break;
46013 case MVT::v8f64:
46014 case MVT::v16f32:
46015 case MVT::v16i32:
46016 case MVT::v8i64:
46017 if (!Subtarget.hasAVX512()) break;
46018 if (VConstraint)
46019 return std::make_pair(0U, &X86::VR512RegClass);
46020 return std::make_pair(0U, &X86::VR512_0_15RegClass);
46021 }
46022 break;
46023 }
46024 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
46025 switch (Constraint[1]) {
46026 default:
46027 break;
46028 case 'i':
46029 case 't':
46030 case '2':
46031 return getRegForInlineAsmConstraint(TRI, "Y", VT);
46032 case 'm':
46033 if (!Subtarget.hasMMX()) break;
46034 return std::make_pair(0U, &X86::VR64RegClass);
46035 case 'z':
46036 case '0':
46037 if (!Subtarget.hasSSE1()) break;
46038 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
46039 case 'k':
46040 // This register class doesn't allocate k0 for masked vector operations.
46041 if (Subtarget.hasAVX512()) {
46042 if (VT == MVT::i1)
46043 return std::make_pair(0U, &X86::VK1WMRegClass);
46044 if (VT == MVT::i8)
46045 return std::make_pair(0U, &X86::VK8WMRegClass);
46046 if (VT == MVT::i16)
46047 return std::make_pair(0U, &X86::VK16WMRegClass);
46048 }
46049 if (Subtarget.hasBWI()) {
46050 if (VT == MVT::i32)
46051 return std::make_pair(0U, &X86::VK32WMRegClass);
46052 if (VT == MVT::i64)
46053 return std::make_pair(0U, &X86::VK64WMRegClass);
46054 }
46055 break;
46056 }
46057 }
46058
46059 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
46060 return std::make_pair(0U, &X86::GR32RegClass);
46061
46062 // Use the default implementation in TargetLowering to convert the register
46063 // constraint into a member of a register class.
46064 std::pair<unsigned, const TargetRegisterClass*> Res;
46065 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
46066
46067 // Not found as a standard register?
46068 if (!Res.second) {
46069 // Map st(0) -> st(7) -> ST0
46070 if (Constraint.size() == 7 && Constraint[0] == '{' &&
46071 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
46072 Constraint[3] == '(' &&
46073 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
46074 Constraint[5] == ')' && Constraint[6] == '}') {
46075 // st(7) is not allocatable and thus not a member of RFP80. Return
46076 // singleton class in cases where we have a reference to it.
46077 if (Constraint[4] == '7')
46078 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
46079 return std::make_pair(X86::FP0 + Constraint[4] - '0',
46080 &X86::RFP80RegClass);
46081 }
46082
46083 // GCC allows "st(0)" to be called just plain "st".
46084 if (StringRef("{st}").equals_lower(Constraint))
46085 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
46086
46087 // flags -> EFLAGS
46088 if (StringRef("{flags}").equals_lower(Constraint))
46089 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
46090
46091 // dirflag -> DF
46092 if (StringRef("{dirflag}").equals_lower(Constraint))
46093 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
46094
46095 // fpsr -> FPSW
46096 if (StringRef("{fpsr}").equals_lower(Constraint))
46097 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
46098
46099 return Res;
46100 }
46101
46102 // Make sure it isn't a register that requires 64-bit mode.
46103 if (!Subtarget.is64Bit() &&
46104 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
46105 TRI->getEncodingValue(Res.first) >= 8) {
46106 // Register requires REX prefix, but we're in 32-bit mode.
46107 return std::make_pair(0, nullptr);
46108 }
46109
46110 // Make sure it isn't a register that requires AVX512.
46111 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
46112 TRI->getEncodingValue(Res.first) & 0x10) {
46113 // Register requires EVEX prefix.
46114 return std::make_pair(0, nullptr);
46115 }
46116
46117 // Otherwise, check to see if this is a register class of the wrong value
46118 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
46119 // turn into {ax},{dx}.
46120 // MVT::Other is used to specify clobber names.
46121 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
46122 return Res; // Correct type already, nothing to do.
46123
46124 // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
46125 // return "eax". This should even work for things like getting 64-bit integer
46126 // registers when given an f64 type.
46127 const TargetRegisterClass *Class = Res.second;
46128 // The generic code will match the first register class that contains the
46129 // given register. Thus, based on the ordering of the tablegened file,
46130 // the "plain" GR classes might not come first.
46131 // Therefore, use a helper method.
46132 if (isGRClass(*Class)) {
46133 unsigned Size = VT.getSizeInBits();
46134 if (Size == 1) Size = 8;
46135 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
46136 if (DestReg > 0) {
46137 bool is64Bit = Subtarget.is64Bit();
46138 const TargetRegisterClass *RC =
46139 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
46140 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
46141 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
46142 : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
46143 : nullptr;
46144 if (Size == 64 && !is64Bit) {
46145 // Model GCC's behavior here and select a fixed pair of 32-bit
46146 // registers.
46147 switch (DestReg) {
46148 case X86::RAX:
46149 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
46150 case X86::RDX:
46151 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
46152 case X86::RCX:
46153 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
46154 case X86::RBX:
46155 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
46156 case X86::RSI:
46157 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
46158 case X86::RDI:
46159 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
46160 case X86::RBP:
46161 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
46162 default:
46163 return std::make_pair(0, nullptr);
46164 }
46165 }
46166 if (RC && RC->contains(DestReg))
46167 return std::make_pair(DestReg, RC);
46168 return Res;
46169 }
46170 // No register found/type mismatch.
46171 return std::make_pair(0, nullptr);
46172 } else if (isFRClass(*Class)) {
46173 // Handle references to XMM physical registers that got mapped into the
46174 // wrong class. This can happen with constraints like {xmm0} where the
46175 // target independent register mapper will just pick the first match it can
46176 // find, ignoring the required type.
46177
46178 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
46179 if (VT == MVT::f32 || VT == MVT::i32)
46180 Res.second = &X86::FR32XRegClass;
46181 else if (VT == MVT::f64 || VT == MVT::i64)
46182 Res.second = &X86::FR64XRegClass;
46183 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
46184 Res.second = &X86::VR128XRegClass;
46185 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
46186 Res.second = &X86::VR256XRegClass;
46187 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
46188 Res.second = &X86::VR512RegClass;
46189 else {
46190 // Type mismatch and not a clobber: return an error.
46191 Res.first = 0;
46192 Res.second = nullptr;
46193 }
46194 } else if (isVKClass(*Class)) {
46195 if (VT == MVT::i1)
46196 Res.second = &X86::VK1RegClass;
46197 else if (VT == MVT::i8)
46198 Res.second = &X86::VK8RegClass;
46199 else if (VT == MVT::i16)
46200 Res.second = &X86::VK16RegClass;
46201 else if (VT == MVT::i32)
46202 Res.second = &X86::VK32RegClass;
46203 else if (VT == MVT::i64)
46204 Res.second = &X86::VK64RegClass;
46205 else {
46206 // Type mismatch and not a clobber: return an error.
46207 Res.first = 0;
46208 Res.second = nullptr;
46209 }
46210 }
46211
46212 return Res;
46213}
46214
46215int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
46216 const AddrMode &AM, Type *Ty,
46217 unsigned AS) const {
46218 // Scaling factors are not free at all.
46219 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
46220 // will take 2 allocations in the out of order engine instead of 1
46221 // for plain addressing mode, i.e. inst (reg1).
46222 // E.g.,
46223 // vaddps (%rsi,%rdx), %ymm0, %ymm1
46224 // Requires two allocations (one for the load, one for the computation)
46225 // whereas:
46226 // vaddps (%rsi), %ymm0, %ymm1
46227 // Requires just 1 allocation, i.e., freeing allocations for other operations
46228 // and having fewer micro operations to execute.
46229 //
46230 // For some X86 architectures, this is even worse because for instance for
46231 // stores, the complex addressing mode forces the instruction to use the
46232 // "load" ports instead of the dedicated "store" port.
46233 // E.g., on Haswell:
46234 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
46235 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
46236 if (isLegalAddressingMode(DL, AM, Ty, AS))
46237 // Scale represents reg2 * scale, thus account for 1
46238 // as soon as we use a second register.
46239 return AM.Scale != 0;
46240 return -1;
46241}
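// [Editorial sketch, not part of X86ISelLowering.cpp] A source-level
// illustration of the cost model above (an assumption about typical selection,
// not a measured result; the helper name is made up): the indexed access below
// tends to fold into a "(%rsi,%rdx,4)"-style address, which is charged one
// unit more than the plain "(%rsi)" form used for P[0].
static float sumFirstAndIndexed(const float *P, long I) {
  return P[0] + P[I]; // P[I] is the scaled-index candidate
}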
46242
46243bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
46244 // Integer division on x86 is expensive. However, when aggressively optimizing
46245 // for code size, we prefer to use a div instruction, as it is usually smaller
46246 // than the alternative sequence.
46247 // The exception to this is vector division. Since x86 doesn't have vector
46248 // integer division, leaving the division as-is is a loss even in terms of
46249 // size, because it will have to be scalarized, while the alternative code
46250 // sequence can be performed in vector form.
46251 bool OptSize =
46252 Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
46253 return OptSize && !VT.isVector();
46254}
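// [Editorial sketch, not part of X86ISelLowering.cpp] Assumed illustration of
// the policy above (the helper name is made up): under the minsize attribute a
// scalar division is normally kept as a real div instruction (smallest
// encoding), while vector divisions are still expanded because x86 has no
// vector integer divide.
__attribute__((minsize)) static int scalarDivMinSize(int A, int B) {
  return A / B; // kept as idiv when optimizing for minimum size
}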
46255
46256void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
46257 if (!Subtarget.is64Bit())
46258 return;
46259
46260 // Update IsSplitCSR in X86MachineFunctionInfo.
46261 X86MachineFunctionInfo *AFI =
46262 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
46263 AFI->setIsSplitCSR(true);
46264}
46265
46266void X86TargetLowering::insertCopiesSplitCSR(
46267 MachineBasicBlock *Entry,
46268 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
46269 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
46270 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
46271 if (!IStart)
46272 return;
46273
46274 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
46275 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
46276 MachineBasicBlock::iterator MBBI = Entry->begin();
46277 for (const MCPhysReg *I = IStart; *I; ++I) {
46278 const TargetRegisterClass *RC = nullptr;
46279 if (X86::GR64RegClass.contains(*I))
46280 RC = &X86::GR64RegClass;
46281 else
46282 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
46283
46284 Register NewVR = MRI->createVirtualRegister(RC);
46285 // Create copy from CSR to a virtual register.
46286 // FIXME: this currently does not emit CFI pseudo-instructions, it works
46287 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
46288 // nounwind. If we want to generalize this later, we may need to emit
46289 // CFI pseudo-instructions.
46290 assert(
46291     Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
46292     "Function should be nounwind in insertCopiesSplitCSR!");
46293 Entry->addLiveIn(*I);
46294 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
46295 .addReg(*I);
46296
46297 // Insert the copy-back instructions right before the terminator.
46298 for (auto *Exit : Exits)
46299 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
46300 TII->get(TargetOpcode::COPY), *I)
46301 .addReg(NewVR);
46302 }
46303}
46304
46305bool X86TargetLowering::supportSwiftError() const {
46306 return Subtarget.is64Bit();
46307}
46308
46309/// Returns the name of the symbol used to emit stack probes or the empty
46310/// string if not applicable.
46311StringRef
46312X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
46313 // If the function specifically requests stack probes, emit them.
46314 if (MF.getFunction().hasFnAttribute("probe-stack"))
46315 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
46316
46317 // Generally, if we aren't on Windows, the platform ABI does not include
46318 // support for stack probes, so don't emit them.
46319 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
46320 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
46321 return "";
46322
46323 // We need a stack probe to conform to the Windows ABI. Choose the right
46324 // symbol.
46325 if (Subtarget.is64Bit())
46326 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
46327 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
46328}
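
A short illustration of the selection above (not from the original listing; TLI is an assumed X86TargetLowering and MF an assumed MachineFunction for a Windows target without the "probe-stack" or "no-stack-arg-probe" attributes):

  // Assumed identifiers: TLI, MF.
  StringRef ProbeSym = TLI.getStackProbeSymbolName(MF);
  // 64-bit MSVC-style: "__chkstk"    64-bit MinGW/Cygwin: "___chkstk_ms"
  // 32-bit MSVC-style: "_chkstk"     32-bit MinGW/Cygwin: "_alloca"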
46329
46330unsigned
46331X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
46332 // The default stack probe size is 4096 if the function has no stackprobesize
46333 // attribute.
46334 unsigned StackProbeSize = 4096;
46335 const Function &Fn = MF.getFunction();
46336 if (Fn.hasFnAttribute("stack-probe-size"))
46337 Fn.getFnAttribute("stack-probe-size")
46338 .getValueAsString()
46339 .getAsInteger(0, StackProbeSize);
46340 return StackProbeSize;
46341}
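
A minimal sketch of the attribute override above (not from the original listing; F is an assumed llvm::Function* and MF its MachineFunction):

  // Assumed identifiers: F, MF, TLI.
  F->addFnAttr("stack-probe-size", "8192");        // string-valued function attribute
  unsigned ProbeSize = TLI.getStackProbeSize(MF);  // 8192 instead of the 4096 default
  // getAsInteger(0, ...) above parses the string with an auto-detected radix.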

/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/include/llvm/Support/MachineValueType.h

1//===- Support/MachineValueType.h - Machine-Level types ---------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the set of machine-level target independent types which
10// legal values in the code generator use.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_SUPPORT_MACHINEVALUETYPE_H
15#define LLVM_SUPPORT_MACHINEVALUETYPE_H
16
17#include "llvm/ADT/iterator_range.h"
18#include "llvm/Support/ErrorHandling.h"
19#include "llvm/Support/MathExtras.h"
20#include "llvm/Support/TypeSize.h"
21#include <cassert>
22
23namespace llvm {
24
25 class Type;
26
27 /// Machine Value Type. Every type that is supported natively by some
28 /// processor targeted by LLVM occurs here. This means that any legal value
29 /// type can be represented by an MVT.
30 class MVT {
31 public:
32 enum SimpleValueType : uint8_t {
33 // Simple value types that aren't explicitly part of this enumeration
34 // are considered extended value types.
35 INVALID_SIMPLE_VALUE_TYPE = 0,
36
37 // If you change this numbering, you must change the values in
38 // ValueTypes.td as well!
39 Other = 1, // This is a non-standard value
40 i1 = 2, // This is a 1 bit integer value
41 i8 = 3, // This is an 8 bit integer value
42 i16 = 4, // This is a 16 bit integer value
43 i32 = 5, // This is a 32 bit integer value
44 i64 = 6, // This is a 64 bit integer value
45 i128 = 7, // This is a 128 bit integer value
46
47 FIRST_INTEGER_VALUETYPE = i1,
48 LAST_INTEGER_VALUETYPE = i128,
49
50 f16 = 8, // This is a 16 bit floating point value
51 f32 = 9, // This is a 32 bit floating point value
52 f64 = 10, // This is a 64 bit floating point value
53 f80 = 11, // This is an 80 bit floating point value
54 f128 = 12, // This is a 128 bit floating point value
55 ppcf128 = 13, // This is a PPC 128-bit floating point value
56
57 FIRST_FP_VALUETYPE = f16,
58 LAST_FP_VALUETYPE = ppcf128,
59
60 v1i1 = 14, // 1 x i1
61 v2i1 = 15, // 2 x i1
62 v4i1 = 16, // 4 x i1
63 v8i1 = 17, // 8 x i1
64 v16i1 = 18, // 16 x i1
65 v32i1 = 19, // 32 x i1
66 v64i1 = 20, // 64 x i1
67 v128i1 = 21, // 128 x i1
68 v256i1 = 22, // 256 x i1
69 v512i1 = 23, // 512 x i1
70 v1024i1 = 24, // 1024 x i1
71
72 v1i8 = 25, // 1 x i8
73 v2i8 = 26, // 2 x i8
74 v4i8 = 27, // 4 x i8
75 v8i8 = 28, // 8 x i8
76 v16i8 = 29, // 16 x i8
77 v32i8 = 30, // 32 x i8
78 v64i8 = 31, // 64 x i8
79 v128i8 = 32, //128 x i8
80 v256i8 = 33, //256 x i8
81
82 v1i16 = 34, // 1 x i16
83 v2i16 = 35, // 2 x i16
84 v3i16 = 36, // 3 x i16
85 v4i16 = 37, // 4 x i16
86 v8i16 = 38, // 8 x i16
87 v16i16 = 39, // 16 x i16
88 v32i16 = 40, // 32 x i16
89 v64i16 = 41, // 64 x i16
90 v128i16 = 42, //128 x i16
91
92 v1i32 = 43, // 1 x i32
93 v2i32 = 44, // 2 x i32
94 v3i32 = 45, // 3 x i32
95 v4i32 = 46, // 4 x i32
96 v5i32 = 47, // 5 x i32
97 v8i32 = 48, // 8 x i32
98 v16i32 = 49, // 16 x i32
99 v32i32 = 50, // 32 x i32
100 v64i32 = 51, // 64 x i32
101 v128i32 = 52, // 128 x i32
102 v256i32 = 53, // 256 x i32
103 v512i32 = 54, // 512 x i32
104 v1024i32 = 55, // 1024 x i32
105 v2048i32 = 56, // 2048 x i32
106
107 v1i64 = 57, // 1 x i64
108 v2i64 = 58, // 2 x i64
109 v4i64 = 59, // 4 x i64
110 v8i64 = 60, // 8 x i64
111 v16i64 = 61, // 16 x i64
112 v32i64 = 62, // 32 x i64
113
114 v1i128 = 63, // 1 x i128
115
116 FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE = v1i1,
117 LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE = v1i128,
118
119 v2f16 = 64, // 2 x f16
120 v3f16 = 65, // 3 x f16
121 v4f16 = 66, // 4 x f16
122 v8f16 = 67, // 8 x f16
123 v16f16 = 68, // 16 x f16
124 v32f16 = 69, // 32 x f16
125 v1f32 = 70, // 1 x f32
126 v2f32 = 71, // 2 x f32
127 v3f32 = 72, // 3 x f32
128 v4f32 = 73, // 4 x f32
129 v5f32 = 74, // 5 x f32
130 v8f32 = 75, // 8 x f32
131 v16f32 = 76, // 16 x f32
132 v32f32 = 77, // 32 x f32
133 v64f32 = 78, // 64 x f32
134 v128f32 = 79, // 128 x f32
135 v256f32 = 80, // 256 x f32
136 v512f32 = 81, // 512 x f32
137 v1024f32 = 82, // 1024 x f32
138 v2048f32 = 83, // 2048 x f32
139 v1f64 = 84, // 1 x f64
140 v2f64 = 85, // 2 x f64
141 v4f64 = 86, // 4 x f64
142 v8f64 = 87, // 8 x f64
143
144 FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE = v2f16,
145 LAST_FP_FIXEDLEN_VECTOR_VALUETYPE = v8f64,
146
147 FIRST_FIXEDLEN_VECTOR_VALUETYPE = v1i1,
148 LAST_FIXEDLEN_VECTOR_VALUETYPE = v8f64,
149
150 nxv1i1 = 88, // n x 1 x i1
151 nxv2i1 = 89, // n x 2 x i1
152 nxv4i1 = 90, // n x 4 x i1
153 nxv8i1 = 91, // n x 8 x i1
154 nxv16i1 = 92, // n x 16 x i1
155 nxv32i1 = 93, // n x 32 x i1
156
157 nxv1i8 = 94, // n x 1 x i8
158 nxv2i8 = 95, // n x 2 x i8
159 nxv4i8 = 96, // n x 4 x i8
160 nxv8i8 = 97, // n x 8 x i8
161 nxv16i8 = 98, // n x 16 x i8
162 nxv32i8 = 99, // n x 32 x i8
163
164 nxv1i16 = 100, // n x 1 x i16
165 nxv2i16 = 101, // n x 2 x i16
166 nxv4i16 = 102, // n x 4 x i16
167 nxv8i16 = 103, // n x 8 x i16
168 nxv16i16 = 104, // n x 16 x i16
169 nxv32i16 = 105, // n x 32 x i16
170
171 nxv1i32 = 106, // n x 1 x i32
172 nxv2i32 = 107, // n x 2 x i32
173 nxv4i32 = 108, // n x 4 x i32
174 nxv8i32 = 109, // n x 8 x i32
175 nxv16i32 = 110, // n x 16 x i32
176 nxv32i32 = 111, // n x 32 x i32
177
178 nxv1i64 = 112, // n x 1 x i64
179 nxv2i64 = 113, // n x 2 x i64
180 nxv4i64 = 114, // n x 4 x i64
181 nxv8i64 = 115, // n x 8 x i64
182 nxv16i64 = 116, // n x 16 x i64
183 nxv32i64 = 117, // n x 32 x i64
184
185 FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE = nxv1i1,
186 LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE = nxv32i64,
187
188 nxv2f16 = 118, // n x 2 x f16
189 nxv4f16 = 119, // n x 4 x f16
190 nxv8f16 = 120, // n x 8 x f16
191 nxv1f32 = 121, // n x 1 x f32
192 nxv2f32 = 122, // n x 2 x f32
193 nxv4f32 = 123, // n x 4 x f32
194 nxv8f32 = 124, // n x 8 x f32
195 nxv16f32 = 125, // n x 16 x f32
196 nxv1f64 = 126, // n x 1 x f64
197 nxv2f64 = 127, // n x 2 x f64
198 nxv4f64 = 128, // n x 4 x f64
199 nxv8f64 = 129, // n x 8 x f64
200
201 FIRST_FP_SCALABLE_VECTOR_VALUETYPE = nxv2f16,
202 LAST_FP_SCALABLE_VECTOR_VALUETYPE = nxv8f64,
203
204 FIRST_SCALABLE_VECTOR_VALUETYPE = nxv1i1,
205 LAST_SCALABLE_VECTOR_VALUETYPE = nxv8f64,
206
207 FIRST_VECTOR_VALUETYPE = v1i1,
208 LAST_VECTOR_VALUETYPE = nxv8f64,
209
210 x86mmx = 130, // This is an X86 MMX value
211
212 Glue = 131, // This glues nodes together during pre-RA sched
213
214 isVoid = 132, // This has no value
215
216 Untyped = 133, // This value takes a register, but has
217 // unspecified type. The register class
218 // will be determined by the opcode.
219
220 exnref = 134, // WebAssembly's exnref type
221
222 FIRST_VALUETYPE = 1, // This is always the beginning of the list.
223 LAST_VALUETYPE = 135, // This always remains at the end of the list.
224
225 // This is the current maximum for LAST_VALUETYPE.
226 // MVT::MAX_ALLOWED_VALUETYPE is used for asserts and to size bit vectors
227 // This value must be a multiple of 32.
228 MAX_ALLOWED_VALUETYPE = 160,
229
230 // A value of type llvm::TokenTy
231 token = 248,
232
233 // This is MDNode or MDString.
234 Metadata = 249,
235
236 // An int value the size of the pointer of the current
237 // target to any address space. This must only be used internal to
238 // tblgen. Other than for overloading, we treat iPTRAny the same as iPTR.
239 iPTRAny = 250,
240
241 // A vector with any length and element size. This is used
242 // for intrinsics that have overloadings based on vector types.
243 // This is only for tblgen's consumption!
244 vAny = 251,
245
246 // Any floating-point or vector floating-point value. This is used
247 // for intrinsics that have overloadings based on floating-point types.
248 // This is only for tblgen's consumption!
249 fAny = 252,
250
251 // An integer or vector integer value of any bit width. This is
252 // used for intrinsics that have overloadings based on integer bit widths.
253 // This is only for tblgen's consumption!
254 iAny = 253,
255
256 // An int value the size of the pointer of the current
257 // target. This should only be used internal to tblgen!
258 iPTR = 254,
259
260 // Any type. This is used for intrinsics that have overloadings.
261 // This is only for tblgen's consumption!
262 Any = 255
263 };
264
265 SimpleValueType SimpleTy = INVALID_SIMPLE_VALUE_TYPE;
266
267 constexpr MVT() = default;
268 constexpr MVT(SimpleValueType SVT) : SimpleTy(SVT) {}
269
270 bool operator>(const MVT& S) const { return SimpleTy > S.SimpleTy; }
271 bool operator<(const MVT& S) const { return SimpleTy < S.SimpleTy; }
272 bool operator==(const MVT& S) const { return SimpleTy == S.SimpleTy; }
273 bool operator!=(const MVT& S) const { return SimpleTy != S.SimpleTy; }
274 bool operator>=(const MVT& S) const { return SimpleTy >= S.SimpleTy; }
275 bool operator<=(const MVT& S) const { return SimpleTy <= S.SimpleTy; }
276
277 /// Return true if this is a valid simple valuetype.
278 bool isValid() const {
279 return (SimpleTy >= MVT::FIRST_VALUETYPE &&
280 SimpleTy < MVT::LAST_VALUETYPE);
281 }
282
283 /// Return true if this is a FP or a vector FP type.
284 bool isFloatingPoint() const {
285 return ((SimpleTy >= MVT::FIRST_FP_VALUETYPE &&
286 SimpleTy <= MVT::LAST_FP_VALUETYPE) ||
287 (SimpleTy >= MVT::FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE &&
288 SimpleTy <= MVT::LAST_FP_FIXEDLEN_VECTOR_VALUETYPE) ||
289 (SimpleTy >= MVT::FIRST_FP_SCALABLE_VECTOR_VALUETYPE &&
290 SimpleTy <= MVT::LAST_FP_SCALABLE_VECTOR_VALUETYPE));
291 }
292
293 /// Return true if this is an integer or a vector integer type.
294 bool isInteger() const {
295 return ((SimpleTy >= MVT::FIRST_INTEGER_VALUETYPE &&
296 SimpleTy <= MVT::LAST_INTEGER_VALUETYPE) ||
297 (SimpleTy >= MVT::FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE &&
298 SimpleTy <= MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE) ||
299 (SimpleTy >= MVT::FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE &&
300 SimpleTy <= MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE));
301 }
302
303 /// Return true if this is an integer, not including vectors.
304 bool isScalarInteger() const {
305 return (SimpleTy >= MVT::FIRST_INTEGER_VALUETYPE &&
306 SimpleTy <= MVT::LAST_INTEGER_VALUETYPE);
307 }
308
309 /// Return true if this is a vector value type.
310 bool isVector() const {
311 return (SimpleTy >= MVT::FIRST_VECTOR_VALUETYPE &&
312 SimpleTy <= MVT::LAST_VECTOR_VALUETYPE);

Analyzer path notes for this return (deduplicated): 9: Assuming field 'SimpleTy' is >= FIRST_VECTOR_VALUETYPE; 10: Assuming field 'SimpleTy' is <= LAST_VECTOR_VALUETYPE; 23.1/23.2: Field 'SimpleTy' is within the vector value-type range; 11, 24: Returning the value 1, which participates in a condition later.

313 }
314
315 /// Return true if this is a vector value type where the
316 /// runtime length is machine dependent
317 bool isScalableVector() const {
318 return (SimpleTy >= MVT::FIRST_SCALABLE_VECTOR_VALUETYPE &&
319 SimpleTy <= MVT::LAST_SCALABLE_VECTOR_VALUETYPE);
320 }
321
322 bool isFixedLengthVector() const {
323 return (SimpleTy >= MVT::FIRST_FIXEDLEN_VECTOR_VALUETYPE &&
324 SimpleTy <= MVT::LAST_FIXEDLEN_VECTOR_VALUETYPE);
325 }
326
327 /// Return true if this is a 16-bit vector type.
328 bool is16BitVector() const {
329 return (SimpleTy == MVT::v2i8 || SimpleTy == MVT::v1i16 ||
330 SimpleTy == MVT::v16i1);
331 }
332
333 /// Return true if this is a 32-bit vector type.
334 bool is32BitVector() const {
335 return (SimpleTy == MVT::v32i1 || SimpleTy == MVT::v4i8 ||
336 SimpleTy == MVT::v2i16 || SimpleTy == MVT::v1i32 ||
337 SimpleTy == MVT::v2f16 || SimpleTy == MVT::v1f32);
338 }
339
340 /// Return true if this is a 64-bit vector type.
341 bool is64BitVector() const {
342 return (SimpleTy == MVT::v64i1 || SimpleTy == MVT::v8i8 ||
343 SimpleTy == MVT::v4i16 || SimpleTy == MVT::v2i32 ||
344 SimpleTy == MVT::v1i64 || SimpleTy == MVT::v4f16 ||
345 SimpleTy == MVT::v2f32 || SimpleTy == MVT::v1f64);
346 }
347
348 /// Return true if this is a 128-bit vector type.
349 bool is128BitVector() const {
350 return (SimpleTy == MVT::v128i1 || SimpleTy == MVT::v16i8 ||
351 SimpleTy == MVT::v8i16 || SimpleTy == MVT::v4i32 ||
352 SimpleTy == MVT::v2i64 || SimpleTy == MVT::v1i128 ||
353 SimpleTy == MVT::v8f16 || SimpleTy == MVT::v4f32 ||
354 SimpleTy == MVT::v2f64);
355 }
356
357 /// Return true if this is a 256-bit vector type.
358 bool is256BitVector() const {
359 return (SimpleTy == MVT::v16f16 || SimpleTy == MVT::v8f32 ||
360 SimpleTy == MVT::v4f64 || SimpleTy == MVT::v32i8 ||
361 SimpleTy == MVT::v16i16 || SimpleTy == MVT::v8i32 ||
362 SimpleTy == MVT::v4i64 || SimpleTy == MVT::v256i1);
363 }
364
365 /// Return true if this is a 512-bit vector type.
366 bool is512BitVector() const {
367 return (SimpleTy == MVT::v32f16 || SimpleTy == MVT::v16f32 ||
368 SimpleTy == MVT::v8f64 || SimpleTy == MVT::v512i1 ||
369 SimpleTy == MVT::v64i8 || SimpleTy == MVT::v32i16 ||
370 SimpleTy == MVT::v16i32 || SimpleTy == MVT::v8i64);
371 }
372
373 /// Return true if this is a 1024-bit vector type.
374 bool is1024BitVector() const {
375 return (SimpleTy == MVT::v1024i1 || SimpleTy == MVT::v128i8 ||
376 SimpleTy == MVT::v64i16 || SimpleTy == MVT::v32i32 ||
377 SimpleTy == MVT::v16i64);
378 }
379
380 /// Return true if this is a 2048-bit vector type.
381 bool is2048BitVector() const {
382 return (SimpleTy == MVT::v256i8 || SimpleTy == MVT::v128i16 ||
383 SimpleTy == MVT::v64i32 || SimpleTy == MVT::v32i64);
384 }
385
386 /// Return true if this is an overloaded type for TableGen.
387 bool isOverloaded() const {
388 return (SimpleTy==MVT::Any ||
389 SimpleTy==MVT::iAny || SimpleTy==MVT::fAny ||
390 SimpleTy==MVT::vAny || SimpleTy==MVT::iPTRAny);
391 }
392
393 /// Return a VT for a vector type with the same element type but
394 /// half the number of elements.
395 MVT getHalfNumVectorElementsVT() const {
396 MVT EltVT = getVectorElementType();
397 auto EltCnt = getVectorElementCount();
398 assert(!(EltCnt.Min & 1) && "Splitting vector, but not in half!");
399 return getVectorVT(EltVT, EltCnt / 2);
400 }
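
A minimal sketch of the halving helper above (not part of the original header):

  // Plain use of the API shown above.
  MVT Half = MVT(MVT::v8i32).getHalfNumVectorElementsVT();   // MVT::v4i32
  // An odd element count such as v3i32 would trip the "Splitting vector" assert.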
401
402 /// Returns true if the given vector is a power of 2.
403 bool isPow2VectorType() const {
404 unsigned NElts = getVectorNumElements();
405 return !(NElts & (NElts - 1));
406 }
407
408 /// Widens the length of the given vector MVT up to the nearest power of 2
409 /// and returns that type.
410 MVT getPow2VectorType() const {
411 if (isPow2VectorType())
412 return *this;
413
414 unsigned NElts = getVectorNumElements();
415 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
416 return MVT::getVectorVT(getVectorElementType(), Pow2NElts);
417 }
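
A minimal sketch of the power-of-two widening above (not part of the original header):

  // Plain use of the API shown above.
  MVT Wide = MVT(MVT::v3i32).getPow2VectorType();   // v4i32: 1 << Log2_32_Ceil(3)
  MVT Same = MVT(MVT::v4f32).getPow2VectorType();   // already a power of two, unchanged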
418
419 /// If this is a vector, return the element type, otherwise return this.
420 MVT getScalarType() const {
421 return isVector() ? getVectorElementType() : *this;
422 }
423
424 MVT getVectorElementType() const {
425 switch (SimpleTy) {
426 default:
427 llvm_unreachable("Not a vector MVT!");
428 case v1i1:
429 case v2i1:
430 case v4i1:
431 case v8i1:
432 case v16i1:
433 case v32i1:
434 case v64i1:
435 case v128i1:
436 case v256i1:
437 case v512i1:
438 case v1024i1:
439 case nxv1i1:
440 case nxv2i1:
441 case nxv4i1:
442 case nxv8i1:
443 case nxv16i1:
444 case nxv32i1: return i1;
445 case v1i8:
446 case v2i8:
447 case v4i8:
448 case v8i8:
449 case v16i8:
450 case v32i8:
451 case v64i8:
452 case v128i8:
453 case v256i8:
454 case nxv1i8:
455 case nxv2i8:
456 case nxv4i8:
457 case nxv8i8:
458 case nxv16i8:
459 case nxv32i8: return i8;
460 case v1i16:
461 case v2i16:
462 case v3i16:
463 case v4i16:
464 case v8i16:
465 case v16i16:
466 case v32i16:
467 case v64i16:
468 case v128i16:
469 case nxv1i16:
470 case nxv2i16:
471 case nxv4i16:
472 case nxv8i16:
473 case nxv16i16:
474 case nxv32i16: return i16;
475 case v1i32:
476 case v2i32:
477 case v3i32:
478 case v4i32:
479 case v5i32:
480 case v8i32:
481 case v16i32:
482 case v32i32:
483 case v64i32:
484 case v128i32:
485 case v256i32:
486 case v512i32:
487 case v1024i32:
488 case v2048i32:
489 case nxv1i32:
490 case nxv2i32:
491 case nxv4i32:
492 case nxv8i32:
493 case nxv16i32:
494 case nxv32i32: return i32;
495 case v1i64:
496 case v2i64:
497 case v4i64:
498 case v8i64:
499 case v16i64:
500 case v32i64:
501 case nxv1i64:
502 case nxv2i64:
503 case nxv4i64:
504 case nxv8i64:
505 case nxv16i64:
506 case nxv32i64: return i64;
507 case v1i128: return i128;
508 case v2f16:
509 case v3f16:
510 case v4f16:
511 case v8f16:
512 case v16f16:
513 case v32f16:
514 case nxv2f16:
515 case nxv4f16:
516 case nxv8f16: return f16;
517 case v1f32:
518 case v2f32:
519 case v3f32:
520 case v4f32:
521 case v5f32:
522 case v8f32:
523 case v16f32:
524 case v32f32:
525 case v64f32:
526 case v128f32:
527 case v256f32:
528 case v512f32:
529 case v1024f32:
530 case v2048f32:
531 case nxv1f32:
532 case nxv2f32:
533 case nxv4f32:
534 case nxv8f32:
535 case nxv16f32: return f32;
536 case v1f64:
537 case v2f64:
538 case v4f64:
539 case v8f64:
540 case nxv1f64:
541 case nxv2f64:
542 case nxv4f64:
543 case nxv8f64: return f64;
544 }
545 }
546
547 unsigned getVectorNumElements() const {
548 switch (SimpleTy) {
549 default:
550 llvm_unreachable("Not a vector MVT!");
551 case v2048i32:
552 case v2048f32: return 2048;
553 case v1024i1:
554 case v1024i32:
555 case v1024f32: return 1024;
556 case v512i1:
557 case v512i32:
558 case v512f32: return 512;
559 case v256i1:
560 case v256i8:
561 case v256i32:
562 case v256f32: return 256;
563 case v128i1:
564 case v128i8:
565 case v128i16:
566 case v128i32:
567 case v128f32: return 128;
568 case v64i1:
569 case v64i8:
570 case v64i16:
571 case v64i32:
572 case v64f32: return 64;
573 case v32i1:
574 case v32i8:
575 case v32i16:
576 case v32i32:
577 case v32i64:
578 case v32f16:
579 case v32f32:
580 case nxv32i1:
581 case nxv32i8:
582 case nxv32i16:
583 case nxv32i32:
584 case nxv32i64: return 32;
585 case v16i1:
586 case v16i8:
587 case v16i16:
588 case v16i32:
589 case v16i64:
590 case v16f16:
591 case v16f32:
592 case nxv16i1:
593 case nxv16i8:
594 case nxv16i16:
595 case nxv16i32:
596 case nxv16i64:
597 case nxv16f32: return 16;
598 case v8i1:
599 case v8i8:
600 case v8i16:
601 case v8i32:
602 case v8i64:
603 case v8f16:
604 case v8f32:
605 case v8f64:
606 case nxv8i1:
607 case nxv8i8:
608 case nxv8i16:
609 case nxv8i32:
610 case nxv8i64:
611 case nxv8f16:
612 case nxv8f32:
613 case nxv8f64: return 8;
614 case v5i32:
615 case v5f32: return 5;
616 case v4i1:
617 case v4i8:
618 case v4i16:
619 case v4i32:
620 case v4i64:
621 case v4f16:
622 case v4f32:
623 case v4f64:
624 case nxv4i1:
625 case nxv4i8:
626 case nxv4i16:
627 case nxv4i32:
628 case nxv4i64:
629 case nxv4f16:
630 case nxv4f32:
631 case nxv4f64: return 4;
632 case v3i16:
633 case v3i32:
634 case v3f16:
635 case v3f32: return 3;
636 case v2i1:
637 case v2i8:
638 case v2i16:
639 case v2i32:
640 case v2i64:
641 case v2f16:
642 case v2f32:
643 case v2f64:
644 case nxv2i1:
645 case nxv2i8:
646 case nxv2i16:
647 case nxv2i32:
648 case nxv2i64:
649 case nxv2f16:
650 case nxv2f32:
651 case nxv2f64: return 2;
652 case v1i1:
653 case v1i8:
654 case v1i16:
655 case v1i32:
656 case v1i64:
657 case v1i128:
658 case v1f32:
659 case v1f64:
660 case nxv1i1:
661 case nxv1i8:
662 case nxv1i16:
663 case nxv1i32:
664 case nxv1i64:
665 case nxv1f32:
666 case nxv1f64: return 1;
667 }
668 }
669
670 ElementCount getVectorElementCount() const {
671 return { getVectorNumElements(), isScalableVector() };
672 }
673
674 unsigned getSizeInBits() const {
675 switch (SimpleTy) {
676 default:
677 llvm_unreachable("getSizeInBits called on extended MVT.");
678 case Other:
679 llvm_unreachable("Value type is non-standard value, Other.");
680 case iPTR:
681 llvm_unreachable("Value type size is target-dependent. Ask TLI.");
682 case iPTRAny:
683 case iAny:
684 case fAny:
685 case vAny:
686 case Any:
687 llvm_unreachable("Value type is overloaded.");
688 case token:
689 llvm_unreachable("Token type is a sentinel that cannot be used "
690 "in codegen and has no size");
691 case Metadata:
692 llvm_unreachable("Value type is metadata.");
693 case i1:
694 case v1i1:
695 case nxv1i1: return 1;
696 case v2i1:
697 case nxv2i1: return 2;
698 case v4i1:
699 case nxv4i1: return 4;
700 case i8 :
701 case v1i8:
702 case v8i1:
703 case nxv1i8:
704 case nxv8i1: return 8;
705 case i16 :
706 case f16:
707 case v16i1:
708 case v2i8:
709 case v1i16:
710 case nxv16i1:
711 case nxv2i8:
712 case nxv1i16: return 16;
713 case f32 :
714 case i32 :
715 case v32i1:
716 case v4i8:
717 case v2i16:
718 case v2f16:
719 case v1f32:
720 case v1i32:
721 case nxv32i1:
722 case nxv4i8:
723 case nxv2i16:
724 case nxv1i32:
725 case nxv2f16:
726 case nxv1f32: return 32;
727 case v3i16:
728 case v3f16: return 48;
729 case x86mmx:
730 case f64 :
731 case i64 :
732 case v64i1:
733 case v8i8:
734 case v4i16:
735 case v2i32:
736 case v1i64:
737 case v4f16:
738 case v2f32:
739 case v1f64:
740 case nxv8i8:
741 case nxv4i16:
742 case nxv2i32:
743 case nxv1i64:
744 case nxv4f16:
745 case nxv2f32:
746 case nxv1f64: return 64;
747 case f80 : return 80;
748 case v3i32:
749 case v3f32: return 96;
750 case f128:
751 case ppcf128:
752 case i128:
753 case v128i1:
754 case v16i8:
755 case v8i16:
756 case v4i32:
757 case v2i64:
758 case v1i128:
759 case v8f16:
760 case v4f32:
761 case v2f64:
762 case nxv16i8:
763 case nxv8i16:
764 case nxv4i32:
765 case nxv2i64:
766 case nxv8f16:
767 case nxv4f32:
768 case nxv2f64: return 128;
769 case v5i32:
770 case v5f32: return 160;
771 case v256i1:
772 case v32i8:
773 case v16i16:
774 case v8i32:
775 case v4i64:
776 case v16f16:
777 case v8f32:
778 case v4f64:
779 case nxv32i8:
780 case nxv16i16:
781 case nxv8i32:
782 case nxv4i64:
783 case nxv8f32:
784 case nxv4f64: return 256;
785 case v512i1:
786 case v64i8:
787 case v32i16:
788 case v16i32:
789 case v8i64:
790 case v32f16:
791 case v16f32:
792 case v8f64:
793 case nxv32i16:
794 case nxv16i32:
795 case nxv8i64:
796 case nxv16f32:
797 case nxv8f64: return 512;
798 case v1024i1:
799 case v128i8:
800 case v64i16:
801 case v32i32:
802 case v16i64:
803 case v32f32:
804 case nxv32i32:
805 case nxv16i64: return 1024;
806 case v256i8:
807 case v128i16:
808 case v64i32:
809 case v32i64:
810 case v64f32:
811 case nxv32i64: return 2048;
812 case v128i32:
813 case v128f32: return 4096;
814 case v256i32:
815 case v256f32: return 8192;
816 case v512i32:
817 case v512f32: return 16384;
818 case v1024i32:
819 case v1024f32: return 32768;
820 case v2048i32:
821 case v2048f32: return 65536;
822 case exnref: return 0; // opaque type
823 }
824 }
825
826 unsigned getScalarSizeInBits() const {
827 return getScalarType().getSizeInBits();
828 }
829
830 /// Return the number of bytes overwritten by a store of the specified value
831 /// type.
832 unsigned getStoreSize() const {
833 return (getSizeInBits() + 7) / 8;
834 }
835
836 /// Return the number of bits overwritten by a store of the specified value
837 /// type.
838 unsigned getStoreSizeInBits() const {
839 return getStoreSize() * 8;
840 }
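
A minimal sketch of the size helpers above (not part of the original header); the store size rounds up to whole bytes:

  // Plain use of the APIs shown above.
  unsigned Bits   = MVT(MVT::v3i16).getSizeInBits();     // 48
  unsigned Bytes  = MVT(MVT::v3i16).getStoreSize();      // (48 + 7) / 8 == 6
  unsigned I1Bits = MVT(MVT::i1).getStoreSizeInBits();   // 8: a 1-bit value stores as a byte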
841
842 /// Return true if this has more bits than VT.
843 bool bitsGT(MVT VT) const {
844 return getSizeInBits() > VT.getSizeInBits();
845 }
846
847 /// Return true if this has no less bits than VT.
848 bool bitsGE(MVT VT) const {
849 return getSizeInBits() >= VT.getSizeInBits();
850 }
851
852 /// Return true if this has less bits than VT.
853 bool bitsLT(MVT VT) const {
854 return getSizeInBits() < VT.getSizeInBits();
855 }
856
857 /// Return true if this has no more bits than VT.
858 bool bitsLE(MVT VT) const {
859 return getSizeInBits() <= VT.getSizeInBits();
860 }
861
862 static MVT getFloatingPointVT(unsigned BitWidth) {
863 switch (BitWidth) {
864 default:
865 llvm_unreachable("Bad bit width!");
866 case 16:
867 return MVT::f16;
868 case 32:
869 return MVT::f32;
870 case 64:
871 return MVT::f64;
872 case 80:
873 return MVT::f80;
874 case 128:
875 return MVT::f128;
876 }
877 }
878
879 static MVT getIntegerVT(unsigned BitWidth) {
880 switch (BitWidth) {
881 default:
882 return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE);
883 case 1:
884 return MVT::i1;
885 case 8:
886 return MVT::i8;
887 case 16:
888 return MVT::i16;
889 case 32:
890 return MVT::i32;
891 case 64:
892 return MVT::i64;
893 case 128:
894 return MVT::i128;
895 }
896 }
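
A minimal sketch of the factory functions above (not part of the original header); widths outside the listed set yield INVALID_SIMPLE_VALUE_TYPE, which EVT later turns into an extended type:

  // Plain use of the APIs shown above.
  MVT I32 = MVT::getIntegerVT(32);        // MVT::i32
  MVT Odd = MVT::getIntegerVT(48);        // SimpleTy == INVALID_SIMPLE_VALUE_TYPE
  MVT F80 = MVT::getFloatingPointVT(80);  // MVT::f80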
897
898 static MVT getVectorVT(MVT VT, unsigned NumElements) {
899 switch (VT.SimpleTy) {
900 default:
901 break;
902 case MVT::i1:
903 if (NumElements == 1) return MVT::v1i1;
904 if (NumElements == 2) return MVT::v2i1;
905 if (NumElements == 4) return MVT::v4i1;
906 if (NumElements == 8) return MVT::v8i1;
907 if (NumElements == 16) return MVT::v16i1;
908 if (NumElements == 32) return MVT::v32i1;
909 if (NumElements == 64) return MVT::v64i1;
910 if (NumElements == 128) return MVT::v128i1;
911 if (NumElements == 256) return MVT::v256i1;
912 if (NumElements == 512) return MVT::v512i1;
913 if (NumElements == 1024) return MVT::v1024i1;
914 break;
915 case MVT::i8:
916 if (NumElements == 1) return MVT::v1i8;
917 if (NumElements == 2) return MVT::v2i8;
918 if (NumElements == 4) return MVT::v4i8;
919 if (NumElements == 8) return MVT::v8i8;
920 if (NumElements == 16) return MVT::v16i8;
921 if (NumElements == 32) return MVT::v32i8;
922 if (NumElements == 64) return MVT::v64i8;
923 if (NumElements == 128) return MVT::v128i8;
924 if (NumElements == 256) return MVT::v256i8;
925 break;
926 case MVT::i16:
927 if (NumElements == 1) return MVT::v1i16;
928 if (NumElements == 2) return MVT::v2i16;
929 if (NumElements == 3) return MVT::v3i16;
930 if (NumElements == 4) return MVT::v4i16;
931 if (NumElements == 8) return MVT::v8i16;
932 if (NumElements == 16) return MVT::v16i16;
933 if (NumElements == 32) return MVT::v32i16;
934 if (NumElements == 64) return MVT::v64i16;
935 if (NumElements == 128) return MVT::v128i16;
936 break;
937 case MVT::i32:
938 if (NumElements == 1) return MVT::v1i32;
939 if (NumElements == 2) return MVT::v2i32;
940 if (NumElements == 3) return MVT::v3i32;
941 if (NumElements == 4) return MVT::v4i32;
942 if (NumElements == 5) return MVT::v5i32;
943 if (NumElements == 8) return MVT::v8i32;
944 if (NumElements == 16) return MVT::v16i32;
945 if (NumElements == 32) return MVT::v32i32;
946 if (NumElements == 64) return MVT::v64i32;
947 if (NumElements == 128) return MVT::v128i32;
948 if (NumElements == 256) return MVT::v256i32;
949 if (NumElements == 512) return MVT::v512i32;
950 if (NumElements == 1024) return MVT::v1024i32;
951 if (NumElements == 2048) return MVT::v2048i32;
952 break;
953 case MVT::i64:
954 if (NumElements == 1) return MVT::v1i64;
955 if (NumElements == 2) return MVT::v2i64;
956 if (NumElements == 4) return MVT::v4i64;
957 if (NumElements == 8) return MVT::v8i64;
958 if (NumElements == 16) return MVT::v16i64;
959 if (NumElements == 32) return MVT::v32i64;
960 break;
961 case MVT::i128:
962 if (NumElements == 1) return MVT::v1i128;
963 break;
964 case MVT::f16:
965 if (NumElements == 2) return MVT::v2f16;
966 if (NumElements == 3) return MVT::v3f16;
967 if (NumElements == 4) return MVT::v4f16;
968 if (NumElements == 8) return MVT::v8f16;
969 if (NumElements == 16) return MVT::v16f16;
970 if (NumElements == 32) return MVT::v32f16;
971 break;
972 case MVT::f32:
973 if (NumElements == 1) return MVT::v1f32;
974 if (NumElements == 2) return MVT::v2f32;
975 if (NumElements == 3) return MVT::v3f32;
976 if (NumElements == 4) return MVT::v4f32;
977 if (NumElements == 5) return MVT::v5f32;
978 if (NumElements == 8) return MVT::v8f32;
979 if (NumElements == 16) return MVT::v16f32;
980 if (NumElements == 32) return MVT::v32f32;
981 if (NumElements == 64) return MVT::v64f32;
982 if (NumElements == 128) return MVT::v128f32;
983 if (NumElements == 256) return MVT::v256f32;
984 if (NumElements == 512) return MVT::v512f32;
985 if (NumElements == 1024) return MVT::v1024f32;
986 if (NumElements == 2048) return MVT::v2048f32;
987 break;
988 case MVT::f64:
989 if (NumElements == 1) return MVT::v1f64;
990 if (NumElements == 2) return MVT::v2f64;
991 if (NumElements == 4) return MVT::v4f64;
992 if (NumElements == 8) return MVT::v8f64;
993 break;
994 }
995 return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE);
996 }
997
998 static MVT getScalableVectorVT(MVT VT, unsigned NumElements) {
999 switch(VT.SimpleTy) {
1000 default:
1001 break;
1002 case MVT::i1:
1003 if (NumElements == 1) return MVT::nxv1i1;
1004 if (NumElements == 2) return MVT::nxv2i1;
1005 if (NumElements == 4) return MVT::nxv4i1;
1006 if (NumElements == 8) return MVT::nxv8i1;
1007 if (NumElements == 16) return MVT::nxv16i1;
1008 if (NumElements == 32) return MVT::nxv32i1;
1009 break;
1010 case MVT::i8:
1011 if (NumElements == 1) return MVT::nxv1i8;
1012 if (NumElements == 2) return MVT::nxv2i8;
1013 if (NumElements == 4) return MVT::nxv4i8;
1014 if (NumElements == 8) return MVT::nxv8i8;
1015 if (NumElements == 16) return MVT::nxv16i8;
1016 if (NumElements == 32) return MVT::nxv32i8;
1017 break;
1018 case MVT::i16:
1019 if (NumElements == 1) return MVT::nxv1i16;
1020 if (NumElements == 2) return MVT::nxv2i16;
1021 if (NumElements == 4) return MVT::nxv4i16;
1022 if (NumElements == 8) return MVT::nxv8i16;
1023 if (NumElements == 16) return MVT::nxv16i16;
1024 if (NumElements == 32) return MVT::nxv32i16;
1025 break;
1026 case MVT::i32:
1027 if (NumElements == 1) return MVT::nxv1i32;
1028 if (NumElements == 2) return MVT::nxv2i32;
1029 if (NumElements == 4) return MVT::nxv4i32;
1030 if (NumElements == 8) return MVT::nxv8i32;
1031 if (NumElements == 16) return MVT::nxv16i32;
1032 if (NumElements == 32) return MVT::nxv32i32;
1033 break;
1034 case MVT::i64:
1035 if (NumElements == 1) return MVT::nxv1i64;
1036 if (NumElements == 2) return MVT::nxv2i64;
1037 if (NumElements == 4) return MVT::nxv4i64;
1038 if (NumElements == 8) return MVT::nxv8i64;
1039 if (NumElements == 16) return MVT::nxv16i64;
1040 if (NumElements == 32) return MVT::nxv32i64;
1041 break;
1042 case MVT::f16:
1043 if (NumElements == 2) return MVT::nxv2f16;
1044 if (NumElements == 4) return MVT::nxv4f16;
1045 if (NumElements == 8) return MVT::nxv8f16;
1046 break;
1047 case MVT::f32:
1048 if (NumElements == 1) return MVT::nxv1f32;
1049 if (NumElements == 2) return MVT::nxv2f32;
1050 if (NumElements == 4) return MVT::nxv4f32;
1051 if (NumElements == 8) return MVT::nxv8f32;
1052 if (NumElements == 16) return MVT::nxv16f32;
1053 break;
1054 case MVT::f64:
1055 if (NumElements == 1) return MVT::nxv1f64;
1056 if (NumElements == 2) return MVT::nxv2f64;
1057 if (NumElements == 4) return MVT::nxv4f64;
1058 if (NumElements == 8) return MVT::nxv8f64;
1059 break;
1060 }
1061 return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE);
1062 }
1063
1064 static MVT getVectorVT(MVT VT, unsigned NumElements, bool IsScalable) {
1065 if (IsScalable)
1066 return getScalableVectorVT(VT, NumElements);
1067 return getVectorVT(VT, NumElements);
1068 }
1069
1070 static MVT getVectorVT(MVT VT, ElementCount EC) {
1071 if (EC.Scalable)
1072 return getScalableVectorVT(VT, EC.Min);
1073 return getVectorVT(VT, EC.Min);
1074 }
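
A minimal sketch of the scalable/fixed dispatch above (not part of the original header):

  // Plain use of the overloads shown above.
  MVT Fixed    = MVT::getVectorVT(MVT::f32, 4, /*IsScalable=*/false);  // MVT::v4f32
  MVT Scalable = MVT::getVectorVT(MVT::i32, 4, /*IsScalable=*/true);   // MVT::nxv4i32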
1075
1076 /// Return the value type corresponding to the specified type. This returns
1077 /// all pointers as iPTR. If HandleUnknown is true, unknown types are
1078 /// returned as Other, otherwise they are invalid.
1079 static MVT getVT(Type *Ty, bool HandleUnknown = false);
1080
1081 private:
1082 /// A simple iterator over the MVT::SimpleValueType enum.
1083 struct mvt_iterator {
1084 SimpleValueType VT;
1085
1086 mvt_iterator(SimpleValueType VT) : VT(VT) {}
1087
1088 MVT operator*() const { return VT; }
1089 bool operator!=(const mvt_iterator &LHS) const { return VT != LHS.VT; }
1090
1091 mvt_iterator& operator++() {
1092 VT = (MVT::SimpleValueType)((int)VT + 1);
1093 assert((int)VT <= MVT::MAX_ALLOWED_VALUETYPE &&
1094 "MVT iterator overflowed.");
1095 return *this;
1096 }
1097 };
1098
1099 /// A range of the MVT::SimpleValueType enum.
1100 using mvt_range = iterator_range<mvt_iterator>;
1101
1102 public:
1103 /// SimpleValueType Iteration
1104 /// @{
1105 static mvt_range all_valuetypes() {
1106 return mvt_range(MVT::FIRST_VALUETYPE, MVT::LAST_VALUETYPE);
1107 }
1108
1109 static mvt_range integer_valuetypes() {
1110 return mvt_range(MVT::FIRST_INTEGER_VALUETYPE,
1111 (MVT::SimpleValueType)(MVT::LAST_INTEGER_VALUETYPE + 1));
1112 }
1113
1114 static mvt_range fp_valuetypes() {
1115 return mvt_range(MVT::FIRST_FP_VALUETYPE,
1116 (MVT::SimpleValueType)(MVT::LAST_FP_VALUETYPE + 1));
1117 }
1118
1119 static mvt_range vector_valuetypes() {
1120 return mvt_range(MVT::FIRST_VECTOR_VALUETYPE,
1121 (MVT::SimpleValueType)(MVT::LAST_VECTOR_VALUETYPE + 1));
1122 }
1123
1124 static mvt_range fixedlen_vector_valuetypes() {
1125 return mvt_range(
1126 MVT::FIRST_FIXEDLEN_VECTOR_VALUETYPE,
1127 (MVT::SimpleValueType)(MVT::LAST_FIXEDLEN_VECTOR_VALUETYPE + 1));
1128 }
1129
1130 static mvt_range scalable_vector_valuetypes() {
1131 return mvt_range(
1132 MVT::FIRST_SCALABLE_VECTOR_VALUETYPE,
1133 (MVT::SimpleValueType)(MVT::LAST_SCALABLE_VECTOR_VALUETYPE + 1));
1134 }
1135
1136 static mvt_range integer_fixedlen_vector_valuetypes() {
1137 return mvt_range(
1138 MVT::FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE,
1139 (MVT::SimpleValueType)(MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE + 1));
1140 }
1141
1142 static mvt_range fp_fixedlen_vector_valuetypes() {
1143 return mvt_range(
1144 MVT::FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE,
1145 (MVT::SimpleValueType)(MVT::LAST_FP_FIXEDLEN_VECTOR_VALUETYPE + 1));
1146 }
1147
1148 static mvt_range integer_scalable_vector_valuetypes() {
1149 return mvt_range(
1150 MVT::FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE,
1151 (MVT::SimpleValueType)(MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE + 1));
1152 }
1153
1154 static mvt_range fp_scalable_vector_valuetypes() {
1155 return mvt_range(
1156 MVT::FIRST_FP_SCALABLE_VECTOR_VALUETYPE,
1157 (MVT::SimpleValueType)(MVT::LAST_FP_SCALABLE_VECTOR_VALUETYPE + 1));
1158 }
1159 /// @}
1160 };
1161
1162} // end namespace llvm
1163
1164#endif // LLVM_SUPPORT_MACHINEVALUETYPE_H

/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/include/llvm/CodeGen/ValueTypes.h

1//===- CodeGen/ValueTypes.h - Low-Level Target independ. types --*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the set of low-level target independent types which various
10// values in the code generator are. This allows the target specific behavior
11// of instructions to be described to target independent passes.
12//
13//===----------------------------------------------------------------------===//
14
15#ifndef LLVM_CODEGEN_VALUETYPES_H
16#define LLVM_CODEGEN_VALUETYPES_H
17
18#include "llvm/Support/Compiler.h"
19#include "llvm/Support/MachineValueType.h"
20#include "llvm/Support/MathExtras.h"
21#include <cassert>
22#include <cstdint>
23#include <string>
24
25namespace llvm {
26
27 class LLVMContext;
28 class Type;
29
30 /// Extended Value Type. Capable of holding value types which are not native
31 /// for any processor (such as the i12345 type), as well as the types an MVT
32 /// can represent.
33 struct EVT {
34 private:
35 MVT V = MVT::INVALID_SIMPLE_VALUE_TYPE;
36 Type *LLVMTy = nullptr;
37
38 public:
39 constexpr EVT() = default;
40 constexpr EVT(MVT::SimpleValueType SVT) : V(SVT) {}
41 constexpr EVT(MVT S) : V(S) {}
42
43 bool operator==(EVT VT) const {
44 return !(*this != VT);
45 }
46 bool operator!=(EVT VT) const {
47 if (V.SimpleTy != VT.V.SimpleTy)
48 return true;
49 if (V.SimpleTy == MVT::INVALID_SIMPLE_VALUE_TYPE)
50 return LLVMTy != VT.LLVMTy;
51 return false;
52 }
53
54 /// Returns the EVT that represents a floating-point type with the given
55 /// number of bits. There are two floating-point types with 128 bits - this
56 /// returns f128 rather than ppcf128.
57 static EVT getFloatingPointVT(unsigned BitWidth) {
58 return MVT::getFloatingPointVT(BitWidth);
59 }
60
61 /// Returns the EVT that represents an integer with the given number of
62 /// bits.
63 static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth) {
64 MVT M = MVT::getIntegerVT(BitWidth);
65 if (M.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE)
66 return M;
67 return getExtendedIntegerVT(Context, BitWidth);
68 }
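
A minimal sketch of the simple-vs-extended fallback above (not part of the original header; Ctx is an assumed LLVMContext):

  // Assumed identifier: Ctx.
  EVT Simple   = EVT::getIntegerVT(Ctx, 64);     // wraps MVT::i64, isSimple() == true
  EVT Extended = EVT::getIntegerVT(Ctx, 12345);  // the "i12345" case, isSimple() == false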
69
70 /// Returns the EVT that represents a vector NumElements in length, where
71 /// each element is of type VT.
72 static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements,
73 bool IsScalable = false) {
74 MVT M = MVT::getVectorVT(VT.V, NumElements, IsScalable);
75 if (M.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE)
76 return M;
77
78 assert(!IsScalable && "We don't support extended scalable types yet");
79 return getExtendedVectorVT(Context, VT, NumElements);
80 }
81
82 /// Returns the EVT that represents a vector EC.Min elements in length,
83 /// where each element is of type VT.
84 static EVT getVectorVT(LLVMContext &Context, EVT VT, ElementCount EC) {
85 MVT M = MVT::getVectorVT(VT.V, EC);
86 if (M.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE)
87 return M;
88 assert (!EC.Scalable && "We don't support extended scalable types yet");
89 return getExtendedVectorVT(Context, VT, EC.Min);
90 }
91
92 /// Return a vector with the same number of elements as this vector, but
93 /// with the element type converted to an integer type with the same
94 /// bitwidth.
95 EVT changeVectorElementTypeToInteger() const {
96 if (!isSimple()) {
97 assert (!isScalableVector() &&
98 "We don't support extended scalable types yet");
99 return changeExtendedVectorElementTypeToInteger();
100 }
101 MVT EltTy = getSimpleVT().getVectorElementType();
102 unsigned BitWidth = EltTy.getSizeInBits();
103 MVT IntTy = MVT::getIntegerVT(BitWidth);
104 MVT VecTy = MVT::getVectorVT(IntTy, getVectorNumElements(),
105 isScalableVector());
106 assert(VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE &&
107 "Simple vector VT not representable by simple integer vector VT!");
108 return VecTy;
109 }
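
A minimal sketch of the element-type conversion above (not part of the original header):

  // Plain use of the API shown above.
  EVT IntVec = EVT(MVT::v4f32).changeVectorElementTypeToInteger();  // MVT::v4i32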
110
111 /// Return the type converted to an equivalently sized integer or vector
112 /// with integer element type. Similar to changeVectorElementTypeToInteger,
113 /// but also handles scalars.
114 EVT changeTypeToInteger() {
115 if (isVector())
116 return changeVectorElementTypeToInteger();
117
118 if (isSimple())
119 return MVT::getIntegerVT(getSizeInBits());
120
121 return changeExtendedTypeToInteger();
122 }
123
124 /// Test if the given EVT is simple (as opposed to being extended).
125 bool isSimple() const {
126 return V.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE;

Analyzer path notes for this return (deduplicated): 18.1: Field 'SimpleTy' is not equal to INVALID_SIMPLE_VALUE_TYPE; 19: Returning the value 1, which participates in a condition later.

127 }
128
129 /// Test if the given EVT is extended (as opposed to being simple).
130 bool isExtended() const {
131 return !isSimple();
132 }
133
134 /// Return true if this is a FP or a vector FP type.
135 bool isFloatingPoint() const {
136 return isSimple() ? V.isFloatingPoint() : isExtendedFloatingPoint();
137 }
138
139 /// Return true if this is an integer or a vector integer type.
140 bool isInteger() const {
141 return isSimple() ? V.isInteger() : isExtendedInteger();
142 }
143
144 /// Return true if this is an integer, but not a vector.
145 bool isScalarInteger() const {
146 return isSimple() ? V.isScalarInteger() : isExtendedScalarInteger();
147 }
148
149 /// Return true if this is a vector value type.
150 bool isVector() const {
151 return isSimple() ? V.isVector() : isExtendedVector();
Analyzer path notes for this return (deduplicated): 22: '?' condition is true; 23, 25: Calling and returning from 'MVT::isVector'; 26: Returning the value 1, which participates in a condition later.
152 }
153
154 /// Return true if this is a vector type where the runtime
155 /// length is machine dependent
156 bool isScalableVector() const {
157 // FIXME: We don't support extended scalable types yet, because the
158 // matching IR type doesn't exist. Once it has been added, this can
159 // be changed to call isExtendedScalableVector.
160 if (!isSimple())
161 return false;
162 return V.isScalableVector();
163 }
164
165 /// Return true if this is a 16-bit vector type.
166 bool is16BitVector() const {
167 return isSimple() ? V.is16BitVector() : isExtended16BitVector();
168 }
169
170 /// Return true if this is a 32-bit vector type.
171 bool is32BitVector() const {
172 return isSimple() ? V.is32BitVector() : isExtended32BitVector();
173 }
174
175 /// Return true if this is a 64-bit vector type.
176 bool is64BitVector() const {
177 return isSimple() ? V.is64BitVector() : isExtended64BitVector();
178 }
179
180 /// Return true if this is a 128-bit vector type.
181 bool is128BitVector() const {
182 return isSimple() ? V.is128BitVector() : isExtended128BitVector();
183 }
184
185 /// Return true if this is a 256-bit vector type.
186 bool is256BitVector() const {
187 return isSimple() ? V.is256BitVector() : isExtended256BitVector();
188 }
189
190 /// Return true if this is a 512-bit vector type.
191 bool is512BitVector() const {
192 return isSimple() ? V.is512BitVector() : isExtended512BitVector();
193 }
194
195 /// Return true if this is a 1024-bit vector type.
196 bool is1024BitVector() const {
197 return isSimple() ? V.is1024BitVector() : isExtended1024BitVector();
198 }
199
200 /// Return true if this is a 2048-bit vector type.
201 bool is2048BitVector() const {
202 return isSimple() ? V.is2048BitVector() : isExtended2048BitVector();
203 }
204
205 /// Return true if this is an overloaded type for TableGen.
206 bool isOverloaded() const {
207 return (V==MVT::iAny || V==MVT::fAny || V==MVT::vAny || V==MVT::iPTRAny);
208 }
209
210 /// Return true if the bit size is a multiple of 8.
211 bool isByteSized() const {
212 return (getSizeInBits() & 7) == 0;
213 }
214
215 /// Return true if the size is a power-of-two number of bytes.
216 bool isRound() const {
217 unsigned BitSize = getSizeInBits();
218 return BitSize >= 8 && !(BitSize & (BitSize - 1));
219 }
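
A minimal sketch of the two predicates above (not part of the original header): isByteSized() only checks divisibility by 8, while isRound() also requires a power-of-two number of bytes.

  // Plain use of the APIs shown above.
  bool BS = EVT(MVT::v3i16).isByteSized();  // true: 48 bits
  bool RD = EVT(MVT::v3i16).isRound();      // false: 6 bytes is not a power of two
  bool R8 = EVT(MVT::i64).isRound();        // true: 8 bytes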
220
221 /// Return true if this has the same number of bits as VT.
222 bool bitsEq(EVT VT) const {
223 if (EVT::operator==(VT)) return true;
224 return getSizeInBits() == VT.getSizeInBits();
225 }
226
227 /// Return true if this has more bits than VT.
228 bool bitsGT(EVT VT) const {
229 if (EVT::operator==(VT)) return false;
230 return getSizeInBits() > VT.getSizeInBits();
231 }
232
233 /// Return true if this has no less bits than VT.
234 bool bitsGE(EVT VT) const {
235 if (EVT::operator==(VT)) return true;
236 return getSizeInBits() >= VT.getSizeInBits();
237 }
238
239 /// Return true if this has less bits than VT.
240 bool bitsLT(EVT VT) const {
241 if (EVT::operator==(VT)) return false;
242 return getSizeInBits() < VT.getSizeInBits();
243 }
244
245 /// Return true if this has no more bits than VT.
246 bool bitsLE(EVT VT) const {
247 if (EVT::operator==(VT)) return true;
248 return getSizeInBits() <= VT.getSizeInBits();
249 }
250
251 /// Return the SimpleValueType held in the specified simple EVT.
252 MVT getSimpleVT() const {
253 assert(isSimple() && "Expected a SimpleValueType!");
254 return V;
255 }
256
257 /// If this is a vector type, return the element type, otherwise return
258 /// this.
259 EVT getScalarType() const {
260 return isVector() ? getVectorElementType() : *this;
261 }
262
263 /// Given a vector type, return the type of each element.
264 EVT getVectorElementType() const {
265 assert(isVector() && "Invalid vector type!");
266 if (isSimple())
267 return V.getVectorElementType();
268 return getExtendedVectorElementType();
269 }
270
271 /// Given a vector type, return the number of elements it contains.
272 unsigned getVectorNumElements() const {
273 assert(isVector() && "Invalid vector type!");
274 if (isSimple())
275 return V.getVectorNumElements();
276 return getExtendedVectorNumElements();
277 }
278
279 // Given a (possibly scalable) vector type, return the ElementCount
280 ElementCount getVectorElementCount() const {
281 assert((isVector()) && "Invalid vector type!");
282 if (isSimple())
283 return V.getVectorElementCount();
284
285 assert(!isScalableVector() &&
286 "We don't support extended scalable types yet");
287 return {getExtendedVectorNumElements(), false};
288 }
289
290 /// Return the size of the specified value type in bits.
291 unsigned getSizeInBits() const {
292 if (isSimple())
293 return V.getSizeInBits();
294 return getExtendedSizeInBits();
295 }
296
297 unsigned getScalarSizeInBits() const {
298 return getScalarType().getSizeInBits();
299 }
300
301 /// Return the number of bytes overwritten by a store of the specified value
302 /// type.
303 unsigned getStoreSize() const {
304 return (getSizeInBits() + 7) / 8;
305 }
306
307 /// Return the number of bits overwritten by a store of the specified value
308 /// type.
309 unsigned getStoreSizeInBits() const {
310 return getStoreSize() * 8;
311 }
312
313 /// Rounds the bit-width of the given integer EVT up to the nearest power of
314 /// two (and at least to eight), and returns the integer EVT with that
315 /// number of bits.
316 EVT getRoundIntegerType(LLVMContext &Context) const {
317 assert(isInteger() && !isVector() && "Invalid integer type!");
318 unsigned BitWidth = getSizeInBits();
319 if (BitWidth <= 8)
320 return EVT(MVT::i8);
321 return getIntegerVT(Context, 1 << Log2_32_Ceil(BitWidth));
322 }
323
324 /// Finds the smallest simple value type that is greater than or equal to
325 /// half the width of this EVT. If no simple value type can be found, an
326 /// extended integer value type of half the size (rounded up) is returned.
327 EVT getHalfSizedIntegerVT(LLVMContext &Context) const {
328 assert(isInteger() && !isVector() && "Invalid integer type!");
329 unsigned EVTSize = getSizeInBits();
330 for (unsigned IntVT = MVT::FIRST_INTEGER_VALUETYPE;
331 IntVT <= MVT::LAST_INTEGER_VALUETYPE; ++IntVT) {
332 EVT HalfVT = EVT((MVT::SimpleValueType)IntVT);
333 if (HalfVT.getSizeInBits() * 2 >= EVTSize)
334 return HalfVT;
335 }
336 return getIntegerVT(Context, (EVTSize + 1) / 2);
337 }
338
339 /// Return a VT for an integer vector type with the size of the
340 /// elements doubled. The type returned may be an extended type.
341 EVT widenIntegerVectorElementType(LLVMContext &Context) const {
342 EVT EltVT = getVectorElementType();
343 EltVT = EVT::getIntegerVT(Context, 2 * EltVT.getSizeInBits());
344 return EVT::getVectorVT(Context, EltVT, getVectorElementCount());
345 }
346
347 // Return a VT for a vector type with the same element type but
348 // half the number of elements. The type returned may be an
349 // extended type.
350 EVT getHalfNumVectorElementsVT(LLVMContext &Context) const {
351 EVT EltVT = getVectorElementType();
352 auto EltCnt = getVectorElementCount();
353 assert(!(EltCnt.Min & 1) && "Splitting vector, but not in half!");
354 return EVT::getVectorVT(Context, EltVT, EltCnt / 2);
355 }
356
357 /// Returns true if the number of elements in the vector type is a power of 2.
358 bool isPow2VectorType() const {
359 unsigned NElts = getVectorNumElements();
360 return !(NElts & (NElts - 1));
361 }
362
363 /// Widens the length of the given vector EVT up to the nearest power of 2
364 /// and returns that type.
365 EVT getPow2VectorType(LLVMContext &Context) const {
366 if (!isPow2VectorType()) {
367 unsigned NElts = getVectorNumElements();
368 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
369 return EVT::getVectorVT(Context, getVectorElementType(), Pow2NElts,
370 isScalableVector());
371 }
372 else {
373 return *this;
374 }
375 }
376
377 /// This function returns value type as a string, e.g. "i32".
378 std::string getEVTString() const;
379
380 /// This method returns an LLVM type corresponding to the specified EVT.
381 /// For integer types, this returns an unsigned type. Note that this will
382 /// abort for types that cannot be represented.
383 Type *getTypeForEVT(LLVMContext &Context) const;
384
385 /// Return the value type corresponding to the specified type.
386 /// This returns all pointers as iPTR. If HandleUnknown is true, unknown
387 /// types are returned as Other, otherwise they are invalid.
388 static EVT getEVT(Type *Ty, bool HandleUnknown = false);
389
390 intptr_t getRawBits() const {
391 if (isSimple())
392 return V.SimpleTy;
393 else
394 return (intptr_t)(LLVMTy);
395 }
396
397 /// A meaningless but well-behaved order, useful for constructing
398 /// containers.
399 struct compareRawBits {
400 bool operator()(EVT L, EVT R) const {
401 if (L.V.SimpleTy == R.V.SimpleTy)
402 return L.LLVMTy < R.LLVMTy;
403 else
404 return L.V.SimpleTy < R.V.SimpleTy;
405 }
406 };
407
408 private:
409 // Methods for handling the Extended-type case in functions above.
410 // These are all out-of-line to prevent users of this header file
411 // from having a dependency on Type.h.
412 EVT changeExtendedTypeToInteger() const;
413 EVT changeExtendedVectorElementTypeToInteger() const;
414 static EVT getExtendedIntegerVT(LLVMContext &C, unsigned BitWidth);
415 static EVT getExtendedVectorVT(LLVMContext &C, EVT VT,
416 unsigned NumElements);
417 bool isExtendedFloatingPoint() const LLVM_READONLY;
418 bool isExtendedInteger() const LLVM_READONLY;
419 bool isExtendedScalarInteger() const LLVM_READONLY;
420 bool isExtendedVector() const LLVM_READONLY;
421 bool isExtended16BitVector() const LLVM_READONLY;
422 bool isExtended32BitVector() const LLVM_READONLY;
423 bool isExtended64BitVector() const LLVM_READONLY;
424 bool isExtended128BitVector() const LLVM_READONLY;
425 bool isExtended256BitVector() const LLVM_READONLY;
426 bool isExtended512BitVector() const LLVM_READONLY;
427 bool isExtended1024BitVector() const LLVM_READONLY;
428 bool isExtended2048BitVector() const LLVM_READONLY;
429 EVT getExtendedVectorElementType() const;
430 unsigned getExtendedVectorNumElements() const LLVM_READONLY;
431 unsigned getExtendedSizeInBits() const LLVM_READONLY;
432 };
433
434} // end namespace llvm
435
436#endif // LLVM_CODEGEN_VALUETYPES_H

/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include "llvm/Support/SwapByteOrder.h"
18#include <algorithm>
19#include <cassert>
20#include <climits>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather than including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
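Illustration (standalone sketch, not part of the analyzed headers; ctz_bisect is an illustrative name): the bisection used by the generic counter above, re-typed for uint32_t. It halves the search window each step, accumulating the shift whenever the low half of the remaining value is all zero.

  #include <cassert>
  #include <cstdint>
  #include <limits>

  // Count trailing zeros of a nonzero 32-bit value by bisection, mirroring the
  // generic TrailingZerosCounter::count above (zero is handled by the caller).
  unsigned ctz_bisect(std::uint32_t Val) {
    assert(Val != 0 && "zero input is handled separately in the real counter");
    unsigned ZeroBits = 0;
    std::uint32_t Shift = std::numeric_limits<std::uint32_t>::digits >> 1;   // 16
    std::uint32_t Mask = std::numeric_limits<std::uint32_t>::max() >> Shift; // low half
    while (Shift) {
      if ((Val & Mask) == 0) { // low half all zero: at least Shift trailing zeros
        Val >>= Shift;
        ZeroBits |= Shift;
      }
      Shift >>= 1;
      Mask >>= Shift;
    }
    return ZeroBits;
  }

  int main() {
    assert(ctz_bisect(0x8u) == 3);
    assert(ctz_bisect(0x80000000u) == 31);
    assert(ctz_bisect(1u) == 0);
  }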
112
113#if defined(__GNUC__) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
117 return 32;
118
119#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
43.1
'ZB' is not equal to ZB_Undefined
44
Assuming 'Val' is equal to 0
45
Taking true branch
133 return 64;
46
Returning the value 64
134
135#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count the number of 0's from the least significant bit to the most
149/// significant bit, stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
43
Calling 'TrailingZerosCounter::count'
47
Returning from 'TrailingZerosCounter::count'
48
Returning the value 64
161}
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count the number of 0's from the most significant bit to the least
218/// significant bit, stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT * sizeof(T);
251 assert(N <= Bits && "Invalid bit index");
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271}
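Illustration (standalone sketch, not from MathExtras.h; the two helpers are re-typed copies of the primitives above): what the four mask builders produce for uint8_t and N = 3.

  #include <cassert>
  #include <climits>
  #include <cstdint>

  // Re-typed copies of the two primitive helpers above, for illustration only.
  template <typename T> T maskTrailingOnes(unsigned N) {
    const unsigned Bits = CHAR_BIT * sizeof(T);
    return N == 0 ? 0 : (T(-1) >> (Bits - N));
  }
  template <typename T> T maskLeadingOnes(unsigned N) {
    return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
  }

  int main() {
    assert(maskTrailingOnes<std::uint8_t>(3) == 0x07); // 00000111
    assert(maskLeadingOnes<std::uint8_t>(3) == 0xE0);  // 11100000
    // maskTrailingZeros<T>(N) is maskLeadingOnes<T>(Bits - N):
    assert(maskLeadingOnes<std::uint8_t>(8 - 3) == 0xF8); // 11111000
    // maskLeadingZeros<T>(N) is maskTrailingOnes<T>(Bits - N):
    assert(maskTrailingOnes<std::uint8_t>(8 - 3) == 0x1F); // 00011111
  }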
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
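Illustration (standalone sketch, not from MathExtras.h; reverse_bits_naive is an illustrative name): the lookup table above just precomputes this naive per-bit reversal for each byte, while the loop in reverseBits reverses the byte order.

  #include <cassert>
  #include <cstdint>

  // Naive bit-by-bit reversal of a 16-bit value: bit i moves to bit 15 - i.
  std::uint16_t reverse_bits_naive(std::uint16_t Val) {
    std::uint16_t Out = 0;
    for (unsigned i = 0; i < 16; ++i)
      Out = static_cast<std::uint16_t>((Out << 1) | ((Val >> i) & 1u));
    return Out;
  }

  int main() {
    assert(reverse_bits_naive(0x0001) == 0x8000);
    assert(reverse_bits_naive(0x00F0) == 0x0F00);
    assert(reverse_bits_naive(0x8001) == 0x8001); // palindromic pattern
  }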
314
315// NOTE: The following support functions use the _32/_64 extensions instead of
316// type overloading so that signed and unsigned integers can be used without
317// ambiguity.
318
319/// Return the high 32 bits of a 64 bit value.
320constexpr inline uint32_t Hi_32(uint64_t Value) {
321 return static_cast<uint32_t>(Value >> 32);
322}
323
324/// Return the low 32 bits of a 64 bit value.
325constexpr inline uint32_t Lo_32(uint64_t Value) {
326 return static_cast<uint32_t>(Value);
327}
328
329/// Make a 64-bit integer from a high / low pair of 32-bit integers.
330constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
331 return ((uint64_t)High << 32) | (uint64_t)Low;
332}
333
334/// Checks if an integer fits into the given bit width.
335template <unsigned N> constexpr inline bool isInt(int64_t x) {
336 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
337}
338// Template specializations to get better code for common cases.
339template <> constexpr inline bool isInt<8>(int64_t x) {
340 return static_cast<int8_t>(x) == x;
341}
342template <> constexpr inline bool isInt<16>(int64_t x) {
343 return static_cast<int16_t>(x) == x;
344}
345template <> constexpr inline bool isInt<32>(int64_t x) {
346 return static_cast<int32_t>(x) == x;
347}
348
349/// Checks if a signed integer is an N bit number shifted left by S.
350template <unsigned N, unsigned S>
351constexpr inline bool isShiftedInt(int64_t x) {
352 static_assert(
353 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
354 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
355 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
356}
357
358/// Checks if an unsigned integer fits into the given bit width.
359///
360/// This is written as two functions rather than as simply
361///
362/// return N >= 64 || X < (UINT64_C(1) << N);
363///
364/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
365/// left too many places.
366template <unsigned N>
367constexpr inline typename std::enable_if<(N < 64), bool>::type
368isUInt(uint64_t X) {
369 static_assert(N > 0, "isUInt<0> doesn't make sense");
370 return X < (UINT64_C(1) << (N));
371}
372template <unsigned N>
373constexpr inline typename std::enable_if<N >= 64, bool>::type
374isUInt(uint64_t X) {
375 return true;
376}
377
378// Template specializations to get better code for common cases.
379template <> constexpr inline bool isUInt<8>(uint64_t x) {
380 return static_cast<uint8_t>(x) == x;
381}
382template <> constexpr inline bool isUInt<16>(uint64_t x) {
383 return static_cast<uint16_t>(x) == x;
384}
385template <> constexpr inline bool isUInt<32>(uint64_t x) {
386 return static_cast<uint32_t>(x) == x;
387}
388
389/// Checks if an unsigned integer is an N bit number shifted left by S.
390template <unsigned N, unsigned S>
391constexpr inline bool isShiftedUInt(uint64_t x) {
392 static_assert(
393 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
394 static_assert(N + S <= 64,
395 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
396 // Per the two static_asserts above, S must be strictly less than 64. So
397 // 1 << S is not undefined behavior.
398 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
399}
400
401/// Gets the maximum value for a N-bit unsigned integer.
402inline uint64_t maxUIntN(uint64_t N) {
403 assert(N > 0 && N <= 64 && "integer width out of range");
404
405 // uint64_t(1) << 64 is undefined behavior, so we can't do
406 // (uint64_t(1) << N) - 1
407 // without checking first that N != 64. But this works and doesn't have a
408 // branch.
409 return UINT64_MAX >> (64 - N);
410}
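Illustration (standalone sketch, not from MathExtras.h; maxUIntN_sketch is an illustrative name): the branch-free form above shifts UINT64_MAX right by 64 - N, which is only well defined while N >= 1; shifting a 64-bit value by 64 is exactly the class of undefined behavior this report flags.

  #include <cassert>
  #include <cstdint>

  std::uint64_t maxUIntN_sketch(unsigned N) {
    // N == 0 would shift by 64, which is undefined for a 64-bit operand.
    assert(N > 0 && N <= 64 && "integer width out of range");
    return UINT64_MAX >> (64 - N);
  }

  int main() {
    assert(maxUIntN_sketch(1) == 1u);          // shift by 63
    assert(maxUIntN_sketch(8) == 0xFFu);       // shift by 56
    assert(maxUIntN_sketch(64) == UINT64_MAX); // shift by 0
  }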
411
412/// Gets the minimum value for a N-bit signed integer.
413inline int64_t minIntN(int64_t N) {
414 assert(N > 0 && N <= 64 && "integer width out of range");
415
416 return -(UINT64_C(1)<<(N-1));
417}
418
419/// Gets the maximum value for a N-bit signed integer.
420inline int64_t maxIntN(int64_t N) {
421 assert(N > 0 && N <= 64 && "integer width out of range");
422
423 // This relies on two's complement wraparound when N == 64, so we convert to
424 // int64_t only at the very end to avoid UB.
425 return (UINT64_C(1) << (N - 1)) - 1;
426}
427
428/// Checks if an unsigned integer fits into the given (dynamic) bit width.
429inline bool isUIntN(unsigned N, uint64_t x) {
430 return N >= 64 || x <= maxUIntN(N);
431}
432
433/// Checks if a signed integer fits into the given (dynamic) bit width.
434inline bool isIntN(unsigned N, int64_t x) {
435 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
436}
437
438/// Return true if the argument is a non-empty sequence of ones starting at the
439/// least significant bit with the remainder zero (32 bit version).
440/// Ex. isMask_32(0x0000FFFFU) == true.
441constexpr inline bool isMask_32(uint32_t Value) {
442 return Value && ((Value + 1) & Value) == 0;
443}
444
445/// Return true if the argument is a non-empty sequence of ones starting at the
446/// least significant bit with the remainder zero (64 bit version).
447constexpr inline bool isMask_64(uint64_t Value) {
448 return Value && ((Value + 1) & Value) == 0;
449}
450
451/// Return true if the argument contains a non-empty sequence of ones with the
452/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
453constexpr inline bool isShiftedMask_32(uint32_t Value) {
454 return Value && isMask_32((Value - 1) | Value);
455}
456
457/// Return true if the argument contains a non-empty sequence of ones with the
458/// remainder zero (64 bit version.)
459constexpr inline bool isShiftedMask_64(uint64_t Value) {
460 return Value && isMask_64((Value - 1) | Value);
461}
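Illustration (standalone sketch, not from MathExtras.h; the predicates are re-typed copies): isMask_32 accepts only runs of ones anchored at bit 0, while isShiftedMask_32 first fills the zeros below the run with (Value - 1) | Value and then reuses the same test.

  #include <cstdint>

  constexpr bool isMask_32(std::uint32_t Value) {
    return Value && ((Value + 1) & Value) == 0;
  }
  constexpr bool isShiftedMask_32(std::uint32_t Value) {
    return Value && isMask_32((Value - 1) | Value);
  }

  static_assert(isMask_32(0x0000FFFFu), "run of ones starting at bit 0");
  static_assert(!isMask_32(0x0000FF00u), "run does not reach bit 0");
  static_assert(isShiftedMask_32(0x0000FF00u), "a shifted run is accepted");
  static_assert(!isShiftedMask_32(0x0000F0F0u), "two separate runs are not");

  int main() {}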
462
463/// Return true if the argument is a power of two > 0.
464/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
465constexpr inline bool isPowerOf2_32(uint32_t Value) {
466 return Value && !(Value & (Value - 1));
467}
468
469/// Return true if the argument is a power of two > 0 (64 bit edition.)
470constexpr inline bool isPowerOf2_64(uint64_t Value) {
471 return Value && !(Value & (Value - 1));
472}
473
474/// Return a byte-swapped representation of the 16-bit argument.
475inline uint16_t ByteSwap_16(uint16_t Value) {
476 return sys::SwapByteOrder_16(Value);
477}
478
479/// Return a byte-swapped representation of the 32-bit argument.
480inline uint32_t ByteSwap_32(uint32_t Value) {
481 return sys::SwapByteOrder_32(Value);
482}
483
484/// Return a byte-swapped representation of the 64-bit argument.
485inline uint64_t ByteSwap_64(uint64_t Value) {
486 return sys::SwapByteOrder_64(Value);
487}
488
489/// Count the number of ones from the most significant bit to the first
490/// zero bit.
491///
492/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
493/// Only unsigned integral types are allowed.
494///
495/// \param ZB the behavior on an input of all ones. Only ZB_Width and
496/// ZB_Undefined are valid arguments.
497template <typename T>
498unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
499 static_assert(std::numeric_limits<T>::is_integer &&
500 !std::numeric_limits<T>::is_signed,
501 "Only unsigned integral types are allowed.");
502 return countLeadingZeros<T>(~Value, ZB);
503}
504
505/// Count the number of ones from the least significant bit to the first
506/// zero bit.
507///
508/// Ex. countTrailingOnes(0x00FF00FF) == 8.
509/// Only unsigned integral types are allowed.
510///
511/// \param ZB the behavior on an input of all ones. Only ZB_Width and
512/// ZB_Undefined are valid arguments.
513template <typename T>
514unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
515 static_assert(std::numeric_limits<T>::is_integer &&
516 !std::numeric_limits<T>::is_signed,
517 "Only unsigned integral types are allowed.");
518 return countTrailingZeros<T>(~Value, ZB);
519}
520
521namespace detail {
522template <typename T, std::size_t SizeOfT> struct PopulationCounter {
523 static unsigned count(T Value) {
524 // Generic version, forward to 32 bits.
525 static_assert(SizeOfT <= 4, "Not implemented!");
526#if defined(__GNUC__)
527 return __builtin_popcount(Value);
528#else
529 uint32_t v = Value;
530 v = v - ((v >> 1) & 0x55555555);
531 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
532 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
533#endif
534 }
535};
536
537template <typename T> struct PopulationCounter<T, 8> {
538 static unsigned count(T Value) {
539#if defined(__GNUC__)
540 return __builtin_popcountll(Value);
541#else
542 uint64_t v = Value;
543 v = v - ((v >> 1) & 0x5555555555555555ULL);
544 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
545 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
546 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
547#endif
548 }
549};
550} // namespace detail
551
552/// Count the number of set bits in a value.
553/// Ex. countPopulation(0xF000F000) = 8
554/// Returns 0 if the word is zero.
555template <typename T>
556inline unsigned countPopulation(T Value) {
557 static_assert(std::numeric_limits<T>::is_integer &&
558 !std::numeric_limits<T>::is_signed,
559 "Only unsigned integral types are allowed.");
560 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
561}
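Illustration (standalone sketch, not from MathExtras.h; popcount64_swar is an illustrative name): the non-builtin fallback above is the classic SWAR reduction, summing bits in ever wider fields and finally adding the per-byte counts with one multiply.

  #include <cassert>
  #include <cstdint>

  unsigned popcount64_swar(std::uint64_t v) {
    v = v - ((v >> 1) & 0x5555555555555555ULL);            // 2-bit partial sums
    v = (v & 0x3333333333333333ULL) +
        ((v >> 2) & 0x3333333333333333ULL);                // 4-bit partial sums
    v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;            // per-byte sums
    return unsigned((v * 0x0101010101010101ULL) >> 56);    // add all the bytes
  }

  int main() {
    assert(popcount64_swar(0) == 0);
    assert(popcount64_swar(0xF000F000u) == 8); // the example from the comment
    assert(popcount64_swar(~0ULL) == 64);
  }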
562
563/// Compile time Log2.
564/// Valid only for positive powers of two.
565template <size_t kValue> constexpr inline size_t CTLog2() {
566 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
567 "Value is not a valid power of 2");
568 return 1 + CTLog2<kValue / 2>();
569}
570
571template <> constexpr inline size_t CTLog2<1>() { return 0; }
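Illustration (standalone sketch, not from MathExtras.h; the power-of-two test is inlined here instead of calling llvm::isPowerOf2_64): CTLog2 peels one bit per recursion step, so it can be used in constant expressions such as static_assert or array bounds.

  #include <cstddef>

  template <std::size_t kValue> constexpr std::size_t CTLog2() {
    static_assert(kValue > 0 && (kValue & (kValue - 1)) == 0,
                  "Value is not a valid power of 2");
    return 1 + CTLog2<kValue / 2>();
  }
  template <> constexpr std::size_t CTLog2<1>() { return 0; }

  static_assert(CTLog2<1>() == 0, "");
  static_assert(CTLog2<64>() == 6, "");
  static_assert(CTLog2<4096>() == 12, "");

  int main() {}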
572
573/// Return the log base 2 of the specified value.
574inline double Log2(double Value) {
575#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
576 return __builtin_log(Value) / __builtin_log(2.0);
577#else
578 return log2(Value);
579#endif
580}
581
582/// Return the floor log base 2 of the specified value, -1 if the value is zero.
583/// (32 bit edition.)
584/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
585inline unsigned Log2_32(uint32_t Value) {
586 return 31 - countLeadingZeros(Value);
587}
588
589/// Return the floor log base 2 of the specified value, -1 if the value is zero.
590/// (64 bit edition.)
591inline unsigned Log2_64(uint64_t Value) {
592 return 63 - countLeadingZeros(Value);
593}
594
595/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
596/// (32 bit edition).
597/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
598inline unsigned Log2_32_Ceil(uint32_t Value) {
599 return 32 - countLeadingZeros(Value - 1);
600}
601
602/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
603/// (64 bit edition.)
604inline unsigned Log2_64_Ceil(uint64_t Value) {
605 return 64 - countLeadingZeros(Value - 1);
606}
607
608/// Return the greatest common divisor of the values using Euclid's algorithm.
609template <typename T>
610inline T greatestCommonDivisor(T A, T B) {
611 while (B) {
612 T Tmp = B;
613 B = A % B;
614 A = Tmp;
615 }
616 return A;
617}
618
619inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
620 return greatestCommonDivisor<uint64_t>(A, B);
621}
622
623/// This function takes a 64-bit integer and returns the bit equivalent double.
624inline double BitsToDouble(uint64_t Bits) {
625 double D;
626 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
627 memcpy(&D, &Bits, sizeof(Bits));
628 return D;
629}
630
631/// This function takes a 32-bit integer and returns the bit equivalent float.
632inline float BitsToFloat(uint32_t Bits) {
633 float F;
634 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
635 memcpy(&F, &Bits, sizeof(Bits));
636 return F;
637}
638
639/// This function takes a double and returns the bit equivalent 64-bit integer.
640/// Note that copying doubles around changes the bits of NaNs on some hosts,
641/// notably x86, so this routine cannot be used if these bits are needed.
642inline uint64_t DoubleToBits(double Double) {
643 uint64_t Bits;
644 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
645 memcpy(&Bits, &Double, sizeof(Double));
646 return Bits;
647}
648
649/// This function takes a float and returns the bit equivalent 32-bit integer.
650/// Note that copying floats around changes the bits of NaNs on some hosts,
651/// notably x86, so this routine cannot be used if these bits are needed.
652inline uint32_t FloatToBits(float Float) {
653 uint32_t Bits;
654 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
655 memcpy(&Bits, &Float, sizeof(Float));
656 return Bits;
657}
658
659/// A and B are either alignments or offsets. Return the minimum alignment that
660/// may be assumed after adding the two together.
661constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
662 // The largest power of 2 that divides both A and B.
663 //
664 // Replace "-Value" by "1+~Value" in the following commented code to avoid
665 // MSVC warning C4146
666 // return (A | B) & -(A | B);
667 return (A | B) & (1 + ~(A | B));
668}
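Illustration (standalone sketch, not from MathExtras.h; MinAlign_sketch is an illustrative name): (A | B) & -(A | B) isolates the lowest set bit of A | B, which is the largest power of two dividing both values; 1 + ~x is just two's-complement negation spelled to avoid the MSVC warning mentioned above.

  #include <cstdint>

  constexpr std::uint64_t MinAlign_sketch(std::uint64_t A, std::uint64_t B) {
    return (A | B) & (1 + ~(A | B)); // lowest set bit of A | B
  }

  static_assert(MinAlign_sketch(16, 4) == 4, "4 divides both 16 and 4");
  static_assert(MinAlign_sketch(24, 16) == 8, "24 = 8*3, 16 = 8*2");
  static_assert(MinAlign_sketch(7, 16) == 1, "an odd offset forces alignment 1");

  int main() {}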
669
670/// Returns the next power of two (in 64-bits) that is strictly greater than A.
671/// Returns zero on overflow.
672inline uint64_t NextPowerOf2(uint64_t A) {
673 A |= (A >> 1);
674 A |= (A >> 2);
675 A |= (A >> 4);
676 A |= (A >> 8);
677 A |= (A >> 16);
678 A |= (A >> 32);
679 return A + 1;
680}
681
682/// Returns the power of two which is less than or equal to the given value.
683/// Essentially, it is a floor operation across the domain of powers of two.
684inline uint64_t PowerOf2Floor(uint64_t A) {
685 if (!A) return 0;
686 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
687}
688
689/// Returns the power of two which is greater than or equal to the given value.
690/// Essentially, it is a ceil operation across the domain of powers of two.
691inline uint64_t PowerOf2Ceil(uint64_t A) {
692 if (!A)
693 return 0;
694 return NextPowerOf2(A - 1);
695}
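Illustration (standalone sketch, not from MathExtras.h; NextPowerOf2_sketch is an illustrative name): the OR cascade above smears the highest set bit into every lower position, so adding 1 yields the next power of two strictly greater than the input, wrapping to 0 on overflow as documented.

  #include <cassert>
  #include <cstdint>

  std::uint64_t NextPowerOf2_sketch(std::uint64_t A) {
    A |= (A >> 1);  // now the MSB and the bit below it are set
    A |= (A >> 2);
    A |= (A >> 4);
    A |= (A >> 8);
    A |= (A >> 16);
    A |= (A >> 32); // now every bit at or below the original MSB is set
    return A + 1;
  }

  int main() {
    assert(NextPowerOf2_sketch(0) == 1);
    assert(NextPowerOf2_sketch(5) == 8);
    assert(NextPowerOf2_sketch(8) == 16);          // strictly greater
    assert(NextPowerOf2_sketch(UINT64_MAX) == 0);  // wraps to 0 on overflow
  }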
696
697/// Returns the next integer (mod 2**64) that is greater than or equal to
698/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
699///
700/// If non-zero \p Skew is specified, the return value will be a minimal
701/// integer that is greater than or equal to \p Value and equal to
702/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
703/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
704///
705/// Examples:
706/// \code
707/// alignTo(5, 8) = 8
708/// alignTo(17, 8) = 24
709/// alignTo(~0LL, 8) = 0
710/// alignTo(321, 255) = 510
711///
712/// alignTo(5, 8, 7) = 7
713/// alignTo(17, 8, 1) = 17
714/// alignTo(~0LL, 8, 3) = 3
715/// alignTo(321, 255, 42) = 552
716/// \endcode
717inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
718 assert(Align != 0u && "Align can't be 0.");
719 Skew %= Align;
720 return (Value + Align - 1 - Skew) / Align * Align + Skew;
721}
722
723/// Returns the next integer (mod 2**64) that is greater than or equal to
724/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
725template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
726 static_assert(Align != 0u, "Align must be non-zero");
727 return (Value + Align - 1) / Align * Align;
728}
729
730/// Returns the integer ceil(Numerator / Denominator).
731inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
732 return alignTo(Numerator, Denominator) / Denominator;
733}
734
735/// Returns the largest uint64_t less than or equal to \p Value and is
736/// \p Skew mod \p Align. \p Align must be non-zero
737inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
738 assert(Align != 0u && "Align can't be 0.");
739 Skew %= Align;
740 return (Value - Skew) / Align * Align + Skew;
741}
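Illustration (standalone sketch, not from MathExtras.h; alignDown_sketch is an illustrative name): alignDown rounds toward zero to the previous multiple of Align, offset by Skew, mirroring the alignTo examples above.

  #include <cassert>
  #include <cstdint>

  std::uint64_t alignDown_sketch(std::uint64_t Value, std::uint64_t Align,
                                 std::uint64_t Skew = 0) {
    assert(Align != 0u && "Align can't be 0.");
    Skew %= Align;
    return (Value - Skew) / Align * Align + Skew;
  }

  int main() {
    assert(alignDown_sketch(17, 8) == 16);
    assert(alignDown_sketch(16, 8) == 16);    // already aligned
    assert(alignDown_sketch(17, 8, 1) == 17); // 17 == 8*2 + 1
    assert(alignDown_sketch(18, 8, 1) == 17);
  }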
742
743/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
744/// Requires 0 < B <= 32.
745template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
746 static_assert(B > 0, "Bit width can't be 0.");
747 static_assert(B <= 32, "Bit width out of range.");
748 return int32_t(X << (32 - B)) >> (32 - B);
749}
750
751/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
752/// Requires 0 < B <= 32.
753inline int32_t SignExtend32(uint32_t X, unsigned B) {
754 assert(B > 0 && "Bit width can't be 0.");
755 assert(B <= 32 && "Bit width out of range.");
756 return int32_t(X << (32 - B)) >> (32 - B);
757}
758
759/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
760/// Requires 0 < B <= 64.
761template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
762 static_assert(B > 0, "Bit width can't be 0.");
763 static_assert(B <= 64, "Bit width out of range.");
764 return int64_t(x << (64 - B)) >> (64 - B);
765}
766
767/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
768/// Requires 0 < B <= 64.
769inline int64_t SignExtend64(uint64_t X, unsigned B) {
770 assert(B > 0 && "Bit width can't be 0.");
771 assert(B <= 64 && "Bit width out of range.");
772 return int64_t(X << (64 - B)) >> (64 - B);
773}
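Illustration (standalone sketch, not from MathExtras.h; SignExtend32_sketch is an illustrative name): the shift-up/arithmetic-shift-down pattern above copies bit B-1 into all higher bits; like the original, it relies on the implementation using an arithmetic right shift for negative values.

  #include <cassert>
  #include <cstdint>

  std::int32_t SignExtend32_sketch(std::uint32_t X, unsigned B) {
    assert(B > 0 && B <= 32 && "Bit width out of range.");
    // Relies on arithmetic right shift of a negative value, as the original does.
    return std::int32_t(X << (32 - B)) >> (32 - B);
  }

  int main() {
    assert(SignExtend32_sketch(0xFFF, 12) == -1);    // 12-bit all-ones
    assert(SignExtend32_sketch(0x800, 12) == -2048); // 12-bit minimum
    assert(SignExtend32_sketch(0x7FF, 12) == 2047);  // positive values unchanged
  }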
774
775/// Subtract two unsigned integers, X and Y, of type T and return the absolute
776/// value of the result.
777template <typename T>
778typename std::enable_if<std::is_unsigned<T>::value, T>::type
779AbsoluteDifference(T X, T Y) {
780 return std::max(X, Y) - std::min(X, Y);
781}
782
783/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
784/// maximum representable value of T on overflow. ResultOverflowed indicates if
785/// the result is larger than the maximum representable value of type T.
786template <typename T>
787typename std::enable_if<std::is_unsigned<T>::value, T>::type
788SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
789 bool Dummy;
790 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
791 // Hacker's Delight, p. 29
792 T Z = X + Y;
793 Overflowed = (Z < X || Z < Y);
794 if (Overflowed)
795 return std::numeric_limits<T>::max();
796 else
797 return Z;
798}
799
800/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
801/// maximum representable value of T on overflow. ResultOverflowed indicates if
802/// the result is larger than the maximum representable value of type T.
803template <typename T>
804typename std::enable_if<std::is_unsigned<T>::value, T>::type
805SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
806 bool Dummy;
807 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
808
809 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
810 // because it fails for uint16_t (where multiplication can have undefined
811 // behavior due to promotion to int), and requires a division in addition
812 // to the multiplication.
813
814 Overflowed = false;
815
816 // Log2(Z) would be either Log2Z or Log2Z + 1.
817 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
818 // will necessarily be less than Log2Max as desired.
819 int Log2Z = Log2_64(X) + Log2_64(Y);
820 const T Max = std::numeric_limits<T>::max();
821 int Log2Max = Log2_64(Max);
822 if (Log2Z < Log2Max) {
823 return X * Y;
824 }
825 if (Log2Z > Log2Max) {
826 Overflowed = true;
827 return Max;
828 }
829
830 // We're going to use the top bit, and maybe overflow one
831 // bit past it. Multiply all but the bottom bit then add
832 // that on at the end.
833 T Z = (X >> 1) * Y;
834 if (Z & ~(Max >> 1)) {
835 Overflowed = true;
836 return Max;
837 }
838 Z <<= 1;
839 if (X & 1)
840 return SaturatingAdd(Z, Y, ResultOverflowed);
841
842 return Z;
843}
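Usage illustration (standalone sketch, not from MathExtras.h; satMul16 is an illustrative name): what callers of a saturating multiply can expect, shown for uint16_t so overflow is easy to trigger. This sketch simply widens, multiplies, and clamps, which gives the same results as the branch-minimizing Log2-based algorithm above but is not that algorithm.

  #include <cassert>
  #include <cstdint>
  #include <limits>

  std::uint16_t satMul16(std::uint16_t X, std::uint16_t Y, bool *Overflowed) {
    // Widen to 32 bits, multiply, then clamp to the 16-bit maximum.
    std::uint32_t P = std::uint32_t(X) * std::uint32_t(Y);
    const std::uint16_t Max = std::numeric_limits<std::uint16_t>::max();
    *Overflowed = P > Max;
    return *Overflowed ? Max : static_cast<std::uint16_t>(P);
  }

  int main() {
    bool Ov = false;
    assert(satMul16(300, 200, &Ov) == 60000 && !Ov);
    assert(satMul16(300, 300, &Ov) == 65535 && Ov); // 90000 clamps to UINT16_MAX
  }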
844
845/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
846/// the product. Clamp the result to the maximum representable value of T on
847/// overflow. ResultOverflowed indicates if the result is larger than the
848/// maximum representable value of type T.
849template <typename T>
850typename std::enable_if<std::is_unsigned<T>::value, T>::type
851SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
852 bool Dummy;
853 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
854
855 T Product = SaturatingMultiply(X, Y, &Overflowed);
856 if (Overflowed)
857 return Product;
858
859 return SaturatingAdd(A, Product, &Overflowed);
860}
861
862/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
863extern const float huge_valf;
864
865
866/// Add two signed integers, computing the two's complement truncated result,
867/// returning true if overflow occurred.
868template <typename T>
869typename std::enable_if<std::is_signed<T>::value, T>::type
870AddOverflow(T X, T Y, T &Result) {
871#if __has_builtin(__builtin_add_overflow)
872 return __builtin_add_overflow(X, Y, &Result);
873#else
874 // Perform the unsigned addition.
875 using U = typename std::make_unsigned<T>::type;
876 const U UX = static_cast<U>(X);
877 const U UY = static_cast<U>(Y);
878 const U UResult = UX + UY;
879
880 // Convert to signed.
881 Result = static_cast<T>(UResult);
882
883 // Adding two positive numbers should result in a positive number.
884 if (X > 0 && Y > 0)
885 return Result <= 0;
886 // Adding two negatives should result in a negative number.
887 if (X < 0 && Y < 0)
888 return Result >= 0;
889 return false;
890#endif
891}
892
893/// Subtract two signed integers, computing the two's complement truncated
894/// result, returning true if an overflow occurred.
895template <typename T>
896typename std::enable_if<std::is_signed<T>::value, T>::type
897SubOverflow(T X, T Y, T &Result) {
898#if __has_builtin(__builtin_sub_overflow)
899 return __builtin_sub_overflow(X, Y, &Result);
900#else
901 // Perform the unsigned addition.
902 using U = typename std::make_unsigned<T>::type;
903 const U UX = static_cast<U>(X);
904 const U UY = static_cast<U>(Y);
905 const U UResult = UX - UY;
906
907 // Convert to signed.
908 Result = static_cast<T>(UResult);
909
910 // Subtracting a positive number from a negative results in a negative number.
911 if (X <= 0 && Y > 0)
912 return Result >= 0;
913 // Subtracting a negative number from a positive results in a positive number.
914 if (X >= 0 && Y < 0)
915 return Result <= 0;
916 return false;
917#endif
918}
919
920
921/// Multiply two signed integers, computing the two's complement truncated
922/// result, returning true if an overflow occurred.
923template <typename T>
924typename std::enable_if<std::is_signed<T>::value, T>::type
925MulOverflow(T X, T Y, T &Result) {
926 // Perform the unsigned multiplication on absolute values.
927 using U = typename std::make_unsigned<T>::type;
928 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
929 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
930 const U UResult = UX * UY;
931
932 // Convert to signed.
933 const bool IsNegative = (X < 0) ^ (Y < 0);
934 Result = IsNegative ? (0 - UResult) : UResult;
935
936 // If any of the args was 0, result is 0 and no overflow occurs.
937 if (UX == 0 || UY == 0)
938 return false;
939
940 // UX and UY are in [1, 2^n], where n is the number of digits.
941 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
942 // positive) divided by an argument compares to the other.
943 if (IsNegative)
944 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
945 else
946 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
947}
948
949} // End llvm namespace
950
951#endif