Bug Summary

File: llvm/lib/Target/X86/X86ISelLowering.cpp
Warning: line 34587, column 5
Division by zero
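
For context, the core division-by-zero check fires when the analyzer can constrain a divisor to zero along some feasible path through the code. A minimal, hypothetical sketch of that pattern follows; the function and variable names are invented for illustration, and this is not the code flagged at line 34587, which appears later in the annotated source.

// Hypothetical illustration of the pattern the division-by-zero checker reports.
unsigned scaleMaskElts(unsigned NumElts, unsigned NumMaskElts) {
  unsigned Scale = 0;
  if (NumElts > NumMaskElts)
    Scale = NumElts / NumMaskElts;
  // On the path where NumElts <= NumMaskElts, Scale is still 0 here, so the
  // analyzer reports "Division by zero" on the return statement below.
  return NumElts / Scale;
}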

Annotated Source Code


clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -fdenormal-fp-math=ieee,ieee -fdenormal-fp-math-f32=ieee,ieee -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-11/lib/clang/11.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/build-llvm/lib/Target/X86 -I /build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86 -I /build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/build-llvm/include -I /build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-11/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/build-llvm/lib/Target/X86 -fdebug-prefix-map=/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e=. -ferror-limit 19 -fmessage-length 0 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-02-25-045343-43954-1 -x c++ /build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp

/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp

1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "Utils/X86ShuffleDecode.h"
16#include "X86CallingConv.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
20#include "X86MachineFunctionInfo.h"
21#include "X86TargetMachine.h"
22#include "X86TargetObjectFile.h"
23#include "llvm/ADT/SmallBitVector.h"
24#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
26#include "llvm/ADT/StringExtras.h"
27#include "llvm/ADT/StringSwitch.h"
28#include "llvm/Analysis/BlockFrequencyInfo.h"
29#include "llvm/Analysis/EHPersonalities.h"
30#include "llvm/Analysis/ProfileSummaryInfo.h"
31#include "llvm/Analysis/VectorUtils.h"
32#include "llvm/CodeGen/IntrinsicLowering.h"
33#include "llvm/CodeGen/MachineFrameInfo.h"
34#include "llvm/CodeGen/MachineFunction.h"
35#include "llvm/CodeGen/MachineInstrBuilder.h"
36#include "llvm/CodeGen/MachineJumpTableInfo.h"
37#include "llvm/CodeGen/MachineModuleInfo.h"
38#include "llvm/CodeGen/MachineRegisterInfo.h"
39#include "llvm/CodeGen/TargetLowering.h"
40#include "llvm/CodeGen/WinEHFuncInfo.h"
41#include "llvm/IR/CallSite.h"
42#include "llvm/IR/CallingConv.h"
43#include "llvm/IR/Constants.h"
44#include "llvm/IR/DerivedTypes.h"
45#include "llvm/IR/DiagnosticInfo.h"
46#include "llvm/IR/Function.h"
47#include "llvm/IR/GlobalAlias.h"
48#include "llvm/IR/GlobalVariable.h"
49#include "llvm/IR/Instructions.h"
50#include "llvm/IR/Intrinsics.h"
51#include "llvm/MC/MCAsmInfo.h"
52#include "llvm/MC/MCContext.h"
53#include "llvm/MC/MCExpr.h"
54#include "llvm/MC/MCSymbol.h"
55#include "llvm/Support/CommandLine.h"
56#include "llvm/Support/Debug.h"
57#include "llvm/Support/ErrorHandling.h"
58#include "llvm/Support/KnownBits.h"
59#include "llvm/Support/MathExtras.h"
60#include "llvm/Target/TargetOptions.h"
61#include <algorithm>
62#include <bitset>
63#include <cctype>
64#include <numeric>
65using namespace llvm;
66
67#define DEBUG_TYPE "x86-isel"
68
69STATISTIC(NumTailCalls, "Number of tail calls");
70
71static cl::opt<int> ExperimentalPrefLoopAlignment(
72 "x86-experimental-pref-loop-alignment", cl::init(4),
73 cl::desc(
74 "Sets the preferable loop alignment for experiments (as log2 bytes)"
75 "(the last x86-experimental-pref-loop-alignment bits"
76 " of the loop header PC will be 0)."),
77 cl::Hidden);
78
79// Added in 10.0.
80static cl::opt<bool> EnableOldKNLABI(
81 "x86-enable-old-knl-abi", cl::init(false),
82 cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of "
83 "one ZMM register on AVX512F, but not AVX512BW targets."),
84 cl::Hidden);
85
86static cl::opt<bool> MulConstantOptimization(
87 "mul-constant-optimization", cl::init(true),
88 cl::desc("Replace 'mul x, Const' with more effective instructions like "
89 "SHIFT, LEA, etc."),
90 cl::Hidden);
91
92static cl::opt<bool> ExperimentalUnorderedISEL(
93 "x86-experimental-unordered-atomic-isel", cl::init(false),
94 cl::desc("Use LoadSDNode and StoreSDNode instead of "
95 "AtomicSDNode for unordered atomic loads and "
96 "stores respectively."),
97 cl::Hidden);
98
99/// Call this when the user attempts to do something unsupported, like
100/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
101/// report_fatal_error, so calling code should attempt to recover without
102/// crashing.
103static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
104 const char *Msg) {
105 MachineFunction &MF = DAG.getMachineFunction();
106 DAG.getContext()->diagnose(
107 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
108}
109
110X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
111 const X86Subtarget &STI)
112 : TargetLowering(TM), Subtarget(STI) {
113 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
114 X86ScalarSSEf64 = Subtarget.hasSSE2();
115 X86ScalarSSEf32 = Subtarget.hasSSE1();
116 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
117
118 // Set up the TargetLowering object.
119
120 // X86 is weird. It always uses i8 for shift amounts and setcc results.
121 setBooleanContents(ZeroOrOneBooleanContent);
122 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
123 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
124
125 // For 64-bit, since we have so many registers, use the ILP scheduler.
126 // For 32-bit, use the register pressure specific scheduling.
127 // For Atom, always use ILP scheduling.
128 if (Subtarget.isAtom())
129 setSchedulingPreference(Sched::ILP);
130 else if (Subtarget.is64Bit())
131 setSchedulingPreference(Sched::ILP);
132 else
133 setSchedulingPreference(Sched::RegPressure);
134 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
135 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
136
137 // Bypass expensive divides and use cheaper ones.
138 if (TM.getOptLevel() >= CodeGenOpt::Default) {
139 if (Subtarget.hasSlowDivide32())
140 addBypassSlowDiv(32, 8);
141 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
142 addBypassSlowDiv(64, 32);
143 }
144
145 if (Subtarget.isTargetWindowsMSVC() ||
146 Subtarget.isTargetWindowsItanium()) {
147 // Setup Windows compiler runtime calls.
148 setLibcallName(RTLIB::SDIV_I64, "_alldiv");
149 setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
150 setLibcallName(RTLIB::SREM_I64, "_allrem");
151 setLibcallName(RTLIB::UREM_I64, "_aullrem");
152 setLibcallName(RTLIB::MUL_I64, "_allmul");
153 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
154 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
155 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
156 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
157 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
158 }
159
160 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
161 // MSVCRT doesn't have powi; fall back to pow
162 setLibcallName(RTLIB::POWI_F32, nullptr);
163 setLibcallName(RTLIB::POWI_F64, nullptr);
164 }
165
166 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
167 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
168 // FIXME: Should we be limiting the atomic size on other configs? Default is
169 // 1024.
170 if (!Subtarget.hasCmpxchg8b())
171 setMaxAtomicSizeInBitsSupported(32);
172
173 // Set up the register classes.
174 addRegisterClass(MVT::i8, &X86::GR8RegClass);
175 addRegisterClass(MVT::i16, &X86::GR16RegClass);
176 addRegisterClass(MVT::i32, &X86::GR32RegClass);
177 if (Subtarget.is64Bit())
178 addRegisterClass(MVT::i64, &X86::GR64RegClass);
179
180 for (MVT VT : MVT::integer_valuetypes())
181 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
182
183 // We don't accept any truncstore of integer registers.
184 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
185 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
186 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
187 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
188 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
189 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
190
191 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
192
193 // SETOEQ and SETUNE require checking two conditions.
194 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
195 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
196 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
197 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
198 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
199 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
200
201 // Integer absolute.
202 if (Subtarget.hasCMov()) {
203 setOperationAction(ISD::ABS , MVT::i16 , Custom);
204 setOperationAction(ISD::ABS , MVT::i32 , Custom);
205 }
206 setOperationAction(ISD::ABS , MVT::i64 , Custom);
207
208 // Funnel shifts.
209 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
210 setOperationAction(ShiftOp , MVT::i16 , Custom);
211 setOperationAction(ShiftOp , MVT::i32 , Custom);
212 if (Subtarget.is64Bit())
213 setOperationAction(ShiftOp , MVT::i64 , Custom);
214 }
215
216 if (!Subtarget.useSoftFloat()) {
217 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
218 // operation.
219 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
220 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
221 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
222 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
223 // We have an algorithm for SSE2, and we turn this into a 64-bit
224 // FILD or VCVTUSI2SS/SD for other targets.
225 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
226 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
227 // We have an algorithm for SSE2->double, and we turn this into a
228 // 64-bit FILD followed by conditional FADD for other targets.
229 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
230 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
231
232 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
233 // this operation.
234 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
235 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
236 // SSE has no i16 to fp conversion, only i32. We promote in the handler
237 // to allow f80 to use i16 and f64 to use i16 with sse1 only
238 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
239 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
240 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
241 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
242 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
243 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
244 // are Legal, f80 is custom lowered.
245 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
246 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
247
248 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
249 // this operation.
250 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
251 // FIXME: This doesn't generate invalid exception when it should. PR44019.
252 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
253 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
254 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
255 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
256 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
257 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
258 // are Legal, f80 is custom lowered.
259 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
260 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
261
262 // Handle FP_TO_UINT by promoting the destination to a larger signed
263 // conversion.
264 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
265 // FIXME: This doesn't generate invalid exception when it should. PR44019.
266 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
267 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
268 // FIXME: This doesn't generate invalid exception when it should. PR44019.
269 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
270 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
271 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
272 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
273 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
274
275 setOperationAction(ISD::LRINT, MVT::f32, Custom);
276 setOperationAction(ISD::LRINT, MVT::f64, Custom);
277 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
278 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
279
280 if (!Subtarget.is64Bit()) {
281 setOperationAction(ISD::LRINT, MVT::i64, Custom);
282 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
283 }
284 }
285
286 // Handle address space casts between mixed sized pointers.
287 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
288 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
289
290 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
291 if (!X86ScalarSSEf64) {
292 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
293 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
294 if (Subtarget.is64Bit()) {
295 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
296 // Without SSE, i64->f64 goes through memory.
297 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
298 }
299 } else if (!Subtarget.is64Bit())
300 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
301
302 // Scalar integer divide and remainder are lowered to use operations that
303 // produce two results, to match the available instructions. This exposes
304 // the two-result form to trivial CSE, which is able to combine x/y and x%y
305 // into a single instruction.
306 //
307 // Scalar integer multiply-high is also lowered to use two-result
308 // operations, to match the available instructions. However, plain multiply
309 // (low) operations are left as Legal, as there are single-result
310 // instructions for this in x86. Using the two-result multiply instructions
311 // when both high and low results are needed must be arranged by dagcombine.
312 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
313 setOperationAction(ISD::MULHS, VT, Expand);
314 setOperationAction(ISD::MULHU, VT, Expand);
315 setOperationAction(ISD::SDIV, VT, Expand);
316 setOperationAction(ISD::UDIV, VT, Expand);
317 setOperationAction(ISD::SREM, VT, Expand);
318 setOperationAction(ISD::UREM, VT, Expand);
319 }
320
321 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
322 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
323 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
324 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
325 setOperationAction(ISD::BR_CC, VT, Expand);
326 setOperationAction(ISD::SELECT_CC, VT, Expand);
327 }
328 if (Subtarget.is64Bit())
329 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
330 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
331 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
332 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
333
334 setOperationAction(ISD::FREM , MVT::f32 , Expand);
335 setOperationAction(ISD::FREM , MVT::f64 , Expand);
336 setOperationAction(ISD::FREM , MVT::f80 , Expand);
337 setOperationAction(ISD::FREM , MVT::f128 , Expand);
338 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
339
340 // Promote the i8 variants and force them on up to i32 which has a shorter
341 // encoding.
342 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
343 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
344 if (!Subtarget.hasBMI()) {
345 setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
346 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
347 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
348 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
349 if (Subtarget.is64Bit()) {
350 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
351 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
352 }
353 }
354
355 if (Subtarget.hasLZCNT()) {
356 // When promoting the i8 variants, force them to i32 for a shorter
357 // encoding.
358 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
359 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
360 } else {
361 setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
362 setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
363 setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
364 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
365 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
366 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
367 if (Subtarget.is64Bit()) {
368 setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
369 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
370 }
371 }
372
373 // Special handling for half-precision floating point conversions.
374 // If we don't have F16C support, then lower half float conversions
375 // into library calls.
376 if (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) {
377 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Custom);
378 setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, Custom);
379 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
380 setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, Custom);
381 } else {
382 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
383 setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, Expand);
384 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
385 setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, Expand);
386 }
387
388 // There's never any support for operations beyond MVT::f32.
389 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
390 setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
391 setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand);
392 setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f64, Expand);
393 setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f80, Expand);
394 setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f128, Expand);
395 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
396 setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
397 setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand);
398 setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f64, Expand);
399 setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f80, Expand);
400 setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f128, Expand);
401
402 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
403 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
404 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
405 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
406 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
407 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
408 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
409 setTruncStoreAction(MVT::f128, MVT::f16, Expand);
410
411 if (Subtarget.hasPOPCNT()) {
412 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
413 } else {
414 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
415 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
416 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
417 if (Subtarget.is64Bit())
418 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
419 else
420 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
421 }
422
423 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
424
425 if (!Subtarget.hasMOVBE())
426 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
427
428 // X86 wants to expand cmov itself.
429 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
430 setOperationAction(ISD::SELECT, VT, Custom);
431 setOperationAction(ISD::SETCC, VT, Custom);
432 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
433 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
434 }
435 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
436 if (VT == MVT::i64 && !Subtarget.is64Bit())
437 continue;
438 setOperationAction(ISD::SELECT, VT, Custom);
439 setOperationAction(ISD::SETCC, VT, Custom);
440 }
441
442 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
443 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
444 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
445
446 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
447 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
448 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
449 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
450 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
451 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
452 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
453 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
454
455 // Darwin ABI issue.
456 for (auto VT : { MVT::i32, MVT::i64 }) {
457 if (VT == MVT::i64 && !Subtarget.is64Bit())
458 continue;
459 setOperationAction(ISD::ConstantPool , VT, Custom);
460 setOperationAction(ISD::JumpTable , VT, Custom);
461 setOperationAction(ISD::GlobalAddress , VT, Custom);
462 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
463 setOperationAction(ISD::ExternalSymbol , VT, Custom);
464 setOperationAction(ISD::BlockAddress , VT, Custom);
465 }
466
467 // 64-bit shl, sra, srl (iff 32-bit x86)
468 for (auto VT : { MVT::i32, MVT::i64 }) {
469 if (VT == MVT::i64 && !Subtarget.is64Bit())
470 continue;
471 setOperationAction(ISD::SHL_PARTS, VT, Custom);
472 setOperationAction(ISD::SRA_PARTS, VT, Custom);
473 setOperationAction(ISD::SRL_PARTS, VT, Custom);
474 }
475
476 if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
477 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
478
479 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
480
481 // Expand certain atomics
482 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
483 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
484 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
485 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
486 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
487 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
488 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
489 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
490 }
491
492 if (!Subtarget.is64Bit())
493 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
494
495 if (Subtarget.hasCmpxchg16b()) {
496 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
497 }
498
499 // FIXME - use subtarget debug flags
500 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
501 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
502 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
503 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
504 }
505
506 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
507 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
508
509 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
510 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
511
512 setOperationAction(ISD::TRAP, MVT::Other, Legal);
513 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
514
515 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
516 setOperationAction(ISD::VASTART , MVT::Other, Custom);
517 setOperationAction(ISD::VAEND , MVT::Other, Expand);
518 bool Is64Bit = Subtarget.is64Bit();
519 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
520 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
521
522 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
523 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
524
525 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
526
527 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
528 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
529 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
530
531 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
532 // f32 and f64 use SSE.
533 // Set up the FP register classes.
534 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
535 : &X86::FR32RegClass);
536 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
537 : &X86::FR64RegClass);
538
539 // Disable f32->f64 extload as we can only generate this in one instruction
540 // under optsize. So it's easier to pattern match (fpext (load)) for that
541 // case instead of needing to emit 2 instructions for extload in the
542 // non-optsize case.
543 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
544
545 for (auto VT : { MVT::f32, MVT::f64 }) {
546 // Use ANDPD to simulate FABS.
547 setOperationAction(ISD::FABS, VT, Custom);
548
549 // Use XORP to simulate FNEG.
550 setOperationAction(ISD::FNEG, VT, Custom);
551
552 // Use ANDPD and ORPD to simulate FCOPYSIGN.
553 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
554
555 // These might be better off as horizontal vector ops.
556 setOperationAction(ISD::FADD, VT, Custom);
557 setOperationAction(ISD::FSUB, VT, Custom);
558
559 // We don't support sin/cos/fmod
560 setOperationAction(ISD::FSIN , VT, Expand);
561 setOperationAction(ISD::FCOS , VT, Expand);
562 setOperationAction(ISD::FSINCOS, VT, Expand);
563 }
564
565 // Lower this to MOVMSK plus an AND.
566 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
567 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
568
569 } else if (!useSoftFloat() && X86ScalarSSEf32 && (UseX87 || Is64Bit)) {
570 // Use SSE for f32, x87 for f64.
571 // Set up the FP register classes.
572 addRegisterClass(MVT::f32, &X86::FR32RegClass);
573 if (UseX87)
574 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
575
576 // Use ANDPS to simulate FABS.
577 setOperationAction(ISD::FABS , MVT::f32, Custom);
578
579 // Use XORP to simulate FNEG.
580 setOperationAction(ISD::FNEG , MVT::f32, Custom);
581
582 if (UseX87)
583 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
584
585 // Use ANDPS and ORPS to simulate FCOPYSIGN.
586 if (UseX87)
587 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
588 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
589
590 // We don't support sin/cos/fmod
591 setOperationAction(ISD::FSIN , MVT::f32, Expand);
592 setOperationAction(ISD::FCOS , MVT::f32, Expand);
593 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
594
595 if (UseX87) {
596 // Always expand sin/cos functions even though x87 has an instruction.
597 setOperationAction(ISD::FSIN, MVT::f64, Expand);
598 setOperationAction(ISD::FCOS, MVT::f64, Expand);
599 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
600 }
601 } else if (UseX87) {
602 // f32 and f64 in x87.
603 // Set up the FP register classes.
604 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
605 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
606
607 for (auto VT : { MVT::f32, MVT::f64 }) {
608 setOperationAction(ISD::UNDEF, VT, Expand);
609 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
610
611 // Always expand sin/cos functions even though x87 has an instruction.
612 setOperationAction(ISD::FSIN , VT, Expand);
613 setOperationAction(ISD::FCOS , VT, Expand);
614 setOperationAction(ISD::FSINCOS, VT, Expand);
615 }
616 }
617
618 // Expand FP32 immediates into loads from the stack, save special cases.
619 if (isTypeLegal(MVT::f32)) {
620 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
621 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
622 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
623 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
624 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
625 } else // SSE immediates.
626 addLegalFPImmediate(APFloat(+0.0f)); // xorps
627 }
628 // Expand FP64 immediates into loads from the stack, save special cases.
629 if (isTypeLegal(MVT::f64)) {
630 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
631 addLegalFPImmediate(APFloat(+0.0)); // FLD0
632 addLegalFPImmediate(APFloat(+1.0)); // FLD1
633 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
634 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
635 } else // SSE immediates.
636 addLegalFPImmediate(APFloat(+0.0)); // xorpd
637 }
638 // Handle constrained floating-point operations of scalar.
639 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
640 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
641 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
642 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
643 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
644 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
645 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
646 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
647 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
648 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
649 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
650 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
651 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
652
653 // We don't support FMA.
654 setOperationAction(ISD::FMA, MVT::f64, Expand);
655 setOperationAction(ISD::FMA, MVT::f32, Expand);
656
657 // f80 always uses X87.
658 if (UseX87) {
659 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
660 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
661 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
662 {
663 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
664 addLegalFPImmediate(TmpFlt); // FLD0
665 TmpFlt.changeSign();
666 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
667
668 bool ignored;
669 APFloat TmpFlt2(+1.0);
670 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
671 &ignored);
672 addLegalFPImmediate(TmpFlt2); // FLD1
673 TmpFlt2.changeSign();
674 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
675 }
676
677 // Always expand sin/cos functions even though x87 has an instruction.
678 setOperationAction(ISD::FSIN , MVT::f80, Expand);
679 setOperationAction(ISD::FCOS , MVT::f80, Expand);
680 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
681
682 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
683 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
684 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
685 setOperationAction(ISD::FRINT, MVT::f80, Expand);
686 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
687 setOperationAction(ISD::FMA, MVT::f80, Expand);
688 setOperationAction(ISD::LROUND, MVT::f80, Expand);
689 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
690 setOperationAction(ISD::LRINT, MVT::f80, Custom);
691 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
692
693 // Handle constrained floating-point operations of scalar.
694 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
695 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
696 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
697 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
698 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
699 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
700 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
701 // as Custom.
702 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
703 }
704
705 // f128 uses xmm registers, but most operations require libcalls.
706 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
707 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
708 : &X86::VR128RegClass);
709
710 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
711
712 setOperationAction(ISD::FADD, MVT::f128, LibCall);
713 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
714 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
715 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
716 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
717 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
718 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
719 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
720 setOperationAction(ISD::FMA, MVT::f128, LibCall);
721 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
722
723 setOperationAction(ISD::FABS, MVT::f128, Custom);
724 setOperationAction(ISD::FNEG, MVT::f128, Custom);
725 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
726
727 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
728 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
729 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
730 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
731 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
732 // No STRICT_FSINCOS
733 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
734 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
735
736 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
737 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
738 // We need to custom handle any FP_ROUND with an f128 input, but
739 // LegalizeDAG uses the result type to know when to run a custom handler.
740 // So we have to list all legal floating point result types here.
741 if (isTypeLegal(MVT::f32)) {
742 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
743 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
744 }
745 if (isTypeLegal(MVT::f64)) {
746 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
747 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
748 }
749 if (isTypeLegal(MVT::f80)) {
750 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
751 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
752 }
753
754 setOperationAction(ISD::SETCC, MVT::f128, Custom);
755
756 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
757 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
758 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
759 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
760 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
761 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
762 }
763
764 // Always use a library call for pow.
765 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
766 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
767 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
768 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
769
770 setOperationAction(ISD::FLOG, MVT::f80, Expand);
771 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
772 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
773 setOperationAction(ISD::FEXP, MVT::f80, Expand);
774 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
775 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
776 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
777
778 // Some FP actions are always expanded for vector types.
779 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
780 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
781 setOperationAction(ISD::FSIN, VT, Expand);
782 setOperationAction(ISD::FSINCOS, VT, Expand);
783 setOperationAction(ISD::FCOS, VT, Expand);
784 setOperationAction(ISD::FREM, VT, Expand);
785 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
786 setOperationAction(ISD::FPOW, VT, Expand);
787 setOperationAction(ISD::FLOG, VT, Expand);
788 setOperationAction(ISD::FLOG2, VT, Expand);
789 setOperationAction(ISD::FLOG10, VT, Expand);
790 setOperationAction(ISD::FEXP, VT, Expand);
791 setOperationAction(ISD::FEXP2, VT, Expand);
792 }
793
794 // First set operation action for all vector types to either promote
795 // (for widening) or expand (for scalarization). Then we will selectively
796 // turn on ones that can be effectively codegen'd.
797 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
798 setOperationAction(ISD::SDIV, VT, Expand);
799 setOperationAction(ISD::UDIV, VT, Expand);
800 setOperationAction(ISD::SREM, VT, Expand);
801 setOperationAction(ISD::UREM, VT, Expand);
802 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
803 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
804 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
805 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
806 setOperationAction(ISD::FMA, VT, Expand);
807 setOperationAction(ISD::FFLOOR, VT, Expand);
808 setOperationAction(ISD::FCEIL, VT, Expand);
809 setOperationAction(ISD::FTRUNC, VT, Expand);
810 setOperationAction(ISD::FRINT, VT, Expand);
811 setOperationAction(ISD::FNEARBYINT, VT, Expand);
812 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
813 setOperationAction(ISD::MULHS, VT, Expand);
814 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
815 setOperationAction(ISD::MULHU, VT, Expand);
816 setOperationAction(ISD::SDIVREM, VT, Expand);
817 setOperationAction(ISD::UDIVREM, VT, Expand);
818 setOperationAction(ISD::CTPOP, VT, Expand);
819 setOperationAction(ISD::CTTZ, VT, Expand);
820 setOperationAction(ISD::CTLZ, VT, Expand);
821 setOperationAction(ISD::ROTL, VT, Expand);
822 setOperationAction(ISD::ROTR, VT, Expand);
823 setOperationAction(ISD::BSWAP, VT, Expand);
824 setOperationAction(ISD::SETCC, VT, Expand);
825 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
826 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
827 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
828 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
829 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
830 setOperationAction(ISD::TRUNCATE, VT, Expand);
831 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
832 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
833 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
834 setOperationAction(ISD::SELECT_CC, VT, Expand);
835 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
836 setTruncStoreAction(InnerVT, VT, Expand);
837
838 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
839 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
840
841 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
842 // types, we have to deal with them whether we ask for Expansion or not.
843 // Setting Expand causes its own optimisation problems though, so leave
844 // them legal.
845 if (VT.getVectorElementType() == MVT::i1)
846 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
847
848 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
849 // split/scalarized right now.
850 if (VT.getVectorElementType() == MVT::f16)
851 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
852 }
853 }
854
855 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
856 // with -msoft-float, disable use of MMX as well.
857 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
858 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
859 // No operations on x86mmx supported, everything uses intrinsics.
860 }
861
862 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
863 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
864 : &X86::VR128RegClass);
865
866 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
867 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
868 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
869 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
870 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
871 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
872 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
873 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
874
875 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
876 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
877
878 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
879 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
880 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
881 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
882 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
883 }
884
885 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
886 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
887 : &X86::VR128RegClass);
888
889 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
890 // registers cannot be used even for integer operations.
891 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
892 : &X86::VR128RegClass);
893 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
894 : &X86::VR128RegClass);
895 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
896 : &X86::VR128RegClass);
897 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
898 : &X86::VR128RegClass);
899
900 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
901 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
902 setOperationAction(ISD::SDIV, VT, Custom);
903 setOperationAction(ISD::SREM, VT, Custom);
904 setOperationAction(ISD::UDIV, VT, Custom);
905 setOperationAction(ISD::UREM, VT, Custom);
906 }
907
908 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
909 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
910 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
911
912 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
913 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
914 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
915 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
916 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
917 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
918 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
919 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
920 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
921 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
922 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
923 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
924 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
925
926 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
927 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
928 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
929 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
930 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
931 }
932
933 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
934 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
935 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
936 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
937 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
938 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
939 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
940 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
941 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
942 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
943 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
944 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
945
946 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
947 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
948 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
949
950 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
951 setOperationAction(ISD::SETCC, VT, Custom);
952 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
953 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
954 setOperationAction(ISD::CTPOP, VT, Custom);
955 setOperationAction(ISD::ABS, VT, Custom);
956
957 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
958 // setcc all the way to isel and prefer SETGT in some isel patterns.
959 setCondCodeAction(ISD::SETLT, VT, Custom);
960 setCondCodeAction(ISD::SETLE, VT, Custom);
961 }
962
963 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
964 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
965 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
966 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
967 setOperationAction(ISD::VSELECT, VT, Custom);
968 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
969 }
970
971 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
972 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
973 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
974 setOperationAction(ISD::VSELECT, VT, Custom);
975
976 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
977 continue;
978
979 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
980 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
981 }
982
983 // Custom lower v2i64 and v2f64 selects.
984 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
985 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
986 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
987 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
988 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
989
990 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
991 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
992 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
993 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
994
995 // Custom legalize these to avoid over promotion or custom promotion.
996 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
997 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
998 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
999 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1000 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1001 }
1002
1003 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1004 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
1005 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1006 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1007
1008 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1009 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1010
1011 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1012 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1013
1014 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1015 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1016 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1017 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1018 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1019
1020 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1021 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1022 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1023 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1024
1025 // We want to legalize this to an f64 load rather than an i64 load on
1026 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1027 // store.
1028 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1029 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1030 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1031 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1032 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1033 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1034
1035 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1036 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1037 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1038 if (!Subtarget.hasAVX512())
1039 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1040
1041 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1042 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1043 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1044
1045 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1046
1047 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1048 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1049 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1050 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1051 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1052 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1053
1054 // In the customized shift lowering, the legal v4i32/v2i64 cases
1055 // in AVX2 will be recognized.
1056 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1057 setOperationAction(ISD::SRL, VT, Custom);
1058 setOperationAction(ISD::SHL, VT, Custom);
1059 setOperationAction(ISD::SRA, VT, Custom);
1060 }
1061
1062 setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
1063 setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
1064
1065 // With AVX512, expanding (and promoting the shifts) is better.
1066 if (!Subtarget.hasAVX512())
1067 setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
1068
1069 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1070 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1071 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1072 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1073 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1074 }
1075
1076 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1077 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1078 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1079 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1080 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1081 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1082 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1083 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1084 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1085
1086 // These might be better off as horizontal vector ops.
1087 setOperationAction(ISD::ADD, MVT::i16, Custom);
1088 setOperationAction(ISD::ADD, MVT::i32, Custom);
1089 setOperationAction(ISD::SUB, MVT::i16, Custom);
1090 setOperationAction(ISD::SUB, MVT::i32, Custom);
1091 }
1092
1093 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1094 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1095 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1096 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1097 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1098 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1099 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1100 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1101 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1102 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1103 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1104 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1105
1106 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1107 }
1108
1109 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1110 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1111 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1112 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1113 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1114 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1115 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1116 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1117
1118 // FIXME: Do we need to handle scalar-to-vector here?
1119 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1120
1121 // We directly match byte blends in the backend as they match the VSELECT
1122 // condition form.
1123 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1124
1125 // SSE41 brings specific instructions for doing vector sign extend even in
1126 // cases where we don't have SRA.
1127 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1128 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1129 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1130 }
1131
1132 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1133 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1134 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1135 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1136 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1137 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1138 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1139 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1140 }
1141
1142 // i8 vectors are custom because the source register and source
1143 // memory operand types are not the same width.
1144 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1145
1146 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1147 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1148 // do the pre and post work in the vector domain.
1149 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1150 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1151 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1152 // so that DAG combine doesn't try to turn it into uint_to_fp.
1153 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1154 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1155 }
1156 }
1157
1158 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1159 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1160 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1161 setOperationAction(ISD::ROTL, VT, Custom);
1162
1163 // XOP can efficiently perform BITREVERSE with VPPERM.
1164 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1165 setOperationAction(ISD::BITREVERSE, VT, Custom);
1166
1167 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1168 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1169 setOperationAction(ISD::BITREVERSE, VT, Custom);
1170 }
1171
1172 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1173 bool HasInt256 = Subtarget.hasInt256();
1174
1175 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1176 : &X86::VR256RegClass);
1177 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1178 : &X86::VR256RegClass);
1179 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1180 : &X86::VR256RegClass);
1181 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1182 : &X86::VR256RegClass);
1183 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1184 : &X86::VR256RegClass);
1185 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1186 : &X86::VR256RegClass);
1187
1188 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1189 setOperationAction(ISD::FFLOOR, VT, Legal);
1190 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1191 setOperationAction(ISD::FCEIL, VT, Legal);
1192 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1193 setOperationAction(ISD::FTRUNC, VT, Legal);
1194 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1195 setOperationAction(ISD::FRINT, VT, Legal);
1196 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1197 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1198 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1199
1200 setOperationAction(ISD::FROUND, VT, Custom);
1201
1202 setOperationAction(ISD::FNEG, VT, Custom);
1203 setOperationAction(ISD::FABS, VT, Custom);
1204 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1205 }
1206
1207 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1208 // even though v8i16 is a legal type.
1209 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1210 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1211 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1212 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1213 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1214 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
1215
1216 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1217 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
1218
1219 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1220 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1221 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1222 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1223 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1224 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1225 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1226 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1227 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1228 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
1229 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1230 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1231
1232 if (!Subtarget.hasAVX512())
1233 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1234
1235 // In the customized shift lowering, the legal v8i32/v4i64 cases
1236 // in AVX2 will be recognized.
1237 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1238 setOperationAction(ISD::SRL, VT, Custom);
1239 setOperationAction(ISD::SHL, VT, Custom);
1240 setOperationAction(ISD::SRA, VT, Custom);
1241 }
1242
1243 // These types need custom splitting if their input is a 128-bit vector.
1244 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1245 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1246 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1247 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1248
1249 setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
1250 setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
1251
1252    // With BWI, expanding (and promoting the shifts) is the better option.
1253 if (!Subtarget.hasBWI())
1254 setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
1255
1256 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1257 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1258 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1259 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1260 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1261 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1262
1263 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1264 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1265 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1266 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1267 }
1268
1269 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1270 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1271 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1272 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1273
1274 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1275 setOperationAction(ISD::SETCC, VT, Custom);
1276 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1277 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1278 setOperationAction(ISD::CTPOP, VT, Custom);
1279 setOperationAction(ISD::CTLZ, VT, Custom);
1280
1281 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1282 // setcc all the way to isel and prefer SETGT in some isel patterns.
1283 setCondCodeAction(ISD::SETLT, VT, Custom);
1284 setCondCodeAction(ISD::SETLE, VT, Custom);
1285 }
1286
1287 if (Subtarget.hasAnyFMA()) {
1288 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1289 MVT::v2f64, MVT::v4f64 }) {
1290 setOperationAction(ISD::FMA, VT, Legal);
1291 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1292 }
1293 }
1294
1295 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1296 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1297 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1298 }
1299
1300 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1301 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1302 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1303 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1304
1305 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1306 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1307 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1308 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1309 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1310 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1311
1312 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1313 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1314 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1315 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1316 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1317
1318 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1319 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1320 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1321 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1322 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1323 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1324 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1325 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1326
1327 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1328 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1329 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1330 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1331 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1332 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1333 }
1334
1335 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1336 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1337 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1338 }
1339
1340 if (HasInt256) {
1341      // The custom lowering of UINT_TO_FP for v8i32 becomes interesting
1342      // when we have a 256-bit-wide blend with immediate.
1343 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1344 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1345
1346 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1347 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1348 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1349 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1350 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1351 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1352 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1353 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1354 }
1355 }
1356
1357 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1358 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1359 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1360 setOperationAction(ISD::MSTORE, VT, Legal);
1361 }
1362
1363 // Extract subvector is special because the value type
1364 // (result) is 128-bit but the source is 256-bit wide.
1365 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1366 MVT::v4f32, MVT::v2f64 }) {
1367 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1368 }
1369
1370 // Custom lower several nodes for 256-bit types.
1371 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1372 MVT::v8f32, MVT::v4f64 }) {
1373 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1374 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1375 setOperationAction(ISD::VSELECT, VT, Custom);
1376 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1377 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1378 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1379 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1380 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1381 setOperationAction(ISD::STORE, VT, Custom);
1382 }
1383
1384 if (HasInt256) {
1385 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1386
1387 // Custom legalize 2x32 to get a little better code.
1388 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1389 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1390
1391 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1392 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1393 setOperationAction(ISD::MGATHER, VT, Custom);
1394 }
1395 }
1396
1397 // This block controls legalization of the mask vector sizes that are
1398 // available with AVX512. 512-bit vectors are in a separate block controlled
1399 // by useAVX512Regs.
1400 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1401 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1402 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1403 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1404 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1405 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1406
1407 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1408 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1409 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1410
1411 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1412 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1413 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1414 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1415 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1416 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1417 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1418 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1419 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1420 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1421 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1422 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1423
1424 // There is no byte sized k-register load or store without AVX512DQ.
1425 if (!Subtarget.hasDQI()) {
1426 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1427 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1428 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1429 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1430
1431 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1432 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1433 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1434 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1435 }
1436
1437 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1438 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1439 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1440 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1441 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1442 }
1443
1444 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1445 setOperationAction(ISD::ADD, VT, Custom);
1446 setOperationAction(ISD::SUB, VT, Custom);
1447 setOperationAction(ISD::MUL, VT, Custom);
1448 setOperationAction(ISD::UADDSAT, VT, Custom);
1449 setOperationAction(ISD::SADDSAT, VT, Custom);
1450 setOperationAction(ISD::USUBSAT, VT, Custom);
1451 setOperationAction(ISD::SSUBSAT, VT, Custom);
1452 setOperationAction(ISD::VSELECT, VT, Expand);
1453 }
1454
1455 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1456 setOperationAction(ISD::SETCC, VT, Custom);
1457 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1458 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1459 setOperationAction(ISD::SELECT, VT, Custom);
1460 setOperationAction(ISD::TRUNCATE, VT, Custom);
1461
1462 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1463 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1464 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1465 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1466 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1467 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1468 }
1469
1470 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1471 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1472 }
1473
1474 // This block controls legalization for 512-bit operations with 32/64 bit
1475 // elements. 512-bits can be disabled based on prefer-vector-width and
1476 // required-vector-width function attributes.
1477 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1478 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1479 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1480 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1481 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1482
1483 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1484 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1485 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1486 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1487 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1488 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1489 }
1490
1491 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1492 setOperationAction(ISD::FNEG, VT, Custom);
1493 setOperationAction(ISD::FABS, VT, Custom);
1494 setOperationAction(ISD::FMA, VT, Legal);
1495 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1496 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1497 }
1498
1499 for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
1500 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1501 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1502 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1503 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1504 }
1505 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1506 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1507 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
1508 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
1509 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1510 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1511 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
1512 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
1513
1514 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1515 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1516 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1517 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1518 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1519 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1520 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1521 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1522 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1523 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1524 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
1525 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1526
1527 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1528 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1529 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1530 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1531 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1532
1533 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1534 // to 512-bit rather than use the AVX2 instructions so that we can use
1535 // k-masks.
1536 if (!Subtarget.hasVLX()) {
1537 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1538 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1539 setOperationAction(ISD::MLOAD, VT, Custom);
1540 setOperationAction(ISD::MSTORE, VT, Custom);
1541 }
1542 }
1543
1544 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1545 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1546 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1547 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1548 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1549 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1550 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1551 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1552
1553 // Need to custom widen this if we don't have AVX512BW.
1554 setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
1555 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
1556 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
1557
1558 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1559 setOperationAction(ISD::FFLOOR, VT, Legal);
1560 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1561 setOperationAction(ISD::FCEIL, VT, Legal);
1562 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1563 setOperationAction(ISD::FTRUNC, VT, Legal);
1564 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1565 setOperationAction(ISD::FRINT, VT, Legal);
1566 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1567 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1568 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1569
1570 setOperationAction(ISD::FROUND, VT, Custom);
1571
1572 setOperationAction(ISD::SELECT, VT, Custom);
1573 }
1574
1575 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1576 for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) {
1577 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1578 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1579 }
1580
1581 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1582 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1583 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1584 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1585
1586 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1587 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1588
1589 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1590 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1591
1592 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1593 setOperationAction(ISD::SMAX, VT, Legal);
1594 setOperationAction(ISD::UMAX, VT, Legal);
1595 setOperationAction(ISD::SMIN, VT, Legal);
1596 setOperationAction(ISD::UMIN, VT, Legal);
1597 setOperationAction(ISD::ABS, VT, Legal);
1598 setOperationAction(ISD::SRL, VT, Custom);
1599 setOperationAction(ISD::SHL, VT, Custom);
1600 setOperationAction(ISD::SRA, VT, Custom);
1601 setOperationAction(ISD::CTPOP, VT, Custom);
1602 setOperationAction(ISD::ROTL, VT, Custom);
1603 setOperationAction(ISD::ROTR, VT, Custom);
1604 setOperationAction(ISD::SETCC, VT, Custom);
1605 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1606 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1607 setOperationAction(ISD::SELECT, VT, Custom);
1608
1609 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1610 // setcc all the way to isel and prefer SETGT in some isel patterns.
1611 setCondCodeAction(ISD::SETLT, VT, Custom);
1612 setCondCodeAction(ISD::SETLE, VT, Custom);
1613 }
1614
1615 if (Subtarget.hasDQI()) {
1616 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1617 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1618 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
1619 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
1620 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1621 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1622 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
1623 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
1624
1625 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1626 }
1627
1628 if (Subtarget.hasCDI()) {
1629      // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
1630 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1631 setOperationAction(ISD::CTLZ, VT, Legal);
1632 }
1633 } // Subtarget.hasCDI()
1634
1635 if (Subtarget.hasVPOPCNTDQ()) {
1636 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1637 setOperationAction(ISD::CTPOP, VT, Legal);
1638 }
1639
1640 // Extract subvector is special because the value type
1641 // (result) is 256-bit but the source is 512-bit wide.
1642 // 128-bit was made Legal under AVX1.
1643 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1644 MVT::v8f32, MVT::v4f64 })
1645 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1646
1647 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1648 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1649 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1650 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1651 setOperationAction(ISD::VSELECT, VT, Custom);
1652 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1653 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1654 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1655 setOperationAction(ISD::MLOAD, VT, Legal);
1656 setOperationAction(ISD::MSTORE, VT, Legal);
1657 setOperationAction(ISD::MGATHER, VT, Custom);
1658 setOperationAction(ISD::MSCATTER, VT, Custom);
1659 }
1660 if (!Subtarget.hasBWI()) {
1661 // Need to custom split v32i16/v64i8 bitcasts.
1662 setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
1663 setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
1664
1665 // Better to split these into two 256-bit ops.
1666 setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom);
1667 setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom);
1668 }
1669
1670 if (Subtarget.hasVBMI2()) {
1671 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1672 setOperationAction(ISD::FSHL, VT, Custom);
1673 setOperationAction(ISD::FSHR, VT, Custom);
1674 }
1675 }
1676 }// has AVX-512
1677
1678 // This block controls legalization for operations that don't have
1679 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1680 // narrower widths.
1681 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1682 // These operations are handled on non-VLX by artificially widening in
1683 // isel patterns.
1684
1685 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
1686 Subtarget.hasVLX() ? Legal : Custom);
1687 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
1688 Subtarget.hasVLX() ? Legal : Custom);
1689 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1690 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
1691 Subtarget.hasVLX() ? Legal : Custom);
1692 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
1693 Subtarget.hasVLX() ? Legal : Custom);
1694 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1695 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
1696 Subtarget.hasVLX() ? Legal : Custom);
1697 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
1698 Subtarget.hasVLX() ? Legal : Custom);
1699 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
1700 Subtarget.hasVLX() ? Legal : Custom);
1701 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
1702 Subtarget.hasVLX() ? Legal : Custom);
1703
1704 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1705 setOperationAction(ISD::SMAX, VT, Legal);
1706 setOperationAction(ISD::UMAX, VT, Legal);
1707 setOperationAction(ISD::SMIN, VT, Legal);
1708 setOperationAction(ISD::UMIN, VT, Legal);
1709 setOperationAction(ISD::ABS, VT, Legal);
1710 }
1711
1712 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1713 setOperationAction(ISD::ROTL, VT, Custom);
1714 setOperationAction(ISD::ROTR, VT, Custom);
1715 }
1716
1717 // Custom legalize 2x32 to get a little better code.
1718 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1719 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1720
1721 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1722 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1723 setOperationAction(ISD::MSCATTER, VT, Custom);
1724
1725 if (Subtarget.hasDQI()) {
1726 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1727 setOperationAction(ISD::SINT_TO_FP, VT,
1728 Subtarget.hasVLX() ? Legal : Custom);
1729 setOperationAction(ISD::UINT_TO_FP, VT,
1730 Subtarget.hasVLX() ? Legal : Custom);
1731 setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
1732 Subtarget.hasVLX() ? Legal : Custom);
1733 setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
1734 Subtarget.hasVLX() ? Legal : Custom);
1735 setOperationAction(ISD::FP_TO_SINT, VT,
1736 Subtarget.hasVLX() ? Legal : Custom);
1737 setOperationAction(ISD::FP_TO_UINT, VT,
1738 Subtarget.hasVLX() ? Legal : Custom);
1739 setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
1740 Subtarget.hasVLX() ? Legal : Custom);
1741 setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
1742 Subtarget.hasVLX() ? Legal : Custom);
1743 setOperationAction(ISD::MUL, VT, Legal);
1744 }
1745 }
1746
1747 if (Subtarget.hasCDI()) {
1748 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1749 setOperationAction(ISD::CTLZ, VT, Legal);
1750 }
1751 } // Subtarget.hasCDI()
1752
1753 if (Subtarget.hasVPOPCNTDQ()) {
1754 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1755 setOperationAction(ISD::CTPOP, VT, Legal);
1756 }
1757 }
1758
1759  // This block controls legalization of v32i1/v64i1, which are available with
1760 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1761 // useBWIRegs.
1762 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1763 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1764 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1765
1766 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1767 setOperationAction(ISD::ADD, VT, Custom);
1768 setOperationAction(ISD::SUB, VT, Custom);
1769 setOperationAction(ISD::MUL, VT, Custom);
1770 setOperationAction(ISD::VSELECT, VT, Expand);
1771 setOperationAction(ISD::UADDSAT, VT, Custom);
1772 setOperationAction(ISD::SADDSAT, VT, Custom);
1773 setOperationAction(ISD::USUBSAT, VT, Custom);
1774 setOperationAction(ISD::SSUBSAT, VT, Custom);
1775
1776 setOperationAction(ISD::TRUNCATE, VT, Custom);
1777 setOperationAction(ISD::SETCC, VT, Custom);
1778 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1779 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1780 setOperationAction(ISD::SELECT, VT, Custom);
1781 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1782 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1783 }
1784
1785 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1786 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1787 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1788 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1789 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1790 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1791
1792 // Extends from v32i1 masks to 256-bit vectors.
1793 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1794 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1795 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1796 }
1797
1798 // This block controls legalization for v32i16 and v64i8. 512-bits can be
1799 // disabled based on prefer-vector-width and required-vector-width function
1800 // attributes.
1801 if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
1802 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1803 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1804
1805 // Extends from v64i1 masks to 512-bit vectors.
1806 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1807 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1808 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1809
1810 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1811 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1812 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1813 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1814 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1815 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1816 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1817 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1818 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1819 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1820 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1821 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1822 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1823 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1824 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1825 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1826 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1827 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1828 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1829 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1830 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1831 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1832 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1833
1834 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1835 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1836
1837 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1838
1839 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1840 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1841 setOperationAction(ISD::VSELECT, VT, Custom);
1842 setOperationAction(ISD::ABS, VT, Legal);
1843 setOperationAction(ISD::SRL, VT, Custom);
1844 setOperationAction(ISD::SHL, VT, Custom);
1845 setOperationAction(ISD::SRA, VT, Custom);
1846 setOperationAction(ISD::MLOAD, VT, Legal);
1847 setOperationAction(ISD::MSTORE, VT, Legal);
1848 setOperationAction(ISD::CTPOP, VT, Custom);
1849 setOperationAction(ISD::CTLZ, VT, Custom);
1850 setOperationAction(ISD::SMAX, VT, Legal);
1851 setOperationAction(ISD::UMAX, VT, Legal);
1852 setOperationAction(ISD::SMIN, VT, Legal);
1853 setOperationAction(ISD::UMIN, VT, Legal);
1854 setOperationAction(ISD::SETCC, VT, Custom);
1855 setOperationAction(ISD::UADDSAT, VT, Legal);
1856 setOperationAction(ISD::SADDSAT, VT, Legal);
1857 setOperationAction(ISD::USUBSAT, VT, Legal);
1858 setOperationAction(ISD::SSUBSAT, VT, Legal);
1859 setOperationAction(ISD::SELECT, VT, Custom);
1860
1861 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1862 // setcc all the way to isel and prefer SETGT in some isel patterns.
1863 setCondCodeAction(ISD::SETLT, VT, Custom);
1864 setCondCodeAction(ISD::SETLE, VT, Custom);
1865 }
1866
1867 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1868 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1869 }
1870
1871 if (Subtarget.hasBITALG()) {
1872 for (auto VT : { MVT::v64i8, MVT::v32i16 })
1873 setOperationAction(ISD::CTPOP, VT, Legal);
1874 }
1875
1876 if (Subtarget.hasVBMI2()) {
1877 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1878 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1879 }
1880 }
1881
1882 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1883 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1884 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1885 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1886 }
1887
1888 // These operations are handled on non-VLX by artificially widening in
1889 // isel patterns.
1890 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1891
1892 if (Subtarget.hasBITALG()) {
1893 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1894 setOperationAction(ISD::CTPOP, VT, Legal);
1895 }
1896 }
1897
1898 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1899 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1900 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1901 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1902 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1903 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1904
1905 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1906 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1907 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1908 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1909 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1910
1911 if (Subtarget.hasDQI()) {
1912 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1913 // v2f32 UINT_TO_FP is already custom under SSE2.
1914      assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1915             isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1916             "Unexpected operation action!");
1917 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1918 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1919 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1920 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1921 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1922 }
1923
1924 if (Subtarget.hasBWI()) {
1925 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1926 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1927 }
1928
1929 if (Subtarget.hasVBMI2()) {
1930 // TODO: Make these legal even without VLX?
1931 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1932 MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1933 setOperationAction(ISD::FSHL, VT, Custom);
1934 setOperationAction(ISD::FSHR, VT, Custom);
1935 }
1936 }
1937
1938 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
1939 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
1940 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1941 }
1942
1943 // We want to custom lower some of our intrinsics.
1944 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1945 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1946 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1947 if (!Subtarget.is64Bit()) {
1948 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1949 }
1950
1951 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1952 // handle type legalization for these operations here.
1953 //
1954 // FIXME: We really should do custom legalization for addition and
1955 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1956 // than generic legalization for 64-bit multiplication-with-overflow, though.
1957 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1958 if (VT == MVT::i64 && !Subtarget.is64Bit())
1959 continue;
1960 // Add/Sub/Mul with overflow operations are custom lowered.
1961 setOperationAction(ISD::SADDO, VT, Custom);
1962 setOperationAction(ISD::UADDO, VT, Custom);
1963 setOperationAction(ISD::SSUBO, VT, Custom);
1964 setOperationAction(ISD::USUBO, VT, Custom);
1965 setOperationAction(ISD::SMULO, VT, Custom);
1966 setOperationAction(ISD::UMULO, VT, Custom);
1967
1968 // Support carry in as value rather than glue.
1969 setOperationAction(ISD::ADDCARRY, VT, Custom);
1970 setOperationAction(ISD::SUBCARRY, VT, Custom);
1971 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1972 }
1973
1974 if (!Subtarget.is64Bit()) {
1975 // These libcalls are not available in 32-bit.
1976 setLibcallName(RTLIB::SHL_I128, nullptr);
1977 setLibcallName(RTLIB::SRL_I128, nullptr);
1978 setLibcallName(RTLIB::SRA_I128, nullptr);
1979 setLibcallName(RTLIB::MUL_I128, nullptr);
1980 }
1981
1982 // Combine sin / cos into _sincos_stret if it is available.
1983 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1984 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1985 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1986 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1987 }
1988
1989 if (Subtarget.isTargetWin64()) {
1990 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1991 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1992 setOperationAction(ISD::SREM, MVT::i128, Custom);
1993 setOperationAction(ISD::UREM, MVT::i128, Custom);
1994 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1995 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1996 }
1997
1998  // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1999 // is. We should promote the value to 64-bits to solve this.
2000 // This is what the CRT headers do - `fmodf` is an inline header
2001 // function casting to f64 and calling `fmod`.
2002 if (Subtarget.is32Bit() &&
2003 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2004 for (ISD::NodeType Op :
2005 {ISD::FCEIL, ISD::STRICT_FCEIL,
2006 ISD::FCOS, ISD::STRICT_FCOS,
2007 ISD::FEXP, ISD::STRICT_FEXP,
2008 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2009 ISD::FREM, ISD::STRICT_FREM,
2010 ISD::FLOG, ISD::STRICT_FLOG,
2011 ISD::FLOG10, ISD::STRICT_FLOG10,
2012 ISD::FPOW, ISD::STRICT_FPOW,
2013 ISD::FSIN, ISD::STRICT_FSIN})
2014 if (isOperationExpand(Op, MVT::f32))
2015 setOperationAction(Op, MVT::f32, Promote);
2016
2017 // We have target-specific dag combine patterns for the following nodes:
2018 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
2019 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
2020 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
2021 setTargetDAGCombine(ISD::CONCAT_VECTORS);
2022 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
2023 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
2024 setTargetDAGCombine(ISD::BITCAST);
2025 setTargetDAGCombine(ISD::VSELECT);
2026 setTargetDAGCombine(ISD::SELECT);
2027 setTargetDAGCombine(ISD::SHL);
2028 setTargetDAGCombine(ISD::SRA);
2029 setTargetDAGCombine(ISD::SRL);
2030 setTargetDAGCombine(ISD::OR);
2031 setTargetDAGCombine(ISD::AND);
2032 setTargetDAGCombine(ISD::ADD);
2033 setTargetDAGCombine(ISD::FADD);
2034 setTargetDAGCombine(ISD::FSUB);
2035 setTargetDAGCombine(ISD::FNEG);
2036 setTargetDAGCombine(ISD::FMA);
2037 setTargetDAGCombine(ISD::STRICT_FMA);
2038 setTargetDAGCombine(ISD::FMINNUM);
2039 setTargetDAGCombine(ISD::FMAXNUM);
2040 setTargetDAGCombine(ISD::SUB);
2041 setTargetDAGCombine(ISD::LOAD);
2042 setTargetDAGCombine(ISD::MLOAD);
2043 setTargetDAGCombine(ISD::STORE);
2044 setTargetDAGCombine(ISD::MSTORE);
2045 setTargetDAGCombine(ISD::TRUNCATE);
2046 setTargetDAGCombine(ISD::ZERO_EXTEND);
2047 setTargetDAGCombine(ISD::ANY_EXTEND);
2048 setTargetDAGCombine(ISD::SIGN_EXTEND);
2049 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
2050 setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
2051 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
2052 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
2053 setTargetDAGCombine(ISD::SINT_TO_FP);
2054 setTargetDAGCombine(ISD::UINT_TO_FP);
2055 setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
2056 setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
2057 setTargetDAGCombine(ISD::SETCC);
2058 setTargetDAGCombine(ISD::MUL);
2059 setTargetDAGCombine(ISD::XOR);
2060 setTargetDAGCombine(ISD::MSCATTER);
2061 setTargetDAGCombine(ISD::MGATHER);
2062 setTargetDAGCombine(ISD::FP16_TO_FP);
2063 setTargetDAGCombine(ISD::FP_EXTEND);
2064 setTargetDAGCombine(ISD::FP_ROUND);
2065
2066 computeRegisterProperties(Subtarget.getRegisterInfo());
2067
2068 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2069 MaxStoresPerMemsetOptSize = 8;
2070 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2071 MaxStoresPerMemcpyOptSize = 4;
2072 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2073 MaxStoresPerMemmoveOptSize = 4;
2074
2075 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2076  // that needs to be benchmarked and balanced with the potential use of vector
2077 // load/store types (PR33329, PR33914).
2078 MaxLoadsPerMemcmp = 2;
2079 MaxLoadsPerMemcmpOptSize = 2;
2080
2081 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
2082 setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
2083
2084 // An out-of-order CPU can speculatively execute past a predictable branch,
2085 // but a conditional move could be stalled by an expensive earlier operation.
2086 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2087 EnableExtLdPromotion = true;
2088 setPrefFunctionAlignment(Align(16));
2089
2090 verifyIntrinsicTables();
2091
2092 // Default to having -disable-strictnode-mutation on
2093 IsStrictFPEnabled = true;
2094}
2095
2096// This has so far only been implemented for 64-bit MachO.
2097bool X86TargetLowering::useLoadStackGuardNode() const {
2098 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2099}
2100
2101bool X86TargetLowering::useStackGuardXorFP() const {
2102 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2103 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2104}
2105
2106SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2107 const SDLoc &DL) const {
2108 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2109 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2110 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2111 return SDValue(Node, 0);
2112}
2113
2114TargetLoweringBase::LegalizeTypeAction
2115X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2116 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
2117 return TypeSplitVector;
2118
2119 if (VT.getVectorNumElements() != 1 &&
2120 VT.getVectorElementType() != MVT::i1)
2121 return TypeWidenVector;
2122
2123 return TargetLoweringBase::getPreferredVectorAction(VT);
2124}
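
// A minimal sketch of how these checks play out (illustrative, assuming the
// subtarget queries above): v32i1 on AVX512-without-BWI is split into two
// v16i1 halves; a non-i1 vector with an illegal element count such as v3i32
// is widened (to v4i32); single-element and other i1 vectors fall back to the
// generic policy.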
2125
2126MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2127 CallingConv::ID CC,
2128 EVT VT) const {
2129 // v32i1 vectors should be promoted to v32i8 to match avx2.
2130 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
2131 return MVT::v32i8;
2132 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2133 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2134 Subtarget.hasAVX512() &&
2135 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2136 (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
2137 (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
2138 return MVT::i8;
2139 // Split v64i1 vectors if we don't have v64i8 available.
2140 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2141 CC != CallingConv::X86_RegCall)
2142 return MVT::v32i1;
2143 // FIXME: Should we just make these types legal and custom split operations?
2144 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
2145 Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
2146 return MVT::v16i32;
2147 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2148}
2149
2150unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2151 CallingConv::ID CC,
2152 EVT VT) const {
2153 // v32i1 vectors should be promoted to v32i8 to match avx2.
2154 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
2155 return 1;
2156 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2157 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2158 Subtarget.hasAVX512() &&
2159 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2160 (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
2161 (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
2162 return VT.getVectorNumElements();
2163 // Split v64i1 vectors if we don't have v64i8 available.
2164 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2165 CC != CallingConv::X86_RegCall)
2166 return 2;
2167 // FIXME: Should we just make these types legal and custom split operations?
2168 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
2169 Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
2170 return 1;
2171 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2172}
2173
2174unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2175 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2176 unsigned &NumIntermediates, MVT &RegisterVT) const {
2177 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2178 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2179 Subtarget.hasAVX512() &&
2180 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2181 (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
2182 (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) {
2183 RegisterVT = MVT::i8;
2184 IntermediateVT = MVT::i1;
2185 NumIntermediates = VT.getVectorNumElements();
2186 return NumIntermediates;
2187 }
2188
2189 // Split v64i1 vectors if we don't have v64i8 available.
2190 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2191 CC != CallingConv::X86_RegCall) {
2192 RegisterVT = MVT::v32i1;
2193 IntermediateVT = MVT::v32i1;
2194 NumIntermediates = 2;
2195 return 2;
2196 }
2197
2198 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2199 NumIntermediates, RegisterVT);
2200}
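
// A worked example of the breakdown above (a sketch, assuming AVX512BW with
// prefer-vector-width=256 so useAVX512Regs() is false): a v64i1 argument in a
// non-RegCall convention comes back as IntermediateVT = RegisterVT = v32i1
// with NumIntermediates = 2, i.e. it is passed in two k-mask registers. An
// odd-sized mask such as v5i1 instead hits the first branch and is scalarized
// into five i1 pieces carried in i8 registers.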
2201
2202EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2203 LLVMContext& Context,
2204 EVT VT) const {
2205 if (!VT.isVector())
2206 return MVT::i8;
2207
2208 if (Subtarget.hasAVX512()) {
2209 const unsigned NumElts = VT.getVectorNumElements();
2210
2211 // Figure out what this type will be legalized to.
2212 EVT LegalVT = VT;
2213 while (getTypeAction(Context, LegalVT) != TypeLegal)
2214 LegalVT = getTypeToTransformTo(Context, LegalVT);
2215
2216 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2217 if (LegalVT.getSimpleVT().is512BitVector())
2218 return EVT::getVectorVT(Context, MVT::i1, NumElts);
2219
2220 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2221 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2222 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2223 // vXi16/vXi8.
2224 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2225 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2226 return EVT::getVectorVT(Context, MVT::i1, NumElts);
2227 }
2228 }
2229
2230 return VT.changeVectorElementTypeToInteger();
2231}
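
// Roughly, the effect of the above (a sketch, not an exhaustive table): with
// 512-bit AVX512 registers a v16f32 compare yields v16i1; with VLX a v4i32
// compare yields v4i1 (and with BWI so does v8i16); without AVX512 the result
// type simply mirrors the operand, e.g. v4i32 compares produce v4i32 masks,
// and any scalar compare produces i8.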
2232
2233/// Helper for getByValTypeAlignment to determine
2234/// the desired ByVal argument alignment.
2235static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
2236 if (MaxAlign == 16)
2237 return;
2238 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2239 if (VTy->getBitWidth() == 128)
2240 MaxAlign = 16;
2241 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2242 unsigned EltAlign = 0;
2243 getMaxByValAlign(ATy->getElementType(), EltAlign);
2244 if (EltAlign > MaxAlign)
2245 MaxAlign = EltAlign;
2246 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2247 for (auto *EltTy : STy->elements()) {
2248 unsigned EltAlign = 0;
2249 getMaxByValAlign(EltTy, EltAlign);
2250 if (EltAlign > MaxAlign)
2251 MaxAlign = EltAlign;
2252 if (MaxAlign == 16)
2253 break;
2254 }
2255 }
2256}
2257
2258/// Return the desired alignment for ByVal aggregate
2259/// function arguments in the caller parameter area. For X86, aggregates
2260/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2261/// are at 4-byte boundaries.
2262unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
2263 const DataLayout &DL) const {
2264 if (Subtarget.is64Bit()) {
2265 // Max of 8 and alignment of type.
2266 unsigned TyAlign = DL.getABITypeAlignment(Ty);
2267 if (TyAlign > 8)
2268 return TyAlign;
2269 return 8;
2270 }
2271
2272 unsigned Align = 4;
2273 if (Subtarget.hasSSE1())
2274 getMaxByValAlign(Ty, Align);
2275 return Align;
2276}
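
// A couple of concrete cases for the rule above (sketched, assuming default
// ABI alignments): on x86-64 a byval struct of two i32s gets alignment 8,
// while a byval struct containing a 128-bit vector member gets its natural
// 16-byte alignment; on 32-bit with SSE1 the same vector-bearing struct is
// bumped to 16 by getMaxByValAlign, and a plain integer struct stays at 4.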
2277
2278/// It returns EVT::Other if the type should be determined using generic
2279/// target-independent logic.
2280/// For vector ops we check that the overall size isn't larger than our
2281/// preferred vector width.
2282EVT X86TargetLowering::getOptimalMemOpType(
2283 const MemOp &Op, const AttributeList &FuncAttributes) const {
2284 if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
2285 if (Op.size() >= 16 &&
2286 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2287 // FIXME: Check if unaligned 64-byte accesses are slow.
2288 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2289 (Subtarget.getPreferVectorWidth() >= 512)) {
2290 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2291 }
2292 // FIXME: Check if unaligned 32-byte accesses are slow.
2293 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2294 (Subtarget.getPreferVectorWidth() >= 256)) {
2295 // Although this isn't a well-supported type for AVX1, we'll let
2296 // legalization and shuffle lowering produce the optimal codegen. If we
2297 // choose an optimal type with a vector element larger than a byte,
2298 // getMemsetStores() may create an intermediate splat (using an integer
2299 // multiply) before we splat as a vector.
2300 return MVT::v32i8;
2301 }
2302 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2303 return MVT::v16i8;
2304 // TODO: Can SSE1 handle a byte vector?
2305 // If we have SSE1 registers we should be able to use them.
2306 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2307 (Subtarget.getPreferVectorWidth() >= 128))
2308 return MVT::v4f32;
2309 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2310 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2311 // Do not use f64 to lower memcpy if source is string constant. It's
2312 // better to use i32 to avoid the loads.
2313 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2314 // The gymnastics of splatting a byte value into an XMM register and then
2315 // only using 8-byte stores (because this is a CPU with slow unaligned
2316 // 16-byte accesses) makes that a loser.
2317 return MVT::f64;
2318 }
2319 }
2320 // This is a compromise. If we reach here, unaligned accesses may be slow on
2321 // this target. However, creating smaller, aligned accesses could be even
2322 // slower and would certainly be a lot more code.
2323 if (Subtarget.is64Bit() && Op.size() >= 8)
2324 return MVT::i64;
2325 return MVT::i32;
2326}
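
// The selection above is a simple priority cascade; as a sketch (assuming no
// NoImplicitFloat attribute and fast-enough unaligned access): a 64+ byte
// memcpy on an AVX-512 target preferring 512-bit vectors is lowered with
// v64i8 (BWI) or v16i32 stores, a 32+ byte copy on AVX with v32i8, a 16+ byte
// copy on SSE2 with v16i8, and anything smaller falls back to i64 (for 8+
// bytes on 64-bit targets) or i32.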
2327
2328bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2329 if (VT == MVT::f32)
2330 return X86ScalarSSEf32;
2331 else if (VT == MVT::f64)
2332 return X86ScalarSSEf64;
2333 return true;
2334}
2335
2336bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2337 EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
2338 bool *Fast) const {
2339 if (Fast) {
2340 switch (VT.getSizeInBits()) {
2341 default:
2342 // 8-byte and under are always assumed to be fast.
2343 *Fast = true;
2344 break;
2345 case 128:
2346 *Fast = !Subtarget.isUnalignedMem16Slow();
2347 break;
2348 case 256:
2349 *Fast = !Subtarget.isUnalignedMem32Slow();
2350 break;
2351 // TODO: What about AVX-512 (512-bit) accesses?
2352 }
2353 }
2354 // NonTemporal vector memory ops must be aligned.
2355 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2356    // NT loads can only be vector aligned, so if it's less aligned than the
2357 // minimum vector size (which we can split the vector down to), we might as
2358 // well use a regular unaligned vector load.
2359 // We don't have any NT loads pre-SSE41.
2360 if (!!(Flags & MachineMemOperand::MOLoad))
2361 return (Align < 16 || !Subtarget.hasSSE41());
2362 return false;
2363 }
2364 // Misaligned accesses of any size are always allowed.
2365 return true;
2366}
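
// For example (a sketch of the logic above): an ordinary under-aligned
// 256-bit access is allowed, with *Fast set only when 32-byte unaligned
// accesses are not slow on this subtarget; an under-aligned non-temporal
// vector store is rejected, while a non-temporal vector load aligned below 16
// bytes (or on a target without SSE4.1) is allowed and simply emitted as a
// regular unaligned vector load.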
2367
2368/// Return the entry encoding for a jump table in the
2369/// current function. The returned value is a member of the
2370/// MachineJumpTableInfo::JTEntryKind enum.
2371unsigned X86TargetLowering::getJumpTableEncoding() const {
2372 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2373 // symbol.
2374 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2375 return MachineJumpTableInfo::EK_Custom32;
2376
2377 // Otherwise, use the normal jump table encoding heuristics.
2378 return TargetLowering::getJumpTableEncoding();
2379}
2380
2381bool X86TargetLowering::useSoftFloat() const {
2382 return Subtarget.useSoftFloat();
2383}
2384
2385void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2386 ArgListTy &Args) const {
2387
2388 // Only relabel X86-32 for C / Stdcall CCs.
2389 if (Subtarget.is64Bit())
2390 return;
2391 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2392 return;
2393 unsigned ParamRegs = 0;
2394 if (auto *M = MF->getFunction().getParent())
2395 ParamRegs = M->getNumberRegisterParameters();
2396
2397  // Mark the first N integer arguments as being passed in registers (inreg).
2398 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
2399 Type *T = Args[Idx].Ty;
2400 if (T->isIntOrPtrTy())
2401 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2402 unsigned numRegs = 1;
2403 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2404 numRegs = 2;
2405 if (ParamRegs < numRegs)
2406 return;
2407 ParamRegs -= numRegs;
2408 Args[Idx].IsInReg = true;
2409 }
2410 }
2411}
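
// A small worked example for the loop above (a sketch, assuming an i686
// module built with three register parameters, e.g. -mregparm=3): for a
// libcall taking (i32, i64, i32), the first i32 consumes one register and the
// i64 the remaining two, so those two arguments get IsInReg while the
// trailing i32 and anything after it stay on the stack.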
2412
2413const MCExpr *
2414X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2415 const MachineBasicBlock *MBB,
2416 unsigned uid,MCContext &Ctx) const{
2417  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2418 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2419 // entries.
2420 return MCSymbolRefExpr::create(MBB->getSymbol(),
2421 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2422}
2423
2424/// Returns relocation base for the given PIC jumptable.
2425SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2426 SelectionDAG &DAG) const {
2427 if (!Subtarget.is64Bit())
2428 // This doesn't have SDLoc associated with it, but is not really the
2429 // same as a Register.
2430 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2431 getPointerTy(DAG.getDataLayout()));
2432 return Table;
2433}
2434
2435/// This returns the relocation base for the given PIC jumptable,
2436/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2437const MCExpr *X86TargetLowering::
2438getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2439 MCContext &Ctx) const {
2440 // X86-64 uses RIP relative addressing based on the jump table label.
2441 if (Subtarget.isPICStyleRIPRel())
2442 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2443
2444 // Otherwise, the reference is relative to the PIC base.
2445 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2446}
2447
2448std::pair<const TargetRegisterClass *, uint8_t>
2449X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2450 MVT VT) const {
2451 const TargetRegisterClass *RRC = nullptr;
2452 uint8_t Cost = 1;
2453 switch (VT.SimpleTy) {
2454 default:
2455 return TargetLowering::findRepresentativeClass(TRI, VT);
2456 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2457 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2458 break;
2459 case MVT::x86mmx:
2460 RRC = &X86::VR64RegClass;
2461 break;
2462 case MVT::f32: case MVT::f64:
2463 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2464 case MVT::v4f32: case MVT::v2f64:
2465 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2466 case MVT::v8f32: case MVT::v4f64:
2467 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2468 case MVT::v16f32: case MVT::v8f64:
2469 RRC = &X86::VR128XRegClass;
2470 break;
2471 }
2472 return std::make_pair(RRC, Cost);
2473}
2474
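// Note on findRepresentativeClass() above: the (class, cost) pair is only a
// rough hint. It ends up in the generic RepRegClassForVT tables behind
// getRepRegClassFor() and is consumed by scheduling register-pressure
// heuristics, which is presumably why every vector type maps to VR128X here
// rather than to its exact-width class.
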
2475unsigned X86TargetLowering::getAddressSpace() const {
2476 if (Subtarget.is64Bit())
2477 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2478 return 256;
2479}
2480
2481static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2482 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2483 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2484}
2485
2486static Constant* SegmentOffset(IRBuilder<> &IRB,
2487 unsigned Offset, unsigned AddressSpace) {
2488 return ConstantExpr::getIntToPtr(
2489 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2490 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2491}
2492
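// For example, SegmentOffset(IRB, 0x28, 257) builds (roughly) the constant
//   inttoptr (i32 40 to i8* addrspace(257)*)
// i.e. a pointer into address space 257. On X86, address spaces 256, 257 and
// 258 are %gs-, %fs- and %ss-relative respectively, so a later load through
// this pointer becomes a segment-relative access such as %fs:0x28.
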
2493Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2494 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2495 // tcbhead_t; use it instead of the usual global variable (see
2496 // sysdeps/{i386,x86_64}/nptl/tls.h)
2497 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2498 if (Subtarget.isTargetFuchsia()) {
2499 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2500 return SegmentOffset(IRB, 0x10, getAddressSpace());
2501 } else {
2502 // %fs:0x28, unless we're using a Kernel code model, in which case
2503 // it's %gs:0x28; %gs:0x14 on i386.
2504 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2505 return SegmentOffset(IRB, Offset, getAddressSpace());
2506 }
2507 }
2508
2509 return TargetLowering::getIRStackGuard(IRB);
2510}
2511
2512void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2513 // MSVC CRT provides functionality for stack protection.
2514 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2515 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2516 // MSVC CRT has a global variable holding security cookie.
2517 M.getOrInsertGlobal("__security_cookie",
2518 Type::getInt8PtrTy(M.getContext()));
2519
2520 // MSVC CRT has a function to validate security cookie.
2521 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
2522 "__security_check_cookie", Type::getVoidTy(M.getContext()),
2523 Type::getInt8PtrTy(M.getContext()));
2524 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
2525 F->setCallingConv(CallingConv::X86_FastCall);
2526 F->addAttribute(1, Attribute::AttrKind::InReg);
2527 }
2528 return;
2529 }
2530 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2531 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2532 return;
2533 TargetLowering::insertSSPDeclarations(M);
2534}
2535
2536Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2537 // MSVC CRT has a global variable holding security cookie.
2538 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2539 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2540 return M.getGlobalVariable("__security_cookie");
2541 }
2542 return TargetLowering::getSDagStackGuard(M);
2543}
2544
2545Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2546 // MSVC CRT has a function to validate security cookie.
2547 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2548 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2549 return M.getFunction("__security_check_cookie");
2550 }
2551 return TargetLowering::getSSPStackGuardCheck(M);
2552}
2553
2554Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2555 if (Subtarget.getTargetTriple().isOSContiki())
2556 return getDefaultSafeStackPointerLocation(IRB, false);
2557
2558 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2559 // definition of TLS_SLOT_SAFESTACK in
2560 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2561 if (Subtarget.isTargetAndroid()) {
2562 // %fs:0x48, unless we're using a Kernel code model, in which case it's
2563 // %gs:0x48; %gs:0x24 on i386.
2564 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2565 return SegmentOffset(IRB, Offset, getAddressSpace());
2566 }
2567
2568 // Fuchsia is similar.
2569 if (Subtarget.isTargetFuchsia()) {
2570 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2571 return SegmentOffset(IRB, 0x18, getAddressSpace());
2572 }
2573
2574 return TargetLowering::getSafeStackPointerLocation(IRB);
2575}
2576
2577bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2578 unsigned DestAS) const {
2579 assert(SrcAS != DestAS && "Expected different address spaces!");
2580
2581 const TargetMachine &TM = getTargetMachine();
2582 if (TM.getPointerSize(SrcAS) != TM.getPointerSize(DestAS))
2583 return false;
2584
2585 return SrcAS < 256 && DestAS < 256;
2586}
2587
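// In other words, isNoopAddrSpaceCast() treats a cast between two "flat"
// address spaces (both below 256) of equal pointer width as a no-op, while
// any cast into or out of the segment-relative address spaces 256/257/258
// (%gs/%fs/%ss) is not, since those pointers are offsets from a segment base
// rather than plain addresses.
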
2588//===----------------------------------------------------------------------===//
2589// Return Value Calling Convention Implementation
2590//===----------------------------------------------------------------------===//
2591
2592bool X86TargetLowering::CanLowerReturn(
2593 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2594 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2595 SmallVector<CCValAssign, 16> RVLocs;
2596 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2597 return CCInfo.CheckReturn(Outs, RetCC_X86);
2598}
2599
2600const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2601 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2602 return ScratchRegs;
2603}
2604
2605/// Lowers mask values (v*i1) to the local register values
2606/// \returns DAG node after lowering to register type
2607static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2608 const SDLoc &Dl, SelectionDAG &DAG) {
2609 EVT ValVT = ValArg.getValueType();
2610
2611 if (ValVT == MVT::v1i1)
2612 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2613 DAG.getIntPtrConstant(0, Dl));
2614
2615 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2616 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2617 // Two stage lowering might be required
2618 // bitcast: v8i1 -> i8 / v16i1 -> i16
2619 // anyextend: i8 -> i32 / i16 -> i32
2620 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2621 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2622 if (ValLoc == MVT::i32)
2623 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2624 return ValToCopy;
2625 }
2626
2627 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2628 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2629 // One stage lowering is required
2630 // bitcast: v32i1 -> i32 / v64i1 -> i64
2631 return DAG.getBitcast(ValLoc, ValArg);
2632 }
2633
2634 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2635}
2636
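// Example for lowerMasksToReg() above (a sketch of the cases handled): a
// v16i1 mask headed for an i32 location becomes
//   t = bitcast v16i1 ValArg to i16
//   r = any_extend i16 t to i32
// while a v32i1 mask in an i32 location needs only the single bitcast, and a
// v1i1 mask is read out with EXTRACT_VECTOR_ELT.
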
2637/// Breaks v64i1 value into two registers and adds the new node to the DAG
2638static void Passv64i1ArgInRegs(
2639 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
2640 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, CCValAssign &VA,
2641 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2642 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2643 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2644 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2645 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2646        "The value should reside in two registers");
2647
2648 // Before splitting the value we cast it to i64
2649 Arg = DAG.getBitcast(MVT::i64, Arg);
2650
2651 // Splitting the value into two i32 types
2652 SDValue Lo, Hi;
2653 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2654 DAG.getConstant(0, Dl, MVT::i32));
2655 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2656 DAG.getConstant(1, Dl, MVT::i32));
2657
2658 // Attach the two i32 types into corresponding registers
2659 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2660 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2661}
2662
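// Illustration of Passv64i1ArgInRegs() above, under the 32-bit AVX512BW
// setup enforced by its asserts: the v64i1 mask, already carried as an i64,
// is split into its low and high i32 halves with EXTRACT_ELEMENT, and the
// two halves are attached to the GPRs described by VA and NextVA.
// getv64i1Argument() further down performs the inverse: it reads two i32
// registers, bitcasts each to v32i1 and concatenates them into a v64i1.
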
2663SDValue
2664X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2665 bool isVarArg,
2666 const SmallVectorImpl<ISD::OutputArg> &Outs,
2667 const SmallVectorImpl<SDValue> &OutVals,
2668 const SDLoc &dl, SelectionDAG &DAG) const {
2669 MachineFunction &MF = DAG.getMachineFunction();
2670 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2671
2672 // In some cases we need to disable registers from the default CSR list.
2673 // For example, when they are used for argument passing.
2674 bool ShouldDisableCalleeSavedRegister =
2675 CallConv == CallingConv::X86_RegCall ||
2676 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2677
2678 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2679 report_fatal_error("X86 interrupts may not return any value");
2680
2681 SmallVector<CCValAssign, 16> RVLocs;
2682 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2683 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2684
2685 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
2686 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2687 ++I, ++OutsIndex) {
2688 CCValAssign &VA = RVLocs[I];
2689 assert(VA.isRegLoc() && "Can only return in registers!");
2690
2691 // Add the register to the CalleeSaveDisableRegs list.
2692 if (ShouldDisableCalleeSavedRegister)
2693 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2694
2695 SDValue ValToCopy = OutVals[OutsIndex];
2696 EVT ValVT = ValToCopy.getValueType();
2697
2698 // Promote values to the appropriate types.
2699 if (VA.getLocInfo() == CCValAssign::SExt)
2700 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2701 else if (VA.getLocInfo() == CCValAssign::ZExt)
2702 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2703 else if (VA.getLocInfo() == CCValAssign::AExt) {
2704 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2705 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2706 else
2707 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2708 }
2709 else if (VA.getLocInfo() == CCValAssign::BCvt)
2710 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2711
2712 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2713        "Unexpected FP-extend for return value.");
2714
2715 // Report an error if we have attempted to return a value via an XMM
2716 // register and SSE was disabled.
2717 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
2718 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2719 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2720 } else if (!Subtarget.hasSSE2() &&
2721 X86::FR64XRegClass.contains(VA.getLocReg()) &&
2722 ValVT == MVT::f64) {
2723 // When returning a double via an XMM register, report an error if SSE2 is
2724 // not enabled.
2725 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2726 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2727 }
2728
2729 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2730 // the RET instruction and handled by the FP Stackifier.
2731 if (VA.getLocReg() == X86::FP0 ||
2732 VA.getLocReg() == X86::FP1) {
2733 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2734 // change the value to the FP stack register class.
2735 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2736 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2737 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2738 // Don't emit a copytoreg.
2739 continue;
2740 }
2741
2742 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2743 // which is returned in RAX / RDX.
2744 if (Subtarget.is64Bit()) {
2745 if (ValVT == MVT::x86mmx) {
2746 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2747 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2748 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2749 ValToCopy);
2750 // If we don't have SSE2 available, convert to v4f32 so the generated
2751 // register is legal.
2752 if (!Subtarget.hasSSE2())
2753 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2754 }
2755 }
2756 }
2757
2758 if (VA.needsCustom()) {
2759 assert(VA.getValVT() == MVT::v64i1 &&
2760        "Currently the only custom case is when we split v64i1 to 2 regs");
2761
2762 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
2763 Subtarget);
2764
2765 // Add the second register to the CalleeSaveDisableRegs list.
2766 if (ShouldDisableCalleeSavedRegister)
2767 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2768 } else {
2769 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2770 }
2771 }
2772
2773 SDValue Flag;
2774 SmallVector<SDValue, 6> RetOps;
2775 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2776 // Operand #1 = Bytes To Pop
2777 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2778 MVT::i32));
2779
2780 // Copy the result values into the output registers.
2781 for (auto &RetVal : RetVals) {
2782 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
2783 RetOps.push_back(RetVal.second);
2784 continue; // Don't emit a copytoreg.
2785 }
2786
2787 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
2788 Flag = Chain.getValue(1);
2789 RetOps.push_back(
2790 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
2791 }
2792
2793 // Swift calling convention does not require we copy the sret argument
2794 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2795
2796 // All x86 ABIs require that for returning structs by value we copy
2797 // the sret argument into %rax/%eax (depending on ABI) for the return.
2798 // We saved the argument into a virtual register in the entry block,
2799 // so now we copy the value out and into %rax/%eax.
2800 //
2801 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2802 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2803 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2804 // either case FuncInfo->setSRetReturnReg() will have been called.
2805 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2806 // When we have both sret and another return value, we should use the
2807 // original Chain stored in RetOps[0], instead of the current Chain updated
2808 // in the above loop. If we only have sret, RetOps[0] equals Chain.
2809
2810 // For the case of sret and another return value, we have
2811 // Chain_0 at the function entry
2812 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2813 // If we use Chain_1 in getCopyFromReg, we will have
2814 // Val = getCopyFromReg(Chain_1)
2815 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2816
2817 // getCopyToReg(Chain_0) will be glued together with
2818 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2819 // in Unit B, and we will have a cyclic dependency between Unit A and Unit B:
2820 // Data dependency from Unit B to Unit A due to usage of Val in
2821 // getCopyToReg(Chain_1, Val)
2822 // Chain dependency from Unit A to Unit B
2823
2824 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2825 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2826 getPointerTy(MF.getDataLayout()));
2827
2828 unsigned RetValReg
2829 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2830 X86::RAX : X86::EAX;
2831 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2832 Flag = Chain.getValue(1);
2833
2834 // RAX/EAX now acts like a return value.
2835 RetOps.push_back(
2836 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2837
2838 // Add the returned register to the CalleeSaveDisableRegs list.
2839 if (ShouldDisableCalleeSavedRegister)
2840 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2841 }
2842
2843 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2844 const MCPhysReg *I =
2845 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2846 if (I) {
2847 for (; *I; ++I) {
2848 if (X86::GR64RegClass.contains(*I))
2849 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2850 else
2851 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2852 }
2853 }
2854
2855 RetOps[0] = Chain; // Update chain.
2856
2857 // Add the flag if we have it.
2858 if (Flag.getNode())
2859 RetOps.push_back(Flag);
2860
2861 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2862 if (CallConv == CallingConv::X86_INTR)
2863 opcode = X86ISD::IRET;
2864 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2865}
2866
2867bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2868 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2869 return false;
2870
2871 SDValue TCChain = Chain;
2872 SDNode *Copy = *N->use_begin();
2873 if (Copy->getOpcode() == ISD::CopyToReg) {
2874 // If the copy has a glue operand, we conservatively assume it isn't safe to
2875 // perform a tail call.
2876 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2877 return false;
2878 TCChain = Copy->getOperand(0);
2879 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2880 return false;
2881
2882 bool HasRet = false;
2883 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2884 UI != UE; ++UI) {
2885 if (UI->getOpcode() != X86ISD::RET_FLAG)
2886 return false;
2887 // If we are returning more than one value, we can definitely
2888 // not make a tail call; see PR19530.
2889 if (UI->getNumOperands() > 4)
2890 return false;
2891 if (UI->getNumOperands() == 4 &&
2892 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2893 return false;
2894 HasRet = true;
2895 }
2896
2897 if (!HasRet)
2898 return false;
2899
2900 Chain = TCChain;
2901 return true;
2902}
2903
2904EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2905 ISD::NodeType ExtendKind) const {
2906 MVT ReturnMVT = MVT::i32;
2907
2908 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2909 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2910 // The ABI does not require i1, i8 or i16 to be extended.
2911 //
2912 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2913 // always extending i8/i16 return values, so keep doing that for now.
2914 // (PR26665).
2915 ReturnMVT = MVT::i8;
2916 }
2917
2918 EVT MinVT = getRegisterType(Context, ReturnMVT);
2919 return VT.bitsLT(MinVT) ? MinVT : VT;
2920}
2921
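// Example for getTypeForExtReturn() above: an i8 return value is reported as
// i8 on non-Darwin targets (no ABI-mandated promotion) but as i32 on Darwin,
// where code in the wild depends on the old always-extend behaviour
// (PR26665); an i1 return is widened to at least i8 in either case.
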
2922/// Reads two 32 bit registers and creates a 64 bit mask value.
2923/// \param VA The current 32 bit value that needs to be assigned.
2924/// \param NextVA The next 32 bit value that needs to be assigned.
2925/// \param Root The parent DAG node.
2926/// \param [in,out] InFlag Represents the SDValue in the parent DAG node used
2927///                        for glue purposes. If the DAG is already using a
2928///                        physical register instead of a virtual one, we
2929///                        should glue our new SDValue to the InFlag SDValue.
2930/// \return a new 64 bit SDValue.
2931static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2932 SDValue &Root, SelectionDAG &DAG,
2933 const SDLoc &Dl, const X86Subtarget &Subtarget,
2934 SDValue *InFlag = nullptr) {
2935 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2936 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2937 assert(VA.getValVT() == MVT::v64i1 &&
2938        "Expecting first location of 64 bit width type");
2939 assert(NextVA.getValVT() == VA.getValVT() &&
2940        "The locations should have the same type");
2941 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2942        "The values should reside in two registers");
2943
2944 SDValue Lo, Hi;
2945 SDValue ArgValueLo, ArgValueHi;
2946
2947 MachineFunction &MF = DAG.getMachineFunction();
2948 const TargetRegisterClass *RC = &X86::GR32RegClass;
2949
2950 // Read a 32 bit value from the registers.
2951 if (nullptr == InFlag) {
2952 // When no physical register is present,
2953 // create an intermediate virtual register.
2954 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2955 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2956 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2957 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2958 } else {
2959 // When a physical register is available read the value from it and glue
2960 // the reads together.
2961 ArgValueLo =
2962 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2963 *InFlag = ArgValueLo.getValue(2);
2964 ArgValueHi =
2965 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2966 *InFlag = ArgValueHi.getValue(2);
2967 }
2968
2969 // Convert the i32 type into v32i1 type.
2970 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2971
2972 // Convert the i32 type into v32i1 type.
2973 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2974
2975 // Concatenate the two values together.
2976 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2977}
2978
2979/// Lower a register of various sizes (8/16/32/64)
2980/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2981/// \returns a DAG node containing the operand after lowering to mask type.
2982static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2983 const EVT &ValLoc, const SDLoc &Dl,
2984 SelectionDAG &DAG) {
2985 SDValue ValReturned = ValArg;
2986
2987 if (ValVT == MVT::v1i1)
2988 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2989
2990 if (ValVT == MVT::v64i1) {
2991 // On 32 bit machines, this case is handled by getv64i1Argument.
2992 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2993 // On 64 bit machines there is no need to truncate the value, only bitcast it.
2994 } else {
2995 MVT maskLen;
2996 switch (ValVT.getSimpleVT().SimpleTy) {
2997 case MVT::v8i1:
2998 maskLen = MVT::i8;
2999 break;
3000 case MVT::v16i1:
3001 maskLen = MVT::i16;
3002 break;
3003 case MVT::v32i1:
3004 maskLen = MVT::i32;
3005 break;
3006 default:
3007 llvm_unreachable("Expecting a vector of i1 types");
3008 }
3009
3010 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3011 }
3012 return DAG.getBitcast(ValVT, ValReturned);
3013}
3014
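// Example for lowerRegToMasks() above: a v16i1 value that arrived in an i32
// location is truncated to i16 and bitcast to v16i1; a v64i1 value in an i64
// location (64-bit targets only, per the assert) needs just the bitcast; and
// v1i1 is rebuilt with SCALAR_TO_VECTOR.
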
3015/// Lower the result values of a call into the
3016/// appropriate copies out of appropriate physical registers.
3017///
3018SDValue X86TargetLowering::LowerCallResult(
3019 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3020 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3021 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3022 uint32_t *RegMask) const {
3023
3024 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3025 // Assign locations to each value returned by this call.
3026 SmallVector<CCValAssign, 16> RVLocs;
3027 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3028 *DAG.getContext());
3029 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3030
3031 // Copy all of the result registers out of their specified physreg.
3032 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3033 ++I, ++InsIndex) {
3034 CCValAssign &VA = RVLocs[I];
3035 EVT CopyVT = VA.getLocVT();
3036
3037 // In some calling conventions we need to remove the used registers
3038 // from the register mask.
3039 if (RegMask) {
3040 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3041 SubRegs.isValid(); ++SubRegs)
3042 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3043 }
3044
3045 // Report an error if there was an attempt to return FP values via XMM
3046 // registers.
3047 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3048 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3049 if (VA.getLocReg() == X86::XMM1)
3050 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3051 else
3052 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3053 } else if (!Subtarget.hasSSE2() &&
3054 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3055 CopyVT == MVT::f64) {
3056 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3057 if (VA.getLocReg() == X86::XMM1)
3058 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3059 else
3060 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3061 }
3062
3063 // If we prefer to use the value in xmm registers, copy it out as f80 and
3064 // use a truncate to move it from fp stack reg to xmm reg.
3065 bool RoundAfterCopy = false;
3066 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3067 isScalarFPTypeInSSEReg(VA.getValVT())) {
3068 if (!Subtarget.hasX87())
3069 report_fatal_error("X87 register return with X87 disabled");
3070 CopyVT = MVT::f80;
3071 RoundAfterCopy = (CopyVT != VA.getLocVT());
3072 }
3073
3074 SDValue Val;
3075 if (VA.needsCustom()) {
3076 assert(VA.getValVT() == MVT::v64i1 &&
3077        "Currently the only custom case is when we split v64i1 to 2 regs");
3078 Val =
3079 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
3080 } else {
3081 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
3082 .getValue(1);
3083 Val = Chain.getValue(0);
3084 InFlag = Chain.getValue(2);
3085 }
3086
3087 if (RoundAfterCopy)
3088 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3089 // This truncation won't change the value.
3090 DAG.getIntPtrConstant(1, dl));
3091
3092 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
3093 if (VA.getValVT().isVector() &&
3094 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3095 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3096 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3097 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3098 } else
3099 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3100 }
3101
3102 if (VA.getLocInfo() == CCValAssign::BCvt)
3103 Val = DAG.getBitcast(VA.getValVT(), Val);
3104
3105 InVals.push_back(Val);
3106 }
3107
3108 return Chain;
3109}
3110
3111//===----------------------------------------------------------------------===//
3112// C & StdCall & Fast Calling Convention implementation
3113//===----------------------------------------------------------------------===//
3114// The StdCall calling convention is standard for many Windows API routines.
3115// It differs from the C calling convention only slightly: the callee cleans
3116// up the stack rather than the caller, and symbols are typically decorated
3117// with an argument-size suffix. It doesn't support any vector arguments.
3118// For info on fast calling convention see Fast Calling Convention (tail call)
3119// implementation LowerX86_32FastCCCallTo.
3120
3121/// CallIsStructReturn - Determines whether a call uses struct return
3122/// semantics.
3123enum StructReturnType {
3124 NotStructReturn,
3125 RegStructReturn,
3126 StackStructReturn
3127};
3128static StructReturnType
3129callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
3130 if (Outs.empty())
3131 return NotStructReturn;
3132
3133 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
3134 if (!Flags.isSRet())
3135 return NotStructReturn;
3136 if (Flags.isInReg() || IsMCU)
3137 return RegStructReturn;
3138 return StackStructReturn;
3139}
3140
3141/// Determines whether a function uses struct return semantics.
3142static StructReturnType
3143argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
3144 if (Ins.empty())
3145 return NotStructReturn;
3146
3147 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
3148 if (!Flags.isSRet())
3149 return NotStructReturn;
3150 if (Flags.isInReg() || IsMCU)
3151 return RegStructReturn;
3152 return StackStructReturn;
3153}
3154
3155/// Make a copy of an aggregate at address specified by "Src" to address
3156/// "Dst" with size and alignment information specified by the specific
3157/// parameter attribute. The copy will be passed as a byval function parameter.
3158static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3159 SDValue Chain, ISD::ArgFlagsTy Flags,
3160 SelectionDAG &DAG, const SDLoc &dl) {
3161 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
3162
3163 return DAG.getMemcpy(
3164 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3165 /*isVolatile*/ false, /*AlwaysInline=*/true,
3166 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3167}
3168
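// Note on CreateCopyOfByValArgument() above: the memcpy node is created with
// AlwaysInline=true, so the byval copy is always expanded inline rather than
// turned into a call to the memcpy library routine, presumably because it is
// emitted while another call's argument list is being lowered.
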
3169/// Return true if the calling convention is one that we can guarantee TCO for.
3170static bool canGuaranteeTCO(CallingConv::ID CC) {
3171 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3172 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3173 CC == CallingConv::HHVM || CC == CallingConv::Tail);
3174}
3175
3176/// Return true if we might ever do TCO for calls with this calling convention.
3177static bool mayTailCallThisCC(CallingConv::ID CC) {
3178 switch (CC) {
3179 // C calling conventions:
3180 case CallingConv::C:
3181 case CallingConv::Win64:
3182 case CallingConv::X86_64_SysV:
3183 // Callee pop conventions:
3184 case CallingConv::X86_ThisCall:
3185 case CallingConv::X86_StdCall:
3186 case CallingConv::X86_VectorCall:
3187 case CallingConv::X86_FastCall:
3188 // Swift:
3189 case CallingConv::Swift:
3190 return true;
3191 default:
3192 return canGuaranteeTCO(CC);
3193 }
3194}
3195
3196/// Return true if the function is being made into a tailcall target by
3197/// changing its ABI.
3198static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3199 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail;
3200}
3201
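// Taken together: shouldGuaranteeTCO() returns true, forcing the ABI-changing
// guaranteed-tail-call lowering, either when the convention itself demands it
// (CallingConv::Tail, i.e. tailcc) or when GuaranteedTailCallOpt
// (-tailcallopt) is set and the convention is one listed in canGuaranteeTCO();
// mayTailCallThisCC() above is the much weaker "could we ever try" filter used
// by mayBeEmittedAsTailCall() below.
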
3202bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3203 if (!CI->isTailCall())
3204 return false;
3205
3206 ImmutableCallSite CS(CI);
3207 CallingConv::ID CalleeCC = CS.getCallingConv();
3208 if (!mayTailCallThisCC(CalleeCC))
3209 return false;
3210
3211 return true;
3212}
3213
3214SDValue
3215X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3216 const SmallVectorImpl<ISD::InputArg> &Ins,
3217 const SDLoc &dl, SelectionDAG &DAG,
3218 const CCValAssign &VA,
3219 MachineFrameInfo &MFI, unsigned i) const {
3220 // Create the nodes corresponding to a load from this parameter slot.
3221 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3222 bool AlwaysUseMutable = shouldGuaranteeTCO(
3223 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3224 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3225 EVT ValVT;
3226 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3227
3228 // If the value is passed by pointer, we have the address passed instead of the
3229 // value itself. No need to extend if the mask value and location share the same
3230 // absolute size.
3231 bool ExtendedInMem =
3232 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3233 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3234
3235 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3236 ValVT = VA.getLocVT();
3237 else
3238 ValVT = VA.getValVT();
3239
3240 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3241 // changed with more analysis.
3242 // In case of tail call optimization mark all arguments mutable, since they
3243 // could be overwritten by the lowering of arguments in case of a tail call.
3244 if (Flags.isByVal()) {
3245 unsigned Bytes = Flags.getByValSize();
3246 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3247
3248 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3249 // can be improved with deeper analysis.
3250 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3251 /*isAliased=*/true);
3252 return DAG.getFrameIndex(FI, PtrVT);
3253 }
3254
3255 // This is an argument in memory. We might be able to perform copy elision.
3256 // If the argument is passed directly in memory without any extension, then we
3257 // can perform copy elision. Large vector types, for example, may be passed
3258 // indirectly by pointer.
3259 if (Flags.isCopyElisionCandidate() &&
3260 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) {
3261 EVT ArgVT = Ins[i].ArgVT;
3262 SDValue PartAddr;
3263 if (Ins[i].PartOffset == 0) {
3264 // If this is a one-part value or the first part of a multi-part value,
3265 // create a stack object for the entire argument value type and return a
3266 // load from our portion of it. This assumes that if the first part of an
3267 // argument is in memory, the rest will also be in memory.
3268 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3269 /*IsImmutable=*/false);
3270 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3271 return DAG.getLoad(
3272 ValVT, dl, Chain, PartAddr,
3273 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3274 } else {
3275 // This is not the first piece of an argument in memory. See if there is
3276 // already a fixed stack object including this offset. If so, assume it
3277 // was created by the PartOffset == 0 branch above and create a load from
3278 // the appropriate offset into it.
3279 int64_t PartBegin = VA.getLocMemOffset();
3280 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3281 int FI = MFI.getObjectIndexBegin();
3282 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3283 int64_t ObjBegin = MFI.getObjectOffset(FI);
3284 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3285 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3286 break;
3287 }
3288 if (MFI.isFixedObjectIndex(FI)) {
3289 SDValue Addr =
3290 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3291 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3292 return DAG.getLoad(
3293 ValVT, dl, Chain, Addr,
3294 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3295 Ins[i].PartOffset));
3296 }
3297 }
3298 }
3299
3300 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3301 VA.getLocMemOffset(), isImmutable);
3302
3303 // Set SExt or ZExt flag.
3304 if (VA.getLocInfo() == CCValAssign::ZExt) {
3305 MFI.setObjectZExt(FI, true);
3306 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3307 MFI.setObjectSExt(FI, true);
3308 }
3309
3310 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3311 SDValue Val = DAG.getLoad(
3312 ValVT, dl, Chain, FIN,
3313 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3314 return ExtendedInMem
3315 ? (VA.getValVT().isVector()
3316 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3317 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3318 : Val;
3319}
3320
3321// FIXME: Get this from tablegen.
3322static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3323 const X86Subtarget &Subtarget) {
3324 assert(Subtarget.is64Bit());
3325
3326 if (Subtarget.isCallingConvWin64(CallConv)) {
3327 static const MCPhysReg GPR64ArgRegsWin64[] = {
3328 X86::RCX, X86::RDX, X86::R8, X86::R9
3329 };
3330 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3331 }
3332
3333 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3334 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3335 };
3336 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3337}
3338
3339// FIXME: Get this from tablegen.
3340static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3341 CallingConv::ID CallConv,
3342 const X86Subtarget &Subtarget) {
3343 assert(Subtarget.is64Bit());
3344 if (Subtarget.isCallingConvWin64(CallConv)) {
3345 // The XMM registers which might contain var arg parameters are shadowed
3346 // in their paired GPR. So we only need to save the GPRs to their home
3347 // slots.
3348 // TODO: __vectorcall will change this.
3349 return None;
3350 }
3351
3352 const Function &F = MF.getFunction();
3353 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
3354 bool isSoftFloat = Subtarget.useSoftFloat();
3355 assert(!(isSoftFloat && NoImplicitFloatOps) &&
3356        "SSE register cannot be used when SSE is disabled!");
3357 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
3358 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3359 // registers.
3360 return None;
3361
3362 static const MCPhysReg XMMArgRegs64Bit[] = {
3363 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3364 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3365 };
3366 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3367}
3368
3369#ifndef NDEBUG
3370static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3371 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
3372 [](const CCValAssign &A, const CCValAssign &B) -> bool {
3373 return A.getValNo() < B.getValNo();
3374 });
3375}
3376#endif
3377
3378SDValue X86TargetLowering::LowerFormalArguments(
3379 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3380 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3381 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3382 MachineFunction &MF = DAG.getMachineFunction();
3383 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3384 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3385
3386 const Function &F = MF.getFunction();
3387 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3388 F.getName() == "main")
3389 FuncInfo->setForceFramePointer(true);
3390
3391 MachineFrameInfo &MFI = MF.getFrameInfo();
3392 bool Is64Bit = Subtarget.is64Bit();
3393 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3394
3395 assert(
3396     !(isVarArg && canGuaranteeTCO(CallConv)) &&
3397     "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3398
3399 // Assign locations to all of the incoming arguments.
3400 SmallVector<CCValAssign, 16> ArgLocs;
3401 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3402
3403 // Allocate shadow area for Win64.
3404 if (IsWin64)
3405 CCInfo.AllocateStack(32, 8);
3406
3407 CCInfo.AnalyzeArguments(Ins, CC_X86);
3408
3409 // In vectorcall calling convention a second pass is required for the HVA
3410 // types.
3411 if (CallingConv::X86_VectorCall == CallConv) {
3412 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3413 }
3414
3415 // The next loop assumes that the locations are in the same order of the
3416 // input arguments.
3417 assert(isSortedByValueNo(ArgLocs) &&
3418        "Argument Location list must be sorted before lowering");
3419
3420 SDValue ArgValue;
3421 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3422 ++I, ++InsIndex) {
3423 assert(InsIndex < Ins.size() && "Invalid Ins index");
3424 CCValAssign &VA = ArgLocs[I];
3425
3426 if (VA.isRegLoc()) {
3427 EVT RegVT = VA.getLocVT();
3428 if (VA.needsCustom()) {
3429 assert(
3430     VA.getValVT() == MVT::v64i1 &&
3431     "Currently the only custom case is when we split v64i1 to 2 regs");
3432
3433 // v64i1 values, in regcall calling convention, that are
3434 // compiled to 32 bit arch, are split up into two registers.
3435 ArgValue =
3436 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3437 } else {
3438 const TargetRegisterClass *RC;
3439 if (RegVT == MVT::i8)
3440 RC = &X86::GR8RegClass;
3441 else if (RegVT == MVT::i16)
3442 RC = &X86::GR16RegClass;
3443 else if (RegVT == MVT::i32)
3444 RC = &X86::GR32RegClass;
3445 else if (Is64Bit && RegVT == MVT::i64)
3446 RC = &X86::GR64RegClass;
3447 else if (RegVT == MVT::f32)
3448 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3449 else if (RegVT == MVT::f64)
3450 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3451 else if (RegVT == MVT::f80)
3452 RC = &X86::RFP80RegClass;
3453 else if (RegVT == MVT::f128)
3454 RC = &X86::VR128RegClass;
3455 else if (RegVT.is512BitVector())
3456 RC = &X86::VR512RegClass;
3457 else if (RegVT.is256BitVector())
3458 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3459 else if (RegVT.is128BitVector())
3460 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3461 else if (RegVT == MVT::x86mmx)
3462 RC = &X86::VR64RegClass;
3463 else if (RegVT == MVT::v1i1)
3464 RC = &X86::VK1RegClass;
3465 else if (RegVT == MVT::v8i1)
3466 RC = &X86::VK8RegClass;
3467 else if (RegVT == MVT::v16i1)
3468 RC = &X86::VK16RegClass;
3469 else if (RegVT == MVT::v32i1)
3470 RC = &X86::VK32RegClass;
3471 else if (RegVT == MVT::v64i1)
3472 RC = &X86::VK64RegClass;
3473 else
3474 llvm_unreachable("Unknown argument type!");
3475
3476 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3477 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3478 }
3479
3480 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3481 // bits. Insert an assert[sz]ext to capture this, then truncate to the
3482 // right size.
3483 if (VA.getLocInfo() == CCValAssign::SExt)
3484 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3485 DAG.getValueType(VA.getValVT()));
3486 else if (VA.getLocInfo() == CCValAssign::ZExt)
3487 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3488 DAG.getValueType(VA.getValVT()));
3489 else if (VA.getLocInfo() == CCValAssign::BCvt)
3490 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3491
3492 if (VA.isExtInLoc()) {
3493 // Handle MMX values passed in XMM regs.
3494 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3495 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3496 else if (VA.getValVT().isVector() &&
3497 VA.getValVT().getScalarType() == MVT::i1 &&
3498 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3499 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3500 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3501 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3502 } else
3503 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3504 }
3505 } else {
3506 assert(VA.isMemLoc());
3507 ArgValue =
3508 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3509 }
3510
3511 // If value is passed via pointer - do a load.
3512 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
3513 ArgValue =
3514 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3515
3516 InVals.push_back(ArgValue);
3517 }
3518
3519 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3520 // Swift calling convention does not require we copy the sret argument
3521 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3522 if (CallConv == CallingConv::Swift)
3523 continue;
3524
3525 // All x86 ABIs require that for returning structs by value we copy the
3526 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3527 // the argument into a virtual register so that we can access it from the
3528 // return points.
3529 if (Ins[I].Flags.isSRet()) {
3530 unsigned Reg = FuncInfo->getSRetReturnReg();
3531 if (!Reg) {
3532 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3533 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3534 FuncInfo->setSRetReturnReg(Reg);
3535 }
3536 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3537 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3538 break;
3539 }
3540 }
3541
3542 unsigned StackSize = CCInfo.getNextStackOffset();
3543 // Align stack specially for tail calls.
3544 if (shouldGuaranteeTCO(CallConv,
3545 MF.getTarget().Options.GuaranteedTailCallOpt))
3546 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3547
3548 // If the function takes variable number of arguments, make a frame index for
3549 // the start of the first vararg value... for expansion of llvm.va_start. We
3550 // can skip this if there are no va_start calls.
3551 if (MFI.hasVAStart() &&
3552 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3553 CallConv != CallingConv::X86_ThisCall))) {
3554 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3555 }
3556
3557 // Figure out if XMM registers are in use.
3558  assert(!(Subtarget.useSoftFloat() &&
3559           F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
3560         "SSE register cannot be used when SSE is disabled!");
3561
3562 // 64-bit calling conventions support varargs and register parameters, so we
3563 // have to do extra work to spill them in the prologue.
3564 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3565 // Find the first unallocated argument registers.
3566 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3567 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3568 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3569 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3570    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3571           "SSE register cannot be used when SSE is disabled!");
3572
3573 // Gather all the live in physical registers.
3574 SmallVector<SDValue, 6> LiveGPRs;
3575 SmallVector<SDValue, 8> LiveXMMRegs;
3576 SDValue ALVal;
3577 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3578 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3579 LiveGPRs.push_back(
3580 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3581 }
3582 if (!ArgXMMs.empty()) {
3583 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3584 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3585 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3586 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3587 LiveXMMRegs.push_back(
3588 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3589 }
3590 }
3591
3592 if (IsWin64) {
3593 // Get to the caller-allocated home save location. Add 8 to account
3594 // for the return address.
3595 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3596 FuncInfo->setRegSaveFrameIndex(
3597 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3598 // Fixup to set vararg frame on shadow area (4 x i64).
3599 if (NumIntRegs < 4)
3600 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3601 } else {
3602 // For X86-64, if there are vararg parameters that are passed via
3603 // registers, then we must store them to their spots on the stack so
3604 // they may be loaded by dereferencing the result of va_next.
3605 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3606 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3607 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3608 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3609 }
3610
3611 // Store the integer parameter registers.
3612 SmallVector<SDValue, 8> MemOps;
3613 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3614 getPointerTy(DAG.getDataLayout()));
3615 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3616 for (SDValue Val : LiveGPRs) {
3617 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3618 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3619 SDValue Store =
3620 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3621 MachinePointerInfo::getFixedStack(
3622 DAG.getMachineFunction(),
3623 FuncInfo->getRegSaveFrameIndex(), Offset));
3624 MemOps.push_back(Store);
3625 Offset += 8;
3626 }
3627
3628 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3629 // Now store the XMM (fp + vector) parameter registers.
3630 SmallVector<SDValue, 12> SaveXMMOps;
3631 SaveXMMOps.push_back(Chain);
3632 SaveXMMOps.push_back(ALVal);
3633 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3634 FuncInfo->getRegSaveFrameIndex(), dl));
3635 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3636 FuncInfo->getVarArgsFPOffset(), dl));
3637 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3638 LiveXMMRegs.end());
3639 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3640 MVT::Other, SaveXMMOps));
3641 }
3642
3643 if (!MemOps.empty())
3644 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3645 }
3646
3647 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3648 // Find the largest legal vector type.
3649 MVT VecVT = MVT::Other;
3650 // FIXME: Only some x86_32 calling conventions support AVX512.
3651 if (Subtarget.useAVX512Regs() &&
3652 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3653 CallConv == CallingConv::Intel_OCL_BI)))
3654 VecVT = MVT::v16f32;
3655 else if (Subtarget.hasAVX())
3656 VecVT = MVT::v8f32;
3657 else if (Subtarget.hasSSE2())
3658 VecVT = MVT::v4f32;
3659
3660 // We forward some GPRs and some vector types.
3661 SmallVector<MVT, 2> RegParmTypes;
3662 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3663 RegParmTypes.push_back(IntVT);
3664 if (VecVT != MVT::Other)
3665 RegParmTypes.push_back(VecVT);
3666
3667 // Compute the set of forwarded registers. The rest are scratch.
3668 SmallVectorImpl<ForwardedRegister> &Forwards =
3669 FuncInfo->getForwardedMustTailRegParms();
3670 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3671
3672 // Forward AL for SysV x86_64 targets, since it is used for varargs.
3673 if (Is64Bit && !IsWin64 && !CCInfo.isAllocated(X86::AL)) {
3674 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3675 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3676 }
3677
3678 // Copy all forwards from physical to virtual registers.
3679 for (ForwardedRegister &FR : Forwards) {
3680 // FIXME: Can we use a less constrained schedule?
3681 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT);
3682 FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT));
3683 Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal);
3684 }
3685 }
3686
3687 // Some CCs need callee pop.
3688 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3689 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3690 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3691 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3692 // X86 interrupts must pop the error code (and the alignment padding) if
3693 // present.
3694 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3695 } else {
3696 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3697 // If this is an sret function, the return should pop the hidden pointer.
3698 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3699 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3700 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3701 FuncInfo->setBytesToPopOnReturn(4);
3702 }
3703
3704 if (!Is64Bit) {
3705 // RegSaveFrameIndex is X86-64 only.
3706 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3707 if (CallConv == CallingConv::X86_FastCall ||
3708 CallConv == CallingConv::X86_ThisCall)
3709 // fastcc functions can't have varargs.
3710 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3711 }
3712
3713 FuncInfo->setArgumentStackSize(StackSize);
3714
3715 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3716 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3717 if (Personality == EHPersonality::CoreCLR) {
3718      assert(Is64Bit);
3719 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3720 // that we'd prefer this slot be allocated towards the bottom of the frame
3721 // (i.e. near the stack pointer after allocating the frame). Every
3722 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3723 // offset from the bottom of this and each funclet's frame must be the
3724 // same, so the size of funclets' (mostly empty) frames is dictated by
3725 // how far this slot is from the bottom (since they allocate just enough
3726 // space to accommodate holding this slot at the correct offset).
3727 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3728 EHInfo->PSPSymFrameIdx = PSPSymFI;
3729 }
3730 }
3731
3732 if (CallConv == CallingConv::X86_RegCall ||
3733 F.hasFnAttribute("no_caller_saved_registers")) {
3734 MachineRegisterInfo &MRI = MF.getRegInfo();
3735 for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
3736 MRI.disableCalleeSavedRegister(Pair.first);
3737 }
3738
3739 return Chain;
3740}
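
The vararg spill code above fixes the SysV x86-64 register save area layout: 8-byte GPR slots followed by 16-byte XMM slots, with the GP/FP offsets pointing at the first register that was not consumed by named arguments. A minimal standalone sketch of that arithmetic, assuming the usual six integer and eight XMM argument registers (the struct and helper names below are illustrative, not LLVM API):

#include <cassert>
#include <cstdio>

// Mirrors the offsets recorded for va_start on SysV x86-64: GPRs are spilled
// into 8-byte slots, XMM registers follow in 16-byte slots, and the offsets
// point at the first *unallocated* register of each class.
struct RegSaveAreaLayout {
  unsigned GPOffset;  // corresponds to FuncInfo->getVarArgsGPOffset()
  unsigned FPOffset;  // corresponds to FuncInfo->getVarArgsFPOffset()
  unsigned AreaSize;  // size of the stack object backing the save area
};

static RegSaveAreaLayout computeLayout(unsigned NumIntRegsUsed,
                                       unsigned NumXMMRegsUsed) {
  const unsigned NumArgGPRs = 6; // rdi, rsi, rdx, rcx, r8, r9
  const unsigned NumArgXMMs = 8; // xmm0..xmm7
  assert(NumIntRegsUsed <= NumArgGPRs && NumXMMRegsUsed <= NumArgXMMs);
  RegSaveAreaLayout L;
  L.GPOffset = NumIntRegsUsed * 8;
  L.FPOffset = NumArgGPRs * 8 + NumXMMRegsUsed * 16;
  L.AreaSize = NumArgGPRs * 8 + NumArgXMMs * 16; // 176 bytes
  return L;
}

int main() {
  // e.g. a printf-like function: one GPR (the format string) and no XMM
  // registers are consumed by the fixed arguments.
  RegSaveAreaLayout L = computeLayout(/*NumIntRegsUsed=*/1, /*NumXMMRegsUsed=*/0);
  std::printf("gp_offset=%u fp_offset=%u size=%u\n",
              L.GPOffset, L.FPOffset, L.AreaSize); // prints 8, 48, 176
  return 0;
}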
3741
3742SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3743 SDValue Arg, const SDLoc &dl,
3744 SelectionDAG &DAG,
3745 const CCValAssign &VA,
3746 ISD::ArgFlagsTy Flags) const {
3747 unsigned LocMemOffset = VA.getLocMemOffset();
3748 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3749 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3750 StackPtr, PtrOff);
3751 if (Flags.isByVal())
3752 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3753
3754 return DAG.getStore(
3755 Chain, dl, Arg, PtrOff,
3756 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3757}
3758
3759/// Emit a load of return address if tail call
3760/// optimization is performed and it is required.
3761SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3762 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3763 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3764 // Adjust the Return address stack slot.
3765 EVT VT = getPointerTy(DAG.getDataLayout());
3766 OutRetAddr = getReturnAddressFrameIndex(DAG);
3767
3768 // Load the "old" Return address.
3769 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3770 return SDValue(OutRetAddr.getNode(), 1);
3771}
3772
3773/// Emit a store of the return address if tail call
3774/// optimization is performed and it is required (FPDiff!=0).
3775static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3776 SDValue Chain, SDValue RetAddrFrIdx,
3777 EVT PtrVT, unsigned SlotSize,
3778 int FPDiff, const SDLoc &dl) {
3779 // Store the return address to the appropriate stack slot.
3780 if (!FPDiff) return Chain;
3781 // Calculate the new stack slot for the return address.
3782 int NewReturnAddrFI =
3783 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3784 false);
3785 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3786 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3787 MachinePointerInfo::getFixedStack(
3788 DAG.getMachineFunction(), NewReturnAddrFI));
3789 return Chain;
3790}
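
A small self-contained sketch of the FPDiff arithmetic this helper relies on: when a guaranteed tail call needs more argument stack than the caller pops, the return address is re-stored below the enlarged argument area. The numbers are a made-up scenario, not taken from the source:

#include <cstdint>
#include <cstdio>

int main() {
  // Illustrative scenario: the caller pops 16 bytes of its own arguments on
  // return, but the tail-called function needs 48 bytes of argument space.
  const int64_t SlotSize = 8;              // return-address slot on x86-64
  const int64_t NumBytesCallerPushed = 16; // bytes the caller pops on return
  const int64_t NumBytes = 48;             // aligned outgoing argument size

  const int64_t FPDiff = NumBytesCallerPushed - NumBytes; // -32
  const int64_t NewRetAddrOffset = FPDiff - SlotSize;     // -40

  // A nonzero FPDiff means the old return address is copied into a fixed
  // stack object at NewRetAddrOffset so it still sits directly below the
  // (now larger) argument area.
  std::printf("FPDiff=%lld new return-address offset=%lld\n",
              (long long)FPDiff, (long long)NewRetAddrOffset);
  return 0;
}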
3791
3792/// Returns a vector_shuffle mask for a movs{s|d} or movd
3793/// operation of the specified width.
3794static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3795 SDValue V2) {
3796 unsigned NumElems = VT.getVectorNumElements();
3797 SmallVector<int, 8> Mask;
3798 Mask.push_back(NumElems);
3799 for (unsigned i = 1; i != NumElems; ++i)
3800 Mask.push_back(i);
3801 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3802}
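
For reference, a standalone sketch of the index pattern getMOVL builds, using plain integers instead of SelectionDAG nodes: lane 0 is taken from the second operand and every other lane passes through from the first.

#include <cstdio>
#include <vector>

// Same pattern as getMOVL for a vector with NumElems lanes:
// { NumElems, 1, 2, ..., NumElems-1 }.
static std::vector<int> movlMask(unsigned NumElems) {
  std::vector<int> Mask;
  Mask.push_back(NumElems);
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);
  return Mask;
}

int main() {
  for (int Idx : movlMask(4))  // prints: 4 1 2 3  (the v4f32 MOVSS pattern)
    std::printf("%d ", Idx);
  std::printf("\n");
  return 0;
}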
3803
3804SDValue
3805X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3806 SmallVectorImpl<SDValue> &InVals) const {
3807 SelectionDAG &DAG = CLI.DAG;
3808 SDLoc &dl = CLI.DL;
3809 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3810 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3811 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3812 SDValue Chain = CLI.Chain;
3813 SDValue Callee = CLI.Callee;
3814 CallingConv::ID CallConv = CLI.CallConv;
3815 bool &isTailCall = CLI.IsTailCall;
3816 bool isVarArg = CLI.IsVarArg;
3817
3818 MachineFunction &MF = DAG.getMachineFunction();
3819 bool Is64Bit = Subtarget.is64Bit();
3820 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3821 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3822 bool IsSibcall = false;
3823 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
3824 CallConv == CallingConv::Tail;
3825 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3826 const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
3827 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3828 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3829 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3830 const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
3831 bool HasNoCfCheck =
3832 (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
3833 const Module *M = MF.getMMI().getModule();
3834 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
3835
3836 MachineFunction::CallSiteInfo CSInfo;
3837
3838 if (CallConv == CallingConv::X86_INTR)
3839 report_fatal_error("X86 interrupts may not be called directly");
3840
3841 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
3842 // If we are using a GOT, disable tail calls to external symbols with
3843 // default visibility. Tail calling such a symbol requires using a GOT
3844 // relocation, which forces early binding of the symbol. This breaks code
3845    // that requires lazy function symbol resolution. Using musttail or
3846 // GuaranteedTailCallOpt will override this.
3847 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3848 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3849 G->getGlobal()->hasDefaultVisibility()))
3850 isTailCall = false;
3851 }
3852
3853 bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
3854 if (IsMustTail) {
3855 // Force this to be a tail call. The verifier rules are enough to ensure
3856 // that we can lower this successfully without moving the return address
3857 // around.
3858 isTailCall = true;
3859 } else if (isTailCall) {
3860 // Check if it's really possible to do a tail call.
3861 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3862 isVarArg, SR != NotStructReturn,
3863 MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3864 Outs, OutVals, Ins, DAG);
3865
3866 // Sibcalls are automatically detected tailcalls which do not require
3867 // ABI changes.
3868 if (!IsGuaranteeTCO && isTailCall)
3869 IsSibcall = true;
3870
3871 if (isTailCall)
3872 ++NumTailCalls;
3873 }
3874
3875  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3876         "Var args not supported with calling convention fastcc, ghc or hipe");
3877
3878 // Analyze operands of the call, assigning locations to each operand.
3879 SmallVector<CCValAssign, 16> ArgLocs;
3880 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3881
3882 // Allocate shadow area for Win64.
3883 if (IsWin64)
3884 CCInfo.AllocateStack(32, 8);
3885
3886 CCInfo.AnalyzeArguments(Outs, CC_X86);
3887
3888 // In vectorcall calling convention a second pass is required for the HVA
3889 // types.
3890 if (CallingConv::X86_VectorCall == CallConv) {
3891 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3892 }
3893
3894 // Get a count of how many bytes are to be pushed on the stack.
3895 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3896 if (IsSibcall)
3897 // This is a sibcall. The memory operands are available in caller's
3898 // own caller's stack.
3899 NumBytes = 0;
3900 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
3901 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3902
3903 int FPDiff = 0;
3904 if (isTailCall && !IsSibcall && !IsMustTail) {
3905 // Lower arguments at fp - stackoffset + fpdiff.
3906 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3907
3908 FPDiff = NumBytesCallerPushed - NumBytes;
3909
3910 // Set the delta of movement of the returnaddr stackslot.
3911 // But only set if delta is greater than previous delta.
3912 if (FPDiff < X86Info->getTCReturnAddrDelta())
3913 X86Info->setTCReturnAddrDelta(FPDiff);
3914 }
3915
3916 unsigned NumBytesToPush = NumBytes;
3917 unsigned NumBytesToPop = NumBytes;
3918
3919 // If we have an inalloca argument, all stack space has already been allocated
3920  // for us and will be right at the top of the stack. We don't support multiple
3921 // arguments passed in memory when using inalloca.
3922 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3923 NumBytesToPush = 0;
3924 if (!ArgLocs.back().isMemLoc())
3925 report_fatal_error("cannot use inalloca attribute on a register "
3926 "parameter");
3927 if (ArgLocs.back().getLocMemOffset() != 0)
3928 report_fatal_error("any parameter with the inalloca attribute must be "
3929 "the only memory argument");
3930 }
3931
3932 if (!IsSibcall && !IsMustTail)
3933 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3934 NumBytes - NumBytesToPush, dl);
3935
3936 SDValue RetAddrFrIdx;
3937 // Load return address for tail calls.
3938 if (isTailCall && FPDiff)
3939 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3940 Is64Bit, FPDiff, dl);
3941
3942 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3943 SmallVector<SDValue, 8> MemOpChains;
3944 SDValue StackPtr;
3945
3946 // The next loop assumes that the locations are in the same order of the
3947 // input arguments.
3948  assert(isSortedByValueNo(ArgLocs) &&
3949         "Argument Location list must be sorted before lowering");
3950
3951 // Walk the register/memloc assignments, inserting copies/loads. In the case
3952  // of tail call optimization, arguments are handled later.
3953 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3954 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3955 ++I, ++OutIndex) {
3956    assert(OutIndex < Outs.size() && "Invalid Out index");
3957 // Skip inalloca arguments, they have already been written.
3958 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3959 if (Flags.isInAlloca())
3960 continue;
3961
3962 CCValAssign &VA = ArgLocs[I];
3963 EVT RegVT = VA.getLocVT();
3964 SDValue Arg = OutVals[OutIndex];
3965 bool isByVal = Flags.isByVal();
3966
3967 // Promote the value if needed.
3968 switch (VA.getLocInfo()) {
3969    default: llvm_unreachable("Unknown loc info!");
3970 case CCValAssign::Full: break;
3971 case CCValAssign::SExt:
3972 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3973 break;
3974 case CCValAssign::ZExt:
3975 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3976 break;
3977 case CCValAssign::AExt:
3978 if (Arg.getValueType().isVector() &&
3979 Arg.getValueType().getVectorElementType() == MVT::i1)
3980 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3981 else if (RegVT.is128BitVector()) {
3982 // Special case: passing MMX values in XMM registers.
3983 Arg = DAG.getBitcast(MVT::i64, Arg);
3984 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3985 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3986 } else
3987 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3988 break;
3989 case CCValAssign::BCvt:
3990 Arg = DAG.getBitcast(RegVT, Arg);
3991 break;
3992 case CCValAssign::Indirect: {
3993 if (isByVal) {
3994 // Memcpy the argument to a temporary stack slot to prevent
3995 // the caller from seeing any modifications the callee may make
3996 // as guaranteed by the `byval` attribute.
3997 int FrameIdx = MF.getFrameInfo().CreateStackObject(
3998 Flags.getByValSize(),
3999 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4000 SDValue StackSlot =
4001 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4002 Chain =
4003 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4004 // From now on treat this as a regular pointer
4005 Arg = StackSlot;
4006 isByVal = false;
4007 } else {
4008 // Store the argument.
4009 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4010 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4011 Chain = DAG.getStore(
4012 Chain, dl, Arg, SpillSlot,
4013 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4014 Arg = SpillSlot;
4015 }
4016 break;
4017 }
4018 }
4019
4020 if (VA.needsCustom()) {
4021      assert(VA.getValVT() == MVT::v64i1 &&
4022             "Currently the only custom case is when we split v64i1 to 2 regs");
4023 // Split v64i1 value into two registers
4024 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4025 } else if (VA.isRegLoc()) {
4026 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4027 const TargetOptions &Options = DAG.getTarget().Options;
4028 if (Options.EnableDebugEntryValues)
4029 CSInfo.emplace_back(VA.getLocReg(), I);
4030 if (isVarArg && IsWin64) {
4031 // Win64 ABI requires argument XMM reg to be copied to the corresponding
4032 // shadow reg if callee is a varargs function.
4033 unsigned ShadowReg = 0;
4034 switch (VA.getLocReg()) {
4035 case X86::XMM0: ShadowReg = X86::RCX; break;
4036 case X86::XMM1: ShadowReg = X86::RDX; break;
4037 case X86::XMM2: ShadowReg = X86::R8; break;
4038 case X86::XMM3: ShadowReg = X86::R9; break;
4039 }
4040 if (ShadowReg)
4041 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4042 }
4043 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4044      assert(VA.isMemLoc());
4045 if (!StackPtr.getNode())
4046 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4047 getPointerTy(DAG.getDataLayout()));
4048 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4049 dl, DAG, VA, Flags));
4050 }
4051 }
4052
4053 if (!MemOpChains.empty())
4054 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4055
4056 if (Subtarget.isPICStyleGOT()) {
4057 // ELF / PIC requires GOT in the EBX register before function calls via PLT
4058 // GOT pointer.
4059 if (!isTailCall) {
4060 RegsToPass.push_back(std::make_pair(
4061 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4062 getPointerTy(DAG.getDataLayout()))));
4063 } else {
4064 // If we are tail calling and generating PIC/GOT style code load the
4065 // address of the callee into ECX. The value in ecx is used as target of
4066 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4067 // for tail calls on PIC/GOT architectures. Normally we would just put the
4068 // address of GOT into ebx and then call target@PLT. But for tail calls
4069 // ebx would be restored (since ebx is callee saved) before jumping to the
4070 // target@PLT.
4071
4072 // Note: The actual moving to ECX is done further down.
4073 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4074 if (G && !G->getGlobal()->hasLocalLinkage() &&
4075 G->getGlobal()->hasDefaultVisibility())
4076 Callee = LowerGlobalAddress(Callee, DAG);
4077 else if (isa<ExternalSymbolSDNode>(Callee))
4078 Callee = LowerExternalSymbol(Callee, DAG);
4079 }
4080 }
4081
4082 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
4083 // From AMD64 ABI document:
4084 // For calls that may call functions that use varargs or stdargs
4085 // (prototype-less calls or calls to functions containing ellipsis (...) in
4086 // the declaration) %al is used as hidden argument to specify the number
4087    // the declaration), %al is used as a hidden argument to specify the number
4088    // of SSE registers used. The contents of %al do not need to match exactly
4089    // the number of registers, but must be an upper bound on the number of SSE
4090    // registers used and must be in the range 0 - 8 inclusive.
4091 // Count the number of XMM registers allocated.
4092 static const MCPhysReg XMMArgRegs[] = {
4093 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4094 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4095 };
4096 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4097    assert((Subtarget.hasSSE1() || !NumXMMRegs)
4098           && "SSE registers cannot be used when SSE is disabled");
4099
4100 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
4101 DAG.getConstant(NumXMMRegs, dl,
4102 MVT::i8)));
4103 }
4104
4105 if (isVarArg && IsMustTail) {
4106 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4107 for (const auto &F : Forwards) {
4108 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4109 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
4110 }
4111 }
4112
4113 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4114 // don't need this because the eligibility check rejects calls that require
4115 // shuffling arguments passed in memory.
4116 if (!IsSibcall && isTailCall) {
4117 // Force all the incoming stack arguments to be loaded from the stack
4118 // before any new outgoing arguments are stored to the stack, because the
4119 // outgoing stack slots may alias the incoming argument stack slots, and
4120 // the alias isn't otherwise explicit. This is slightly more conservative
4121 // than necessary, because it means that each store effectively depends
4122 // on every argument instead of just those arguments it would clobber.
4123 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4124
4125 SmallVector<SDValue, 8> MemOpChains2;
4126 SDValue FIN;
4127 int FI = 0;
4128 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4129 ++I, ++OutsIndex) {
4130 CCValAssign &VA = ArgLocs[I];
4131
4132 if (VA.isRegLoc()) {
4133 if (VA.needsCustom()) {
4134          assert((CallConv == CallingConv::X86_RegCall) &&
4135                 "Expecting custom case only in regcall calling convention");
4136 // This means that we are in special case where one argument was
4137 // passed through two register locations - Skip the next location
4138 ++I;
4139 }
4140
4141 continue;
4142 }
4143
4144      assert(VA.isMemLoc());
4145 SDValue Arg = OutVals[OutsIndex];
4146 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4147 // Skip inalloca arguments. They don't require any work.
4148 if (Flags.isInAlloca())
4149 continue;
4150 // Create frame index.
4151 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4152 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4153 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4154 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4155
4156 if (Flags.isByVal()) {
4157 // Copy relative to framepointer.
4158 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4159 if (!StackPtr.getNode())
4160 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4161 getPointerTy(DAG.getDataLayout()));
4162 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4163 StackPtr, Source);
4164
4165 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4166 ArgChain,
4167 Flags, DAG, dl));
4168 } else {
4169 // Store relative to framepointer.
4170 MemOpChains2.push_back(DAG.getStore(
4171 ArgChain, dl, Arg, FIN,
4172 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4173 }
4174 }
4175
4176 if (!MemOpChains2.empty())
4177 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4178
4179 // Store the return address to the appropriate stack slot.
4180 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4181 getPointerTy(DAG.getDataLayout()),
4182 RegInfo->getSlotSize(), FPDiff, dl);
4183 }
4184
4185 // Build a sequence of copy-to-reg nodes chained together with token chain
4186 // and flag operands which copy the outgoing args into registers.
4187 SDValue InFlag;
4188 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4189 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4190 RegsToPass[i].second, InFlag);
4191 InFlag = Chain.getValue(1);
4192 }
4193
4194 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4195    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4196 // In the 64-bit large code model, we have to make all calls
4197 // through a register, since the call instruction's 32-bit
4198 // pc-relative offset may not be large enough to hold the whole
4199 // address.
4200 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4201 Callee->getOpcode() == ISD::ExternalSymbol) {
4202 // Lower direct calls to global addresses and external symbols. Setting
4203 // ForCall to true here has the effect of removing WrapperRIP when possible
4204 // to allow direct calls to be selected without first materializing the
4205 // address into a register.
4206 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4207 } else if (Subtarget.isTarget64BitILP32() &&
4208 Callee->getValueType(0) == MVT::i32) {
4209 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
4210 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4211 }
4212
4213 // Returns a chain & a flag for retval copy to use.
4214 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4215 SmallVector<SDValue, 8> Ops;
4216
4217 if (!IsSibcall && isTailCall && !IsMustTail) {
4218 Chain = DAG.getCALLSEQ_END(Chain,
4219 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4220 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
4221 InFlag = Chain.getValue(1);
4222 }
4223
4224 Ops.push_back(Chain);
4225 Ops.push_back(Callee);
4226
4227 if (isTailCall)
4228 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
4229
4230 // Add argument registers to the end of the list so that they are known live
4231 // into the call.
4232 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4233 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4234 RegsToPass[i].second.getValueType()));
4235
4236 // Add a register mask operand representing the call-preserved registers.
4237 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
4238 // set X86_INTR calling convention because it has the same CSR mask
4239 // (same preserved registers).
4240 const uint32_t *Mask = RegInfo->getCallPreservedMask(
4241 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
4242  assert(Mask && "Missing call preserved mask for calling convention");
4243
4244 // If this is an invoke in a 32-bit function using a funclet-based
4245 // personality, assume the function clobbers all registers. If an exception
4246 // is thrown, the runtime will not restore CSRs.
4247 // FIXME: Model this more precisely so that we can register allocate across
4248 // the normal edge and spill and fill across the exceptional edge.
4249 if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
4250 const Function &CallerFn = MF.getFunction();
4251 EHPersonality Pers =
4252 CallerFn.hasPersonalityFn()
4253 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4254 : EHPersonality::Unknown;
4255 if (isFuncletEHPersonality(Pers))
4256 Mask = RegInfo->getNoPreservedMask();
4257 }
4258
4259 // Define a new register mask from the existing mask.
4260 uint32_t *RegMask = nullptr;
4261
4262 // In some calling conventions we need to remove the used physical registers
4263 // from the reg mask.
4264 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
4265 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4266
4267 // Allocate a new Reg Mask and copy Mask.
4268 RegMask = MF.allocateRegMask();
4269 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4270 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4271
4272 // Make sure all sub registers of the argument registers are reset
4273 // in the RegMask.
4274 for (auto const &RegPair : RegsToPass)
4275 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4276 SubRegs.isValid(); ++SubRegs)
4277 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4278
4279 // Create the RegMask Operand according to our updated mask.
4280 Ops.push_back(DAG.getRegisterMask(RegMask));
4281 } else {
4282 // Create the RegMask Operand according to the static mask.
4283 Ops.push_back(DAG.getRegisterMask(Mask));
4284 }
4285
4286 if (InFlag.getNode())
4287 Ops.push_back(InFlag);
4288
4289 if (isTailCall) {
4290 // We used to do:
4291 //// If this is the first return lowered for this function, add the regs
4292 //// to the liveout set for the function.
4293 // This isn't right, although it's probably harmless on x86; liveouts
4294 // should be computed from returns not tail calls. Consider a void
4295 // function making a tail call to a function returning int.
4296 MF.getFrameInfo().setHasTailCall();
4297 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4298 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4299 return Ret;
4300 }
4301
4302 if (HasNoCfCheck && IsCFProtectionSupported) {
4303 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4304 } else {
4305 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4306 }
4307 InFlag = Chain.getValue(1);
4308 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4309
4310 // Save heapallocsite metadata.
4311 if (CLI.CS)
4312 if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite"))
4313 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4314
4315 // Create the CALLSEQ_END node.
4316 unsigned NumBytesForCalleeToPop;
4317 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4318 DAG.getTarget().Options.GuaranteedTailCallOpt))
4319 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
4320 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
4321 !Subtarget.getTargetTriple().isOSMSVCRT() &&
4322 SR == StackStructReturn)
4323 // If this is a call to a struct-return function, the callee
4324 // pops the hidden struct pointer, so we have to push it back.
4325 // This is common for Darwin/X86, Linux & Mingw32 targets.
4326 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
4327 NumBytesForCalleeToPop = 4;
4328 else
4329 NumBytesForCalleeToPop = 0; // Callee pops nothing.
4330
4331 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
4332 // No need to reset the stack after the call if the call doesn't return. To
4333 // make the MI verify, we'll pretend the callee does it for us.
4334 NumBytesForCalleeToPop = NumBytes;
4335 }
4336
4337 // Returns a flag for retval copy to use.
4338 if (!IsSibcall) {
4339 Chain = DAG.getCALLSEQ_END(Chain,
4340 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4341 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
4342 true),
4343 InFlag, dl);
4344 InFlag = Chain.getValue(1);
4345 }
4346
4347 // Handle result values, copying them out of physregs into vregs that we
4348 // return.
4349 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4350 InVals, RegMask);
4351}
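
The callee-pop bookkeeping just above collapses to a small decision. A compact standalone sketch of it, with the CallingConv/Subtarget/triple queries folded into plain bools (the helper name and flags are illustrative, and the 32-bit sret flag stands in for the non-MSVCRT, non-guaranteed-TCO struct-return case):

#include <cstdio>

// How many of the pushed argument bytes the *callee* is expected to remove
// when the CALLSEQ_END node is built.
static unsigned calleePopBytes(unsigned NumBytes, bool CalleePopsEverything,
                               bool IsStackStructReturn32, bool NoReturn) {
  if (CalleePopsEverything || NoReturn)
    return NumBytes;        // callee pops everything (or we pretend it does
                            // so the MI verifier stays happy for noreturn)
  if (IsStackStructReturn32)
    return 4;               // 32-bit sret: callee pops the hidden pointer
  return 0;                 // caller cleans up
}

int main() {
  std::printf("%u\n", calleePopBytes(24, false, true, false));  // prints 4
  std::printf("%u\n", calleePopBytes(24, false, false, false)); // prints 0
  return 0;
}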
4352
4353//===----------------------------------------------------------------------===//
4354// Fast Calling Convention (tail call) implementation
4355//===----------------------------------------------------------------------===//
4356
4357// Like stdcall, the callee cleans up the arguments, except that ECX is
4358// reserved for storing the tail-called function address. Only 2 registers are
4359// free for argument passing (inreg). Tail call optimization is performed
4360// provided:
4361// * tailcallopt is enabled
4362// * caller/callee are fastcc
4363// On X86_64 architecture with GOT-style position independent code only local
4364// (within module) calls are supported at the moment.
4365// To keep the stack aligned according to the platform ABI, the function
4366// GetAlignedArgumentStackSize ensures the argument delta is always a multiple
4367// of the stack alignment. (Dynamic linkers need this - Darwin's dyld for example.)
4368// If a tail-called callee has more arguments than the caller, the caller
4369// needs to make sure that there is room to move the RETADDR to. This is
4370// achieved by reserving an area the size of the argument delta right after the
4371// original RETADDR, but before the saved framepointer or the spilled registers
4372// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
4373// stack layout:
4374// arg1
4375// arg2
4376// RETADDR
4377// [ new RETADDR
4378// move area ]
4379// (possible EBP)
4380// ESI
4381// EDI
4382// local1 ..
4383
4384/// Align the stack size, e.g. to 16n + 12 for a 16-byte alignment
4385/// requirement.
4386unsigned
4387X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
4388 SelectionDAG &DAG) const {
4389 const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment());
4390 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
4391  assert(StackSize % SlotSize == 0 &&
4392         "StackSize must be a multiple of SlotSize");
4393 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
4394}
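
A worked example of the alignment formula above, written as standalone C++ with a 16-byte stack alignment and the usual 32-/64-bit slot sizes (the helper names are illustrative):

#include <cassert>
#include <cstdint>
#include <cstdio>

static uint64_t alignToMultiple(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// Same formula as GetAlignedArgumentStackSize: reserve the return-address
// slot, round the total up to the stack alignment, then take the slot back.
static uint64_t alignedArgSize(uint64_t StackSize, uint64_t SlotSize,
                               uint64_t StackAlignment) {
  assert(StackSize % SlotSize == 0 && "StackSize must be a multiple of SlotSize");
  return alignToMultiple(StackSize + SlotSize, StackAlignment) - SlotSize;
}

int main() {
  // 32-bit: 4-byte slots, so results land on 16n + 12.
  std::printf("%llu\n", (unsigned long long)alignedArgSize(20, 4, 16)); // 28
  // 64-bit: 8-byte slots, so results land on 16n + 8.
  std::printf("%llu\n", (unsigned long long)alignedArgSize(40, 8, 16)); // 40
  return 0;
}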
4395
4396/// Return true if the given stack call argument is already available in the
4397/// same position (relatively) of the caller's incoming argument stack.
4398static
4399bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4400 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4401 const X86InstrInfo *TII, const CCValAssign &VA) {
4402 unsigned Bytes = Arg.getValueSizeInBits() / 8;
4403
4404 for (;;) {
4405 // Look through nodes that don't alter the bits of the incoming value.
4406 unsigned Op = Arg.getOpcode();
4407 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4408 Arg = Arg.getOperand(0);
4409 continue;
4410 }
4411 if (Op == ISD::TRUNCATE) {
4412 const SDValue &TruncInput = Arg.getOperand(0);
4413 if (TruncInput.getOpcode() == ISD::AssertZext &&
4414 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4415 Arg.getValueType()) {
4416 Arg = TruncInput.getOperand(0);
4417 continue;
4418 }
4419 }
4420 break;
4421 }
4422
4423  int FI = INT_MAX;
4424 if (Arg.getOpcode() == ISD::CopyFromReg) {
4425 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4426 if (!Register::isVirtualRegister(VR))
4427 return false;
4428 MachineInstr *Def = MRI->getVRegDef(VR);
4429 if (!Def)
4430 return false;
4431 if (!Flags.isByVal()) {
4432 if (!TII->isLoadFromStackSlot(*Def, FI))
4433 return false;
4434 } else {
4435 unsigned Opcode = Def->getOpcode();
4436 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4437 Opcode == X86::LEA64_32r) &&
4438 Def->getOperand(1).isFI()) {
4439 FI = Def->getOperand(1).getIndex();
4440 Bytes = Flags.getByValSize();
4441 } else
4442 return false;
4443 }
4444 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4445 if (Flags.isByVal())
4446 // ByVal argument is passed in as a pointer but it's now being
4447 // dereferenced. e.g.
4448 // define @foo(%struct.X* %A) {
4449 // tail call @bar(%struct.X* byval %A)
4450 // }
4451 return false;
4452 SDValue Ptr = Ld->getBasePtr();
4453 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4454 if (!FINode)
4455 return false;
4456 FI = FINode->getIndex();
4457 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4458 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4459 FI = FINode->getIndex();
4460 Bytes = Flags.getByValSize();
4461 } else
4462 return false;
4463
4464  assert(FI != INT_MAX);
4465 if (!MFI.isFixedObjectIndex(FI))
4466 return false;
4467
4468 if (Offset != MFI.getObjectOffset(FI))
4469 return false;
4470
4471 // If this is not byval, check that the argument stack object is immutable.
4472 // inalloca and argument copy elision can create mutable argument stack
4473 // objects. Byval objects can be mutated, but a byval call intends to pass the
4474 // mutated memory.
4475 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4476 return false;
4477
4478 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4479 // If the argument location is wider than the argument type, check that any
4480 // extension flags match.
4481 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4482 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4483 return false;
4484 }
4485 }
4486
4487 return Bytes == MFI.getObjectSize(FI);
4488}
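
A reduced sketch of the condition MatchingStackOffset ultimately enforces, with the MachineFrameInfo queries replaced by a plain struct; this is purely illustrative and skips the FI-discovery walk above:

#include <cstdio>

// Stand-in for the caller's fixed (incoming-argument) stack object.
struct FixedStackObject {
  long long Offset;  // MFI.getObjectOffset(FI)
  unsigned Size;     // MFI.getObjectSize(FI)
  bool Immutable;    // MFI.isImmutableObjectIndex(FI)
};

// The outgoing argument can be left in place (no store needed for a sibcall)
// only if it already lives at the same offset, with the same size, in a slot
// the caller never writes to (byval slots are allowed to be mutable).
static bool matchesIncomingSlot(const FixedStackObject &Obj,
                                long long OutgoingOffset,
                                unsigned OutgoingBytes, bool IsByVal) {
  if (!IsByVal && !Obj.Immutable)
    return false;
  return OutgoingOffset == Obj.Offset && OutgoingBytes == Obj.Size;
}

int main() {
  FixedStackObject Arg0 = {/*Offset=*/0, /*Size=*/8, /*Immutable=*/true};
  std::printf("%d\n", matchesIncomingSlot(Arg0, 0, 8, false)); // 1: forwardable
  std::printf("%d\n", matchesIncomingSlot(Arg0, 8, 8, false)); // 0: different slot
  return 0;
}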
4489
4490/// Check whether the call is eligible for tail call optimization. Targets
4491/// that want to do tail call optimization should implement this function.
4492bool X86TargetLowering::IsEligibleForTailCallOptimization(
4493 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4494 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4495 const SmallVectorImpl<ISD::OutputArg> &Outs,
4496 const SmallVectorImpl<SDValue> &OutVals,
4497 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4498 if (!mayTailCallThisCC(CalleeCC))
4499 return false;
4500
4501 // If -tailcallopt is specified, make fastcc functions tail-callable.
4502 MachineFunction &MF = DAG.getMachineFunction();
4503 const Function &CallerF = MF.getFunction();
4504
4505 // If the function return type is x86_fp80 and the callee return type is not,
4506 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4507 // perform a tailcall optimization here.
4508 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4509 return false;
4510
4511 CallingConv::ID CallerCC = CallerF.getCallingConv();
4512 bool CCMatch = CallerCC == CalleeCC;
4513 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4514 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4515 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
4516 CalleeCC == CallingConv::Tail;
4517
4518 // Win64 functions have extra shadow space for argument homing. Don't do the
4519 // sibcall if the caller and callee have mismatched expectations for this
4520 // space.
4521 if (IsCalleeWin64 != IsCallerWin64)
4522 return false;
4523
4524 if (IsGuaranteeTCO) {
4525 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4526 return true;
4527 return false;
4528 }
4529
4530 // Look for obvious safe cases to perform tail call optimization that do not
4531 // require ABI changes. This is what gcc calls sibcall.
4532
4533 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4534 // emit a special epilogue.
4535 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4536 if (RegInfo->needsStackRealignment(MF))
4537 return false;
4538
4539 // Also avoid sibcall optimization if either caller or callee uses struct
4540 // return semantics.
4541 if (isCalleeStructRet || isCallerStructRet)
4542 return false;
4543
4544 // Do not sibcall optimize vararg calls unless all arguments are passed via
4545 // registers.
4546 LLVMContext &C = *DAG.getContext();
4547 if (isVarArg && !Outs.empty()) {
4548 // Optimizing for varargs on Win64 is unlikely to be safe without
4549 // additional testing.
4550 if (IsCalleeWin64 || IsCallerWin64)
4551 return false;
4552
4553 SmallVector<CCValAssign, 16> ArgLocs;
4554 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4555
4556 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4557 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4558 if (!ArgLocs[i].isRegLoc())
4559 return false;
4560 }
4561
4562 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4563 // stack. Therefore, if it's not used by the call it is not safe to optimize
4564 // this into a sibcall.
4565 bool Unused = false;
4566 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4567 if (!Ins[i].Used) {
4568 Unused = true;
4569 break;
4570 }
4571 }
4572 if (Unused) {
4573 SmallVector<CCValAssign, 16> RVLocs;
4574 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4575 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4576 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4577 CCValAssign &VA = RVLocs[i];
4578 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4579 return false;
4580 }
4581 }
4582
4583 // Check that the call results are passed in the same way.
4584 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4585 RetCC_X86, RetCC_X86))
4586 return false;
4587 // The callee has to preserve all registers the caller needs to preserve.
4588 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4589 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4590 if (!CCMatch) {
4591 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4592 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4593 return false;
4594 }
4595
4596 unsigned StackArgsSize = 0;
4597
4598 // If the callee takes no arguments then go on to check the results of the
4599 // call.
4600 if (!Outs.empty()) {
4601 // Check if stack adjustment is needed. For now, do not do this if any
4602 // argument is passed on the stack.
4603 SmallVector<CCValAssign, 16> ArgLocs;
4604 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4605
4606 // Allocate shadow area for Win64
4607 if (IsCalleeWin64)
4608 CCInfo.AllocateStack(32, 8);
4609
4610 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4611 StackArgsSize = CCInfo.getNextStackOffset();
4612
4613 if (CCInfo.getNextStackOffset()) {
4614 // Check if the arguments are already laid out in the right way as
4615 // the caller's fixed stack objects.
4616 MachineFrameInfo &MFI = MF.getFrameInfo();
4617 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4618 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4619 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4620 CCValAssign &VA = ArgLocs[i];
4621 SDValue Arg = OutVals[i];
4622 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4623 if (VA.getLocInfo() == CCValAssign::Indirect)
4624 return false;
4625 if (!VA.isRegLoc()) {
4626 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4627 MFI, MRI, TII, VA))
4628 return false;
4629 }
4630 }
4631 }
4632
4633 bool PositionIndependent = isPositionIndependent();
4634 // If the tailcall address may be in a register, then make sure it's
4635 // possible to register allocate for it. In 32-bit, the call address can
4636 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4637 // callee-saved registers are restored. These happen to be the same
4638 // registers used to pass 'inreg' arguments so watch out for those.
4639 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4640 !isa<ExternalSymbolSDNode>(Callee)) ||
4641 PositionIndependent)) {
4642 unsigned NumInRegs = 0;
4643 // In PIC we need an extra register to formulate the address computation
4644 // for the callee.
4645 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4646
4647 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4648 CCValAssign &VA = ArgLocs[i];
4649 if (!VA.isRegLoc())
4650 continue;
4651 Register Reg = VA.getLocReg();
4652 switch (Reg) {
4653 default: break;
4654 case X86::EAX: case X86::EDX: case X86::ECX:
4655 if (++NumInRegs == MaxInRegs)
4656 return false;
4657 break;
4658 }
4659 }
4660 }
4661
4662 const MachineRegisterInfo &MRI = MF.getRegInfo();
4663 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4664 return false;
4665 }
4666
4667 bool CalleeWillPop =
4668 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4669 MF.getTarget().Options.GuaranteedTailCallOpt);
4670
4671 if (unsigned BytesToPop =
4672 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4673 // If we have bytes to pop, the callee must pop them.
4674 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4675 if (!CalleePopMatches)
4676 return false;
4677 } else if (CalleeWillPop && StackArgsSize > 0) {
4678 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4679 return false;
4680 }
4681
4682 return true;
4683}
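
The final caller/callee stack-cleanup compatibility rule checked above, restated as a tiny standalone predicate (names are illustrative):

#include <cstdio>

// A tail call can reuse the caller's return sequence only if both functions
// agree on who pops the stack arguments, and on how many bytes that is.
static bool popAmountsCompatible(unsigned CallerBytesToPop, bool CalleeWillPop,
                                 unsigned CalleeStackArgsSize) {
  if (CallerBytesToPop != 0)
    return CalleeWillPop && CalleeStackArgsSize == CallerBytesToPop;
  return !(CalleeWillPop && CalleeStackArgsSize > 0);
}

int main() {
  std::printf("%d\n", popAmountsCompatible(12, true, 12)); // 1: amounts match
  std::printf("%d\n", popAmountsCompatible(0, true, 8));   // 0: callee would pop
  return 0;
}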
4684
4685FastISel *
4686X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4687 const TargetLibraryInfo *libInfo) const {
4688 return X86::createFastISel(funcInfo, libInfo);
4689}
4690
4691//===----------------------------------------------------------------------===//
4692// Other Lowering Hooks
4693//===----------------------------------------------------------------------===//
4694
4695static bool MayFoldLoad(SDValue Op) {
4696 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4697}
4698
4699static bool MayFoldIntoStore(SDValue Op) {
4700 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4701}
4702
4703static bool MayFoldIntoZeroExtend(SDValue Op) {
4704 if (Op.hasOneUse()) {
4705 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4706 return (ISD::ZERO_EXTEND == Opcode);
4707 }
4708 return false;
4709}
4710
4711static bool isTargetShuffle(unsigned Opcode) {
4712 switch(Opcode) {
4713 default: return false;
4714 case X86ISD::BLENDI:
4715 case X86ISD::PSHUFB:
4716 case X86ISD::PSHUFD:
4717 case X86ISD::PSHUFHW:
4718 case X86ISD::PSHUFLW:
4719 case X86ISD::SHUFP:
4720 case X86ISD::INSERTPS:
4721 case X86ISD::EXTRQI:
4722 case X86ISD::INSERTQI:
4723 case X86ISD::PALIGNR:
4724 case X86ISD::VSHLDQ:
4725 case X86ISD::VSRLDQ:
4726 case X86ISD::MOVLHPS:
4727 case X86ISD::MOVHLPS:
4728 case X86ISD::MOVSHDUP:
4729 case X86ISD::MOVSLDUP:
4730 case X86ISD::MOVDDUP:
4731 case X86ISD::MOVSS:
4732 case X86ISD::MOVSD:
4733 case X86ISD::UNPCKL:
4734 case X86ISD::UNPCKH:
4735 case X86ISD::VBROADCAST:
4736 case X86ISD::VPERMILPI:
4737 case X86ISD::VPERMILPV:
4738 case X86ISD::VPERM2X128:
4739 case X86ISD::SHUF128:
4740 case X86ISD::VPERMIL2:
4741 case X86ISD::VPERMI:
4742 case X86ISD::VPPERM:
4743 case X86ISD::VPERMV:
4744 case X86ISD::VPERMV3:
4745 case X86ISD::VZEXT_MOVL:
4746 return true;
4747 }
4748}
4749
4750static bool isTargetShuffleVariableMask(unsigned Opcode) {
4751 switch (Opcode) {
4752 default: return false;
4753 // Target Shuffles.
4754 case X86ISD::PSHUFB:
4755 case X86ISD::VPERMILPV:
4756 case X86ISD::VPERMIL2:
4757 case X86ISD::VPPERM:
4758 case X86ISD::VPERMV:
4759 case X86ISD::VPERMV3:
4760 return true;
4761 // 'Faux' Target Shuffles.
4762 case ISD::OR:
4763 case ISD::AND:
4764 case X86ISD::ANDNP:
4765 return true;
4766 }
4767}
4768
4769SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4770 MachineFunction &MF = DAG.getMachineFunction();
4771 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4772 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4773 int ReturnAddrIndex = FuncInfo->getRAIndex();
4774
4775 if (ReturnAddrIndex == 0) {
4776 // Set up a frame object for the return address.
4777 unsigned SlotSize = RegInfo->getSlotSize();
4778 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4779 -(int64_t)SlotSize,
4780 false);
4781 FuncInfo->setRAIndex(ReturnAddrIndex);
4782 }
4783
4784 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4785}
4786
4787bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4788 bool hasSymbolicDisplacement) {
4789 // Offset should fit into 32 bit immediate field.
4790 if (!isInt<32>(Offset))
4791 return false;
4792
4793 // If we don't have a symbolic displacement, there are no extra
4794 // restrictions.
4795 if (!hasSymbolicDisplacement)
4796 return true;
4797
4798 // FIXME: Some tweaks might be needed for medium code model.
4799 if (M != CodeModel::Small && M != CodeModel::Kernel)
4800 return false;
4801
4802 // For the small code model we assume the highest object ends at least 16MB
4803 // before the 31-bit address boundary. We can also accept fairly large negative
4804 // constants, knowing that all objects lie in the positive half of the address space.
4805 if (M == CodeModel::Small && Offset < 16*1024*1024)
4806 return true;
4807
4808 // For the kernel code model we know that all objects reside in the negative
4809 // half of the 32-bit address space. We must not accept negative offsets, since
4810 // they could fall just outside that region, but fairly large positive ones are fine.
4811 if (M == CodeModel::Kernel && Offset >= 0)
4812 return true;
4813
4814 return false;
4815}
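// Example (illustrative): with no symbolic displacement any offset that fits a
// signed 32-bit immediate is accepted; with a symbolic displacement under the
// small code model, an 8MB offset is accepted but a 32MB offset is not.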
4816
4817/// Determines whether the callee is required to pop its own arguments.
4818/// Callee pop is necessary to support tail calls.
4819bool X86::isCalleePop(CallingConv::ID CallingConv,
4820 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4821 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4822 // can guarantee TCO.
4823 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4824 return true;
4825
4826 switch (CallingConv) {
4827 default:
4828 return false;
4829 case CallingConv::X86_StdCall:
4830 case CallingConv::X86_FastCall:
4831 case CallingConv::X86_ThisCall:
4832 case CallingConv::X86_VectorCall:
4833 return !is64Bit;
4834 }
4835}
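// Example (illustrative): a non-vararg X86_StdCall function on a 32-bit target
// is callee-pop (it returns with 'ret N'), whereas the same convention on a
// 64-bit target is not.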
4836
4837 /// Return true if the condition is a signed comparison operation.
4838static bool isX86CCSigned(unsigned X86CC) {
4839 switch (X86CC) {
4840 default:
4841 llvm_unreachable("Invalid integer condition!");
4842 case X86::COND_E:
4843 case X86::COND_NE:
4844 case X86::COND_B:
4845 case X86::COND_A:
4846 case X86::COND_BE:
4847 case X86::COND_AE:
4848 return false;
4849 case X86::COND_G:
4850 case X86::COND_GE:
4851 case X86::COND_L:
4852 case X86::COND_LE:
4853 return true;
4854 }
4855}
4856
4857static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4858 switch (SetCCOpcode) {
4859 default: llvm_unreachable("Invalid integer condition!");
4860 case ISD::SETEQ: return X86::COND_E;
4861 case ISD::SETGT: return X86::COND_G;
4862 case ISD::SETGE: return X86::COND_GE;
4863 case ISD::SETLT: return X86::COND_L;
4864 case ISD::SETLE: return X86::COND_LE;
4865 case ISD::SETNE: return X86::COND_NE;
4866 case ISD::SETULT: return X86::COND_B;
4867 case ISD::SETUGT: return X86::COND_A;
4868 case ISD::SETULE: return X86::COND_BE;
4869 case ISD::SETUGE: return X86::COND_AE;
4870 }
4871}
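// Example (illustrative): the signed compare ISD::SETLT maps to X86::COND_L,
// while its unsigned counterpart ISD::SETULT maps to X86::COND_B ('below').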
4872
4873 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
4874/// condition code, returning the condition code and the LHS/RHS of the
4875/// comparison to make.
4876static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4877 bool isFP, SDValue &LHS, SDValue &RHS,
4878 SelectionDAG &DAG) {
4879 if (!isFP) {
4880 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4881 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4882 // X > -1 -> X == 0, jump !sign.
4883 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4884 return X86::COND_NS;
4885 }
4886 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4887 // X < 0 -> X == 0, jump on sign.
4888 return X86::COND_S;
4889 }
4890 if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
4891 // X >= 0 -> X == 0, jump on !sign.
4892 return X86::COND_NS;
4893 }
4894 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
4895 // X < 1 -> X <= 0
4896 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4897 return X86::COND_LE;
4898 }
4899 }
4900
4901 return TranslateIntegerX86CC(SetCCOpcode);
4902 }
4903
4904 // First determine whether it is required or profitable to flip the operands.
4905
4906 // If LHS is a foldable load, but RHS is not, flip the condition.
4907 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4908 !ISD::isNON_EXTLoad(RHS.getNode())) {
4909 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4910 std::swap(LHS, RHS);
4911 }
4912
4913 switch (SetCCOpcode) {
4914 default: break;
4915 case ISD::SETOLT:
4916 case ISD::SETOLE:
4917 case ISD::SETUGT:
4918 case ISD::SETUGE:
4919 std::swap(LHS, RHS);
4920 break;
4921 }
4922
4923 // On a floating point condition, the flags are set as follows:
4924 // ZF PF CF op
4925 // 0 | 0 | 0 | X > Y
4926 // 0 | 0 | 1 | X < Y
4927 // 1 | 0 | 0 | X == Y
4928 // 1 | 1 | 1 | unordered
4929 switch (SetCCOpcode) {
4930 default: llvm_unreachable("Condcode should be pre-legalized away");
4931 case ISD::SETUEQ:
4932 case ISD::SETEQ: return X86::COND_E;
4933 case ISD::SETOLT: // flipped
4934 case ISD::SETOGT:
4935 case ISD::SETGT: return X86::COND_A;
4936 case ISD::SETOLE: // flipped
4937 case ISD::SETOGE:
4938 case ISD::SETGE: return X86::COND_AE;
4939 case ISD::SETUGT: // flipped
4940 case ISD::SETULT:
4941 case ISD::SETLT: return X86::COND_B;
4942 case ISD::SETUGE: // flipped
4943 case ISD::SETULE:
4944 case ISD::SETLE: return X86::COND_BE;
4945 case ISD::SETONE:
4946 case ISD::SETNE: return X86::COND_NE;
4947 case ISD::SETUO: return X86::COND_P;
4948 case ISD::SETO: return X86::COND_NP;
4949 case ISD::SETOEQ:
4950 case ISD::SETUNE: return X86::COND_INVALID;
4951 }
4952}
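// Example (illustrative): for an ordered FP compare 'x setolt y' the operands
// are swapped and COND_A ('above') is used on the flags of cmp(y, x); since an
// unordered result sets ZF=PF=CF=1, COND_A is correctly false in that case.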
4953
4954/// Is there a floating point cmov for the specific X86 condition code?
4955 /// The current x86 ISA includes the following FP cmov instructions:
4956 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4957static bool hasFPCMov(unsigned X86CC) {
4958 switch (X86CC) {
4959 default:
4960 return false;
4961 case X86::COND_B:
4962 case X86::COND_BE:
4963 case X86::COND_E:
4964 case X86::COND_P:
4965 case X86::COND_A:
4966 case X86::COND_AE:
4967 case X86::COND_NE:
4968 case X86::COND_NP:
4969 return true;
4970 }
4971}
4972
4973
4974bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4975 const CallInst &I,
4976 MachineFunction &MF,
4977 unsigned Intrinsic) const {
4978
4979 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4980 if (!IntrData)
4981 return false;
4982
4983 Info.flags = MachineMemOperand::MONone;
4984 Info.offset = 0;
4985
4986 switch (IntrData->Type) {
4987 case TRUNCATE_TO_MEM_VI8:
4988 case TRUNCATE_TO_MEM_VI16:
4989 case TRUNCATE_TO_MEM_VI32: {
4990 Info.opc = ISD::INTRINSIC_VOID;
4991 Info.ptrVal = I.getArgOperand(0);
4992 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4993 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4994 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4995 ScalarVT = MVT::i8;
4996 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4997 ScalarVT = MVT::i16;
4998 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4999 ScalarVT = MVT::i32;
5000
5001 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5002 Info.align = Align(1);
5003 Info.flags |= MachineMemOperand::MOStore;
5004 break;
5005 }
5006 case GATHER:
5007 case GATHER_AVX2: {
5008 Info.opc = ISD::INTRINSIC_W_CHAIN;
5009 Info.ptrVal = nullptr;
5010 MVT DataVT = MVT::getVT(I.getType());
5011 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5012 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5013 IndexVT.getVectorNumElements());
5014 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5015 Info.align = Align(1);
5016 Info.flags |= MachineMemOperand::MOLoad;
5017 break;
5018 }
5019 case SCATTER: {
5020 Info.opc = ISD::INTRINSIC_VOID;
5021 Info.ptrVal = nullptr;
5022 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5023 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5024 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5025 IndexVT.getVectorNumElements());
5026 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5027 Info.align = Align(1);
5028 Info.flags |= MachineMemOperand::MOStore;
5029 break;
5030 }
5031 default:
5032 return false;
5033 }
5034
5035 return true;
5036}
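// Example (illustrative): for a gather intrinsic that returns <8 x float> with
// a <4 x i32> index operand, memVT becomes v4f32 (the smaller of the data and
// index element counts), the access is MOLoad and the alignment is 1 byte.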
5037
5038/// Returns true if the target can instruction select the
5039/// specified FP immediate natively. If false, the legalizer will
5040/// materialize the FP immediate as a load from a constant pool.
5041bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5042 bool ForCodeSize) const {
5043 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
5044 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
5045 return true;
5046 }
5047 return false;
5048}
5049
5050bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5051 ISD::LoadExtType ExtTy,
5052 EVT NewVT) const {
5053 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5054
5055 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5056 // relocation target a movq or addq instruction: don't let the load shrink.
5057 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5058 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5059 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5060 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5061
5062 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5063 // those uses are extracted directly into a store, then the extract + store
5064 // can be store-folded. Therefore, it's probably not worth splitting the load.
5065 EVT VT = Load->getValueType(0);
5066 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5067 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5068 // Skip uses of the chain value. Result 0 of the node is the load value.
5069 if (UI.getUse().getResNo() != 0)
5070 continue;
5071
5072 // If this use is not an extract + store, it's probably worth splitting.
5073 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5074 UI->use_begin()->getOpcode() != ISD::STORE)
5075 return true;
5076 }
5077 // All non-chain uses are extract + store.
5078 return false;
5079 }
5080
5081 return true;
5082}
5083
5084/// Returns true if it is beneficial to convert a load of a constant
5085/// to just the constant itself.
5086bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5087 Type *Ty) const {
5088 assert(Ty->isIntegerTy());
5089
5090 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5091 if (BitSize == 0 || BitSize > 64)
5092 return false;
5093 return true;
5094}
5095
5096bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5097 // If we are using XMM registers in the ABI and the condition of the select is
5098 // a floating-point compare and we have blendv or conditional move, then it is
5099 // cheaper to select instead of doing a cross-register move and creating a
5100 // load that depends on the compare result.
5101 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5102 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5103}
5104
5105bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5106 // TODO: It might be a win to ease or lift this restriction, but the generic
5107 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5108 if (VT.isVector() && Subtarget.hasAVX512())
5109 return false;
5110
5111 return true;
5112}
5113
5114bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5115 SDValue C) const {
5116 // TODO: We handle scalars using custom code, but generic combining could make
5117 // that unnecessary.
5118 APInt MulC;
5119 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5120 return false;
5121
5122 // Find the type this will be legalized to. Otherwise we might prematurely
5123 // convert this to shl+add/sub and then still have to type-legalize those ops.
5124 // Another choice would be to defer the decision for illegal types until
5125 // after type legalization. But constant splat vectors of i64 can't make it
5126 // through type legalization on 32-bit targets so we would need to special
5127 // case vXi64.
5128 while (getTypeAction(Context, VT) != TypeLegal)
5129 VT = getTypeToTransformTo(Context, VT);
5130
5131 // If vector multiply is legal, assume that's faster than shl + add/sub.
5132 // TODO: Multiply is a complex op with higher latency and lower throughput in
5133 // most implementations, so this check could be loosened based on type
5134 // and/or a CPU attribute.
5135 if (isOperationLegal(ISD::MUL, VT))
5136 return false;
5137
5138 // shl+add, shl+sub, shl+add+neg
5139 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5140 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5141}
5142
5143bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5144 unsigned Index) const {
5145 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5146 return false;
5147
5148 // Mask vectors support all subregister combinations and operations that
5149 // extract half of vector.
5150 if (ResVT.getVectorElementType() == MVT::i1)
5151 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5152 (Index == ResVT.getVectorNumElements()));
5153
5154 return (Index % ResVT.getVectorNumElements()) == 0;
5155}
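// Example (illustrative): assuming EXTRACT_SUBVECTOR is legal or custom for
// v4f32, extracting a v4f32 from a v8f32 at index 4 is cheap (the index is a
// multiple of the result's element count), while an extract at index 2 is not.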
5156
5157bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5158 unsigned Opc = VecOp.getOpcode();
5159
5160 // Assume target opcodes can't be scalarized.
5161 // TODO - do we have any exceptions?
5162 if (Opc >= ISD::BUILTIN_OP_END)
5163 return false;
5164
5165 // If the vector op is not supported, try to convert to scalar.
5166 EVT VecVT = VecOp.getValueType();
5167 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5168 return true;
5169
5170 // If the vector op is supported, but the scalar op is not, the transform may
5171 // not be worthwhile.
5172 EVT ScalarVT = VecVT.getScalarType();
5173 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5174}
5175
5176bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5177 bool) const {
5178 // TODO: Allow vectors?
5179 if (VT.isVector())
5180 return false;
5181 return VT.isSimple() || !isOperationExpand(Opcode, VT);
5182}
5183
5184bool X86TargetLowering::isCheapToSpeculateCttz() const {
5185 // Speculate cttz only if we can directly use TZCNT.
5186 return Subtarget.hasBMI();
5187}
5188
5189bool X86TargetLowering::isCheapToSpeculateCtlz() const {
5190 // Speculate ctlz only if we can directly use LZCNT.
5191 return Subtarget.hasLZCNT();
5192}
5193
5194bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5195 const SelectionDAG &DAG,
5196 const MachineMemOperand &MMO) const {
5197 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5198 BitcastVT.getVectorElementType() == MVT::i1)
5199 return false;
5200
5201 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5202 return false;
5203
5204 // If both types are legal vectors, it's always ok to convert them.
5205 if (LoadVT.isVector() && BitcastVT.isVector() &&
5206 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5207 return true;
5208
5209 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5210}
5211
5212bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5213 const SelectionDAG &DAG) const {
5214 // Do not merge to float value size (128 bits) if no implicit
5215 // float attribute is set.
5216 bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
5217 Attribute::NoImplicitFloat);
5218
5219 if (NoFloat) {
5220 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
5221 return (MemVT.getSizeInBits() <= MaxIntSize);
5222 }
5223 // Make sure we don't merge greater than our preferred vector
5224 // width.
5225 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
5226 return false;
5227 return true;
5228}
5229
5230bool X86TargetLowering::isCtlzFast() const {
5231 return Subtarget.hasFastLZCNT();
5232}
5233
5234bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
5235 const Instruction &AndI) const {
5236 return true;
5237}
5238
5239bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
5240 EVT VT = Y.getValueType();
5241
5242 if (VT.isVector())
5243 return false;
5244
5245 if (!Subtarget.hasBMI())
5246 return false;
5247
5248 // There are only 32-bit and 64-bit forms for 'andn'.
5249 if (VT != MVT::i32 && VT != MVT::i64)
5250 return false;
5251
5252 return !isa<ConstantSDNode>(Y);
5253}
5254
5255bool X86TargetLowering::hasAndNot(SDValue Y) const {
5256 EVT VT = Y.getValueType();
5257
5258 if (!VT.isVector())
5259 return hasAndNotCompare(Y);
5260
5261 // Vector.
5262
5263 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
5264 return false;
5265
5266 if (VT == MVT::v4i32)
5267 return true;
5268
5269 return Subtarget.hasSSE2();
5270}
5271
5272bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
5273 return X.getValueType().isScalarInteger(); // 'bt'
5274}
5275
5276bool X86TargetLowering::
5277 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5278 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
5279 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
5280 SelectionDAG &DAG) const {
5281 // Does the baseline recommend not performing the fold by default?
5282 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5283 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
5284 return false;
5285 // For scalars this transform is always beneficial.
5286 if (X.getValueType().isScalarInteger())
5287 return true;
5288 // If all the shift amounts are identical, then the transform is beneficial
5289 // even with rudimentary SSE2 shifts.
5290 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
5291 return true;
5292 // If we have AVX2 with its powerful shift operations, then it's also good.
5293 if (Subtarget.hasAVX2())
5294 return true;
5295 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
5296 return NewShiftOpcode == ISD::SHL;
5297}
5298
5299bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
5300 const SDNode *N, CombineLevel Level) const {
5301 assert(((N->getOpcode() == ISD::SHL &&
5302 N->getOperand(0).getOpcode() == ISD::SRL) ||
5303 (N->getOpcode() == ISD::SRL &&
5304 N->getOperand(0).getOpcode() == ISD::SHL)) &&
5305 "Expected shift-shift mask");
5306 EVT VT = N->getValueType(0);
5307 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
5308 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
5309 // Only fold if the shift values are equal - so it folds to AND.
5310 // TODO - we should fold if either is a non-uniform vector but we don't do
5311 // the fold for non-splats yet.
5312 return N->getOperand(1) == N->getOperand(0).getOperand(1);
5313 }
5314 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
5315}
5316
5317bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
5318 EVT VT = Y.getValueType();
5319
5320 // For vectors, we don't have a preference, but we probably want a mask.
5321 if (VT.isVector())
5322 return false;
5323
5324 // 64-bit shifts on 32-bit targets produce really bad bloated code.
5325 if (VT == MVT::i64 && !Subtarget.is64Bit())
5326 return false;
5327
5328 return true;
5329}
5330
5331bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
5332 SDNode *N) const {
5333 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
5334 !Subtarget.isOSWindows())
5335 return false;
5336 return true;
5337}
5338
5339bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
5340 // Any legal vector type can be splatted more efficiently than
5341 // loading/spilling from memory.
5342 return isTypeLegal(VT);
5343}
5344
5345MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
5346 MVT VT = MVT::getIntegerVT(NumBits);
5347 if (isTypeLegal(VT))
5348 return VT;
5349
5350 // PMOVMSKB can handle this.
5351 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
5352 return MVT::v16i8;
5353
5354 // VPMOVMSKB can handle this.
5355 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
5356 return MVT::v32i8;
5357
5358 // TODO: Allow 64-bit type for 32-bit target.
5359 // TODO: 512-bit types should be allowed, but make sure that those
5360 // cases are handled in combineVectorSizedSetCCEquality().
5361
5362 return MVT::INVALID_SIMPLE_VALUE_TYPE;
5363}
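// Example (illustrative): a 256-bit equality test returns MVT::v32i8 when that
// type is legal (AVX2), so the compare can be done with VPCMPEQB + VPMOVMSKB
// instead of a chain of scalar compares.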
5364
5365/// Val is the undef sentinel value or equal to the specified value.
5366static bool isUndefOrEqual(int Val, int CmpVal) {
5367 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
5368}
5369
5370/// Val is either the undef or zero sentinel value.
5371static bool isUndefOrZero(int Val) {
5372 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
5373}
5374
5375/// Return true if every element in Mask, beginning from position Pos and ending
5376/// in Pos+Size is the undef sentinel value.
5377static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
5378 return llvm::all_of(Mask.slice(Pos, Size),
5379 [](int M) { return M == SM_SentinelUndef; });
5380}
5381
5382/// Return true if the mask creates a vector whose lower half is undefined.
5383static bool isUndefLowerHalf(ArrayRef<int> Mask) {
5384 unsigned NumElts = Mask.size();
5385 return isUndefInRange(Mask, 0, NumElts / 2);
5386}
5387
5388/// Return true if the mask creates a vector whose upper half is undefined.
5389static bool isUndefUpperHalf(ArrayRef<int> Mask) {
5390 unsigned NumElts = Mask.size();
5391 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
5392}
5393
5394 /// Return true if Val falls within the specified range [Low, Hi).
5395static bool isInRange(int Val, int Low, int Hi) {
5396 return (Val >= Low && Val < Hi);
5397}
5398
5399/// Return true if the value of any element in Mask falls within the specified
5400 /// range [Low, Hi).
5401static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
5402 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
5403}
5404
5405/// Return true if Val is undef or if its value falls within the
5406 /// specified range [Low, Hi).
5407static bool isUndefOrInRange(int Val, int Low, int Hi) {
5408 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
5409}
5410
5411/// Return true if every element in Mask is undef or if its value
5412 /// falls within the specified range [Low, Hi).
5413static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5414 return llvm::all_of(
5415 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
5416}
5417
5418/// Return true if Val is undef, zero or if its value falls within the
5419 /// specified range [Low, Hi).
5420static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
5421 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
5422}
5423
5424/// Return true if every element in Mask is undef, zero or if its value
5425 /// falls within the specified range [Low, Hi).
5426static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5427 return llvm::all_of(
5428 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
5429}
5430
5431/// Return true if every element in Mask, beginning
5432/// from position Pos and ending in Pos + Size, falls within the specified
5433/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
5434static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
5435 unsigned Size, int Low, int Step = 1) {
5436 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5437 if (!isUndefOrEqual(Mask[i], Low))
5438 return false;
5439 return true;
5440}
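// Example (illustrative): isSequentialOrUndefInRange({4, -1, 6, 7}, 0, 4, 4)
// returns true, since each defined element matches the sequence 4, 5, 6, 7.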
5441
5442/// Return true if every element in Mask, beginning
5443/// from position Pos and ending in Pos+Size, falls within the specified
5444 /// sequential range [Low, Low+Size), or is undef or is zero.
5445static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5446 unsigned Size, int Low,
5447 int Step = 1) {
5448 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5449 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
5450 return false;
5451 return true;
5452}
5453
5454/// Return true if every element in Mask, beginning
5455/// from position Pos and ending in Pos+Size is undef or is zero.
5456static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5457 unsigned Size) {
5458 return llvm::all_of(Mask.slice(Pos, Size),
5459 [](int M) { return isUndefOrZero(M); });
5460}
5461
5462/// Helper function to test whether a shuffle mask could be
5463/// simplified by widening the elements being shuffled.
5464///
5465/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
5466/// leaves it in an unspecified state.
5467///
5468/// NOTE: This must handle normal vector shuffle masks and *target* vector
5469/// shuffle masks. The latter have the special property of a '-2' representing
5470/// a zero-ed lane of a vector.
5471static bool canWidenShuffleElements(ArrayRef<int> Mask,
5472 SmallVectorImpl<int> &WidenedMask) {
5473 WidenedMask.assign(Mask.size() / 2, 0);
5474 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
5475 int M0 = Mask[i];
5476 int M1 = Mask[i + 1];
5477
5478 // If both elements are undef, it's trivial.
5479 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
5480 WidenedMask[i / 2] = SM_SentinelUndef;
5481 continue;
5482 }
5483
5484 // Check for an undef mask and a mask value properly aligned to fit with
5485 // a pair of values. If we find such a case, use the non-undef mask's value.
5486 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
5487 WidenedMask[i / 2] = M1 / 2;
5488 continue;
5489 }
5490 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
5491 WidenedMask[i / 2] = M0 / 2;
5492 continue;
5493 }
5494
5495 // When zeroing, we need to spread the zeroing across both lanes to widen.
5496 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
5497 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
5498 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
5499 WidenedMask[i / 2] = SM_SentinelZero;
5500 continue;
5501 }
5502 return false;
5503 }
5504
5505 // Finally check if the two mask values are adjacent and aligned with
5506 // a pair.
5507 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
5508 WidenedMask[i / 2] = M0 / 2;
5509 continue;
5510 }
5511
5512 // Otherwise we can't safely widen the elements used in this shuffle.
5513 return false;
5514 }
5515 assert(WidenedMask.size() == Mask.size() / 2 &&
5516 "Incorrect size of mask after widening the elements!");
5517
5518 return true;
5519}
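// Example (illustrative): the mask {0, 1, 2, 3, -1, -1, 6, 7} widens to
// {0, 1, SM_SentinelUndef, 3}, whereas a mask starting {1, 2, ...} cannot be
// widened because its first pair is not an aligned adjacent pair.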
5520
5521static bool canWidenShuffleElements(ArrayRef<int> Mask,
5522 const APInt &Zeroable,
5523 bool V2IsZero,
5524 SmallVectorImpl<int> &WidenedMask) {
5525 // Create an alternative mask with info about zeroable elements.
5526 // Here we do not set undef elements as zeroable.
5527 SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
5528 if (V2IsZero) {
5529 assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
5530 for (int i = 0, Size = Mask.size(); i != Size; ++i)
5531 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
5532 ZeroableMask[i] = SM_SentinelZero;
5533 }
5534 return canWidenShuffleElements(ZeroableMask, WidenedMask);
5535}
5536
5537static bool canWidenShuffleElements(ArrayRef<int> Mask) {
5538 SmallVector<int, 32> WidenedMask;
5539 return canWidenShuffleElements(Mask, WidenedMask);
5540}
5541
5542/// Returns true if Elt is a constant zero or a floating point constant +0.0.
5543bool X86::isZeroNode(SDValue Elt) {
5544 return isNullConstant(Elt) || isNullFPConstant(Elt);
5545}
5546
5547// Build a vector of constants.
5548// Use an UNDEF node if MaskElt == -1.
5549 // Split 64-bit constants in 32-bit mode.
5550static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
5551 const SDLoc &dl, bool IsMask = false) {
5552
5553 SmallVector<SDValue, 32> Ops;
5554 bool Split = false;
5555
5556 MVT ConstVecVT = VT;
5557 unsigned NumElts = VT.getVectorNumElements();
5558 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5559 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5560 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5561 Split = true;
5562 }
5563
5564 MVT EltVT = ConstVecVT.getVectorElementType();
5565 for (unsigned i = 0; i < NumElts; ++i) {
5566 bool IsUndef = Values[i] < 0 && IsMask;
5567 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
5568 DAG.getConstant(Values[i], dl, EltVT);
5569 Ops.push_back(OpNode);
5570 if (Split)
5571 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
5572 DAG.getConstant(0, dl, EltVT));
5573 }
5574 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5575 if (Split)
5576 ConstsNode = DAG.getBitcast(VT, ConstsNode);
5577 return ConstsNode;
5578}
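// Example (illustrative): getConstVector({0, -1, 2, 3}, MVT::v4i32, DAG, dl,
// /*IsMask=*/true) builds <i32 0, undef, i32 2, i32 3>; on a 32-bit target a
// v2i64 request is emitted as a v4i32 build_vector and bitcast back to v2i64.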
5579
5580static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
5581 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5582 assert(Bits.size() == Undefs.getBitWidth() &&
5583 "Unequal constant and undef arrays");
5584 SmallVector<SDValue, 32> Ops;
5585 bool Split = false;
5586
5587 MVT ConstVecVT = VT;
5588 unsigned NumElts = VT.getVectorNumElements();
5589 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5590 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5591 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5592 Split = true;
5593 }
5594
5595 MVT EltVT = ConstVecVT.getVectorElementType();
5596 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
5597 if (Undefs[i]) {
5598 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
5599 continue;
5600 }
5601 const APInt &V = Bits[i];
5602 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
5603 if (Split) {
5604 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
5605 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
5606 } else if (EltVT == MVT::f32) {
5607 APFloat FV(APFloat::IEEEsingle(), V);
5608 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5609 } else if (EltVT == MVT::f64) {
5610 APFloat FV(APFloat::IEEEdouble(), V);
5611 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5612 } else {
5613 Ops.push_back(DAG.getConstant(V, dl, EltVT));
5614 }
5615 }
5616
5617 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5618 return DAG.getBitcast(VT, ConstsNode);
5619}
5620
5621/// Returns a vector of specified type with all zero elements.
5622static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
5623 SelectionDAG &DAG, const SDLoc &dl) {
5624 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
5625 VT.getVectorElementType() == MVT::i1) &&
5626 "Unexpected vector type");
5627
5628 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
5629 // type. This ensures they get CSE'd. But if the integer type is not
5630 // available, use a floating-point +0.0 instead.
5631 SDValue Vec;
5632 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
5633 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
5634 } else if (VT.isFloatingPoint()) {
5635 Vec = DAG.getConstantFP(+0.0, dl, VT);
5636 } else if (VT.getVectorElementType() == MVT::i1) {
5637 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
5638 "Unexpected vector type");
5639 Vec = DAG.getConstant(0, dl, VT);
5640 } else {
5641 unsigned Num32BitElts = VT.getSizeInBits() / 32;
5642 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
5643 }
5644 return DAG.getBitcast(VT, Vec);
5645}
5646
5647static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5648 const SDLoc &dl, unsigned vectorWidth) {
5649 EVT VT = Vec.getValueType();
5650 EVT ElVT = VT.getVectorElementType();
5651 unsigned Factor = VT.getSizeInBits()/vectorWidth;
5652 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5653 VT.getVectorNumElements()/Factor);
5654
5655 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
5656 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5657 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5658
5659 // This is the index of the first element of the vectorWidth-bit chunk
5660 // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
5661 IdxVal &= ~(ElemsPerChunk - 1);
5662
5663 // If the input is a buildvector just emit a smaller one.
5664 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5665 return DAG.getBuildVector(ResultVT, dl,
5666 Vec->ops().slice(IdxVal, ElemsPerChunk));
5667
5668 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5669 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
5670}
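// Example (illustrative): extracting 128 bits from a v8f32 with IdxVal = 5
// first rounds the index down to 4 (ElemsPerChunk = 4), so the upper v4f32
// half of the source vector is returned.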
5671
5672/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
5673/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5674/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5675/// instructions or a simple subregister reference. Idx is an index in the
5676/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5677/// lowering EXTRACT_VECTOR_ELT operations easier.
5678static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5679 SelectionDAG &DAG, const SDLoc &dl) {
5680 assert((Vec.getValueType().is256BitVector() ||
5681 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5682 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5683}
5684
5685/// Generate a DAG to grab 256-bits from a 512-bit vector.
5686static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5687 SelectionDAG &DAG, const SDLoc &dl) {
5688 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5689 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5690}
5691
5692static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5693 SelectionDAG &DAG, const SDLoc &dl,
5694 unsigned vectorWidth) {
5695 assert((vectorWidth == 128 || vectorWidth == 256) &&
5696 "Unsupported vector width");
5697 // Inserting UNDEF just returns Result.
5698 if (Vec.isUndef())
5699 return Result;
5700 EVT VT = Vec.getValueType();
5701 EVT ElVT = VT.getVectorElementType();
5702 EVT ResultVT = Result.getValueType();
5703
5704 // Insert the relevant vectorWidth bits.
5705 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5706 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5707
5708 // This is the index of the first element of the vectorWidth-bit chunk
5709 // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
5710 IdxVal &= ~(ElemsPerChunk - 1);
5711
5712 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5713 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5714}
5715
5716/// Generate a DAG to put 128-bits into a vector > 128 bits. This
5717/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5718/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5719/// simple superregister reference. Idx is an index in the 128 bits
5720/// we want. It need not be aligned to a 128-bit boundary. That makes
5721/// lowering INSERT_VECTOR_ELT operations easier.
5722static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5723 SelectionDAG &DAG, const SDLoc &dl) {
5724 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5725 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5726}
5727
5728/// Widen a vector to a larger size with the same scalar type, with the new
5729/// elements either zero or undef.
5730static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
5731 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5732 const SDLoc &dl) {
5733 assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
5734 Vec.getValueType().getScalarType() == VT.getScalarType() &&
5735 "Unsupported vector widening type");
5736 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
5737 : DAG.getUNDEF(VT);
5738 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
5739 DAG.getIntPtrConstant(0, dl));
5740}
5741
5742/// Widen a vector to a larger size with the same scalar type, with the new
5743/// elements either zero or undef.
5744static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
5745 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5746 const SDLoc &dl, unsigned WideSizeInBits) {
5747 assert(Vec.getValueSizeInBits() < WideSizeInBits &&
5748 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
5749 "Unsupported vector widening type");
5750 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
5751 MVT SVT = Vec.getSimpleValueType().getScalarType();
5752 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
5753 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
5754}
5755
5756// Helper function to collect subvector ops that are concatenated together,
5757 // either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
5758// The subvectors in Ops are guaranteed to be the same type.
5759static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
5760 assert(Ops.empty() && "Expected an empty ops vector");
5761
5762 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
5763 Ops.append(N->op_begin(), N->op_end());
5764 return true;
5765 }
5766
5767 if (N->getOpcode() == ISD::INSERT_SUBVECTOR &&
5768 isa<ConstantSDNode>(N->getOperand(2))) {
5769 SDValue Src = N->getOperand(0);
5770 SDValue Sub = N->getOperand(1);
5771 const APInt &Idx = N->getConstantOperandAPInt(2);
5772 EVT VT = Src.getValueType();
5773 EVT SubVT = Sub.getValueType();
5774
5775 // TODO - Handle more general insert_subvector chains.
5776 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
5777 Idx == (VT.getVectorNumElements() / 2) &&
5778 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
5779 Src.getOperand(1).getValueType() == SubVT &&
5780 isNullConstant(Src.getOperand(2))) {
5781 Ops.push_back(Src.getOperand(1));
5782 Ops.push_back(Sub);
5783 return true;
5784 }
5785 }
5786
5787 return false;
5788}
5789
5790 // Helper for splitting the operands of an operation to a legal target size
5791 // and applying a function on each part.
5792// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
5793// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
5794// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
5795// The argument Builder is a function that will be applied on each split part:
5796// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
5797template <typename F>
5798SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
5799 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
5800 F Builder, bool CheckBWI = true) {
5801 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
5802 unsigned NumSubs = 1;
5803 if ((CheckBWI && Subtarget.useBWIRegs()) ||
5804 (!CheckBWI && Subtarget.useAVX512Regs())) {
5805 if (VT.getSizeInBits() > 512) {
5806 NumSubs = VT.getSizeInBits() / 512;
5807 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
5808 }
5809 } else if (Subtarget.hasAVX2()) {
5810 if (VT.getSizeInBits() > 256) {
5811 NumSubs = VT.getSizeInBits() / 256;
5812 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
5813 }
5814 } else {
5815 if (VT.getSizeInBits() > 128) {
5816 NumSubs = VT.getSizeInBits() / 128;
5817 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
5818 }
5819 }
5820
5821 if (NumSubs == 1)
5822 return Builder(DAG, DL, Ops);
5823
5824 SmallVector<SDValue, 4> Subs;
5825 for (unsigned i = 0; i != NumSubs; ++i) {
5826 SmallVector<SDValue, 2> SubOps;
5827 for (SDValue Op : Ops) {
5828 EVT OpVT = Op.getValueType();
5829 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
5830 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
5831 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
5832 }
5833 Subs.push_back(Builder(DAG, DL, SubOps));
5834 }
5835 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
5836}
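A rough standalone sketch of the width-selection policy above, with plain integers standing in for EVT and an illustrative helper name (pickNumSubs is not an LLVM API):

#include <cassert>
#include <cstdio>

// Decide how many pieces a BitWidth-bit operation is split into, mirroring the
// legal-width checks in SplitOpsAndApply: 512-bit with AVX512, 256-bit with
// AVX2, otherwise 128-bit.
static unsigned pickNumSubs(unsigned BitWidth, bool HasAVX512, bool HasAVX2) {
  unsigned Legal = HasAVX512 ? 512 : HasAVX2 ? 256 : 128;
  if (BitWidth <= Legal)
    return 1;
  assert(BitWidth % Legal == 0 && "Illegal vector size");
  return BitWidth / Legal;
}

int main() {
  // A 512-bit operation on an AVX2-only target is processed as 2 x 256 bits.
  std::printf("%u\n", pickNumSubs(512, /*HasAVX512=*/false, /*HasAVX2=*/true)); // 2
  // The same operation needs no splitting once 512-bit registers are usable.
  std::printf("%u\n", pickNumSubs(512, /*HasAVX512=*/true, /*HasAVX2=*/true));  // 1
  return 0;
}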
5837
5838/// Insert an i1-subvector into an i1-vector.
5839static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5840 const X86Subtarget &Subtarget) {
5841
5842 SDLoc dl(Op);
5843 SDValue Vec = Op.getOperand(0);
5844 SDValue SubVec = Op.getOperand(1);
5845 SDValue Idx = Op.getOperand(2);
5846
5847 if (!isa<ConstantSDNode>(Idx))
5848 return SDValue();
5849
5850 // Inserting undef is a nop. We can just return the original vector.
5851 if (SubVec.isUndef())
5852 return Vec;
5853
5854 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5855 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5856 return Op;
5857
5858 MVT OpVT = Op.getSimpleValueType();
5859 unsigned NumElems = OpVT.getVectorNumElements();
5860
5861 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5862
5863 // Extend to natively supported kshift.
5864 MVT WideOpVT = OpVT;
5865 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
5866 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5867
5868 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
5869 // if necessary.
5870 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
5871 // May need to promote to a legal type.
5872 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5873 DAG.getConstant(0, dl, WideOpVT),
5874 SubVec, Idx);
5875 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5876 }
5877
5878 MVT SubVecVT = SubVec.getSimpleValueType();
5879 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5880
5881 assert(IdxVal + SubVecNumElems <= NumElems &&
5882 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5883 "Unexpected index value in INSERT_SUBVECTOR");
5884
5885 SDValue Undef = DAG.getUNDEF(WideOpVT);
5886
5887 if (IdxVal == 0) {
5888 // Zero lower bits of the Vec
5889 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
5890 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
5891 ZeroIdx);
5892 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5893 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5894 // Merge them together, SubVec should be zero extended.
5895 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5896 DAG.getConstant(0, dl, WideOpVT),
5897 SubVec, ZeroIdx);
5898 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5899 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5900 }
5901
5902 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5903 Undef, SubVec, ZeroIdx);
5904
5905 if (Vec.isUndef()) {
5906 assert(IdxVal != 0 && "Unexpected index");
5907 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5908 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
5909 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5910 }
5911
5912 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5913 assert(IdxVal != 0 && "Unexpected index");
5914 NumElems = WideOpVT.getVectorNumElements();
5915 unsigned ShiftLeft = NumElems - SubVecNumElems;
5916 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5917 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5918 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
5919 if (ShiftRight != 0)
5920 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
5921 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
5922 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5923 }
5924
5925 // Simple case when we put the subvector in the upper part.
5926 if (IdxVal + SubVecNumElems == NumElems) {
5927 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5928 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
5929 if (SubVecNumElems * 2 == NumElems) {
5930 // Special case, use legal zero extending insert_subvector. This allows
5931 // isel to optimize when bits are known zero.
5932 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
5933 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5934 DAG.getConstant(0, dl, WideOpVT),
5935 Vec, ZeroIdx);
5936 } else {
5937 // Otherwise use explicit shifts to zero the bits.
5938 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5939 Undef, Vec, ZeroIdx);
5940 NumElems = WideOpVT.getVectorNumElements();
5941 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
5942 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5943 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5944 }
5945 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5946 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5947 }
5948
5949 // Inserting into the middle is more complicated.
5950
5951 NumElems = WideOpVT.getVectorNumElements();
5952
5953 // Widen the vector if needed.
5954 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5955
5956 unsigned ShiftLeft = NumElems - SubVecNumElems;
5957 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5958
5959 // Do an optimization for the most frequently used types.
5960 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
5961 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
5962 Mask0.flipAllBits();
5963 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
5964 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
5965 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
5966 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5967 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
5968 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
5969 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
5970 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5971
5972 // Reduce to original width if needed.
5973 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5974 }
5975
5976 // Clear the upper bits of the subvector and move it to its insert position.
5977 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5978 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
5979 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
5980 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
5981
5982 // Isolate the bits below the insertion point.
5983 unsigned LowShift = NumElems - IdxVal;
5984 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
5985 DAG.getTargetConstant(LowShift, dl, MVT::i8));
5986 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
5987 DAG.getTargetConstant(LowShift, dl, MVT::i8));
5988
5989 // Isolate the bits after the last inserted bit.
5990 unsigned HighShift = IdxVal + SubVecNumElems;
5991 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5992 DAG.getTargetConstant(HighShift, dl, MVT::i8));
5993 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
5994 DAG.getTargetConstant(HighShift, dl, MVT::i8));
5995
5996 // Now OR all 3 pieces together.
5997 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
5998 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
5999
6000 // Reduce to original width if needed.
6001 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6002}
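A minimal standalone sketch of the shift/OR arithmetic used above for the middle-insertion case, modelled on a 16-bit integer standing in for a v16i1 mask register (clamp16 and insertMaskBits are illustrative names, not LLVM APIs):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Keep only the low 16 bits, emulating the width of the widened k-register.
static uint16_t clamp16(unsigned V) { return (uint16_t)(V & 0xFFFFu); }

// Insert a SubBits-wide mask into Vec at bit position Idx using only shifts
// and ORs, the same scheme as the KSHIFTL/KSHIFTR sequences above.
static uint16_t insertMaskBits(uint16_t Vec, uint16_t SubVec, unsigned SubBits,
                               unsigned Idx) {
  const unsigned N = 16;
  assert(Idx + SubBits <= N && "Unexpected index value");
  // Clear the subvector's upper bits, then drop it at position Idx.
  uint16_t Sub =
      clamp16(clamp16((unsigned)SubVec << (N - SubBits)) >> (N - SubBits - Idx));
  // Isolate the bits below and above the insertion window.
  uint16_t Low = clamp16(clamp16((unsigned)Vec << (N - Idx)) >> (N - Idx));
  uint16_t High = clamp16(clamp16((unsigned)Vec >> (Idx + SubBits)) << (Idx + SubBits));
  return (uint16_t)(Low | High | Sub);
}

int main() {
  // Insert the 4-bit mask 0b1011 into 0xFFFF at bit 4 -> 0xFFBF.
  std::printf("0x%04X\n", (unsigned)insertMaskBits(0xFFFF, 0xB, 4, 4));
  return 0;
}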
6003
6004static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
6005 const SDLoc &dl) {
6006 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
6007 EVT SubVT = V1.getValueType();
6008 EVT SubSVT = SubVT.getScalarType();
6009 unsigned SubNumElts = SubVT.getVectorNumElements();
6010 unsigned SubVectorWidth = SubVT.getSizeInBits();
6011 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
6012 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
6013 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
6014}
6015
6016/// Returns a vector of the specified type with all bits set.
6017/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
6018/// Then bitcast to their original type, ensuring they get CSE'd.
6019static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6020 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6021 "Expected a 128/256/512-bit vector type");
6022
6023 APInt Ones = APInt::getAllOnesValue(32);
6024 unsigned NumElts = VT.getSizeInBits() / 32;
6025 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
6026 return DAG.getBitcast(VT, Vec);
6027}
6028
6029// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
6030static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
6031 switch (Opcode) {
6032 case ISD::ANY_EXTEND:
6033 case ISD::ANY_EXTEND_VECTOR_INREG:
6034 return ISD::ANY_EXTEND_VECTOR_INREG;
6035 case ISD::ZERO_EXTEND:
6036 case ISD::ZERO_EXTEND_VECTOR_INREG:
6037 return ISD::ZERO_EXTEND_VECTOR_INREG;
6038 case ISD::SIGN_EXTEND:
6039 case ISD::SIGN_EXTEND_VECTOR_INREG:
6040 return ISD::SIGN_EXTEND_VECTOR_INREG;
6041 }
6042 llvm_unreachable("Unknown opcode");
6043}
6044
6045static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
6046 SDValue In, SelectionDAG &DAG) {
6047 EVT InVT = In.getValueType();
6048 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
6049 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
6050 ISD::ZERO_EXTEND == Opcode) &&
6051 "Unknown extension opcode");
6052
6053 // For 256-bit vectors, we only need the lower (128-bit) input half.
6054 // For 512-bit vectors, we only need the lower input half or quarter.
6055 if (InVT.getSizeInBits() > 128) {
6056 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
6057 "Expected VTs to be the same size!");
6058 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
6059 In = extractSubVector(In, 0, DAG, DL,
6060 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
6061 InVT = In.getValueType();
6062 }
6063
6064 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
6065 Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
6066
6067 return DAG.getNode(Opcode, DL, VT, In);
6068}
6069
6070// Match (xor X, -1) -> X.
6071// Match extract_subvector(xor X, -1) -> extract_subvector(X).
6072// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
6073static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
6074 V = peekThroughBitcasts(V);
6075 if (V.getOpcode() == ISD::XOR &&
6076 ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
6077 return V.getOperand(0);
6078 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6079 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
6080 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
6081 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
6082 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
6083 Not, V.getOperand(1));
6084 }
6085 }
6086 SmallVector<SDValue, 2> CatOps;
6087 if (collectConcatOps(V.getNode(), CatOps)) {
6088 for (SDValue &CatOp : CatOps) {
6089 SDValue NotCat = IsNOT(CatOp, DAG);
6090 if (!NotCat) return SDValue();
6091 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
6092 }
6093 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
6094 }
6095 return SDValue();
6096}
6097
6098/// Returns a vector_shuffle node for an unpackl operation.
6099static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
6100 SDValue V1, SDValue V2) {
6101 SmallVector<int, 8> Mask;
6102 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
6103 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6104}
6105
6106/// Returns a vector_shuffle node for an unpackh operation.
6107static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
6108 SDValue V1, SDValue V2) {
6109 SmallVector<int, 8> Mask;
6110 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
6111 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6112}
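For reference, a small standalone sketch of the interleaving pattern these unpack helpers request, assuming a single 128-bit lane (unpackMask is an illustrative name, not the createUnpackShuffleMask API; wider vectors repeat the same pattern per lane):

#include <cstdio>
#include <vector>

// Build the mask that unpcklo/unpckhi produce for NumElts elements in one
// lane: alternate elements from V1 (indices 0..NumElts-1) and V2 (NumElts..).
static std::vector<int> unpackMask(unsigned NumElts, bool Lo) {
  std::vector<int> Mask;
  unsigned Start = Lo ? 0 : NumElts / 2;
  for (unsigned i = 0; i != NumElts / 2; ++i) {
    Mask.push_back((int)(Start + i));            // element from V1
    Mask.push_back((int)(Start + i + NumElts));  // element from V2
  }
  return Mask;
}

int main() {
  for (int M : unpackMask(4, /*Lo=*/true))  std::printf("%d ", M); // 0 4 1 5
  std::printf("\n");
  for (int M : unpackMask(4, /*Lo=*/false)) std::printf("%d ", M); // 2 6 3 7
  std::printf("\n");
  return 0;
}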
6113
6114/// Return a vector_shuffle of the specified vector and a zero or undef vector.
6115/// This produces a shuffle where the low element of V2 is swizzled into the
6116/// zero/undef vector, landing at element Idx.
6117/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
6118static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
6119 bool IsZero,
6120 const X86Subtarget &Subtarget,
6121 SelectionDAG &DAG) {
6122 MVT VT = V2.getSimpleValueType();
6123 SDValue V1 = IsZero
6124 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
6125 int NumElems = VT.getVectorNumElements();
6126 SmallVector<int, 16> MaskVec(NumElems);
6127 for (int i = 0; i != NumElems; ++i)
6128 // If this is the insertion idx, put the low elt of V2 here.
6129 MaskVec[i] = (i == Idx) ? NumElems : i;
6130 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
6131}
6132
6133static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
6134 if (!Load || !ISD::isNormalLoad(Load))
6135 return nullptr;
6136
6137 SDValue Ptr = Load->getBasePtr();
6138 if (Ptr->getOpcode() == X86ISD::Wrapper ||
6139 Ptr->getOpcode() == X86ISD::WrapperRIP)
6140 Ptr = Ptr->getOperand(0);
6141
6142 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
6143 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
6144 return nullptr;
6145
6146 return CNode->getConstVal();
6147}
6148
6149static const Constant *getTargetConstantFromNode(SDValue Op) {
6150 Op = peekThroughBitcasts(Op);
6151 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
6152}
6153
6154const Constant *
6155X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
6156 assert(LD && "Unexpected null LoadSDNode");
6157 return getTargetConstantFromNode(LD);
6158}
6159
6160// Extract raw constant bits from constant pools.
6161static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
6162 APInt &UndefElts,
6163 SmallVectorImpl<APInt> &EltBits,
6164 bool AllowWholeUndefs = true,
6165 bool AllowPartialUndefs = true) {
6166 assert(EltBits.empty() && "Expected an empty EltBits vector");
6167
6168 Op = peekThroughBitcasts(Op);
6169
6170 EVT VT = Op.getValueType();
6171 unsigned SizeInBits = VT.getSizeInBits();
6172 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
6173 unsigned NumElts = SizeInBits / EltSizeInBits;
6174
6175 // Bitcast a source array of element bits to the target size.
6176 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
6177 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
6178 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
6179 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
6180 "Constant bit sizes don't match");
6181
6182 // Don't split if we don't allow undef bits.
6183 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
6184 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
6185 return false;
6186
6187 // If we're already the right size, don't bother bitcasting.
6188 if (NumSrcElts == NumElts) {
6189 UndefElts = UndefSrcElts;
6190 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
6191 return true;
6192 }
6193
6194 // Extract all the undef/constant element data and pack into single bitsets.
6195 APInt UndefBits(SizeInBits, 0);
6196 APInt MaskBits(SizeInBits, 0);
6197
6198 for (unsigned i = 0; i != NumSrcElts; ++i) {
6199 unsigned BitOffset = i * SrcEltSizeInBits;
6200 if (UndefSrcElts[i])
6201 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
6202 MaskBits.insertBits(SrcEltBits[i], BitOffset);
6203 }
6204
6205 // Split the undef/constant single bitset data into the target elements.
6206 UndefElts = APInt(NumElts, 0);
6207 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
6208
6209 for (unsigned i = 0; i != NumElts; ++i) {
6210 unsigned BitOffset = i * EltSizeInBits;
6211 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
6212
6213 // Only treat an element as UNDEF if all bits are UNDEF.
6214 if (UndefEltBits.isAllOnesValue()) {
6215 if (!AllowWholeUndefs)
6216 return false;
6217 UndefElts.setBit(i);
6218 continue;
6219 }
6220
6221 // If only some bits are UNDEF then treat them as zero (or bail if not
6222 // supported).
6223 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
6224 return false;
6225
6226 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
6227 }
6228 return true;
6229 };
6230
6231 // Collect constant bits and insert into mask/undef bit masks.
6232 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
6233 unsigned UndefBitIndex) {
6234 if (!Cst)
6235 return false;
6236 if (isa<UndefValue>(Cst)) {
6237 Undefs.setBit(UndefBitIndex);
6238 return true;
6239 }
6240 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
6241 Mask = CInt->getValue();
6242 return true;
6243 }
6244 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
6245 Mask = CFP->getValueAPF().bitcastToAPInt();
6246 return true;
6247 }
6248 return false;
6249 };
6250
6251 // Handle UNDEFs.
6252 if (Op.isUndef()) {
6253 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
6254 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
6255 return CastBitData(UndefSrcElts, SrcEltBits);
6256 }
6257
6258 // Extract scalar constant bits.
6259 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
6260 APInt UndefSrcElts = APInt::getNullValue(1);
6261 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
6262 return CastBitData(UndefSrcElts, SrcEltBits);
6263 }
6264 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
6265 APInt UndefSrcElts = APInt::getNullValue(1);
6266 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6267 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
6268 return CastBitData(UndefSrcElts, SrcEltBits);
6269 }
6270
6271 // Extract constant bits from build vector.
6272 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6273 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6274 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6275
6276 APInt UndefSrcElts(NumSrcElts, 0);
6277 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6278 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6279 const SDValue &Src = Op.getOperand(i);
6280 if (Src.isUndef()) {
6281 UndefSrcElts.setBit(i);
6282 continue;
6283 }
6284 auto *Cst = cast<ConstantSDNode>(Src);
6285 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
6286 }
6287 return CastBitData(UndefSrcElts, SrcEltBits);
6288 }
6289 if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
6290 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6291 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6292
6293 APInt UndefSrcElts(NumSrcElts, 0);
6294 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6295 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6296 const SDValue &Src = Op.getOperand(i);
6297 if (Src.isUndef()) {
6298 UndefSrcElts.setBit(i);
6299 continue;
6300 }
6301 auto *Cst = cast<ConstantFPSDNode>(Src);
6302 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6303 SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
6304 }
6305 return CastBitData(UndefSrcElts, SrcEltBits);
6306 }
6307
6308 // Extract constant bits from constant pool vector.
6309 if (auto *Cst = getTargetConstantFromNode(Op)) {
6310 Type *CstTy = Cst->getType();
6311 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6312 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
6313 return false;
6314
6315 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
6316 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6317
6318 APInt UndefSrcElts(NumSrcElts, 0);
6319 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6320 for (unsigned i = 0; i != NumSrcElts; ++i)
6321 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
6322 UndefSrcElts, i))
6323 return false;
6324
6325 return CastBitData(UndefSrcElts, SrcEltBits);
6326 }
6327
6328 // Extract constant bits from a broadcasted constant pool scalar.
6329 if (Op.getOpcode() == X86ISD::VBROADCAST &&
6330 EltSizeInBits <= VT.getScalarSizeInBits()) {
6331 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
6332 unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
6333 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6334
6335 APInt UndefSrcElts(NumSrcElts, 0);
6336 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
6337 if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
6338 if (UndefSrcElts[0])
6339 UndefSrcElts.setBits(0, NumSrcElts);
6340 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
6341 return CastBitData(UndefSrcElts, SrcEltBits);
6342 }
6343 }
6344 }
6345
6346 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
6347 EltSizeInBits <= VT.getScalarSizeInBits()) {
6348 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6349 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
6350 return false;
6351
6352 SDValue Ptr = MemIntr->getBasePtr();
6353 if (Ptr->getOpcode() == X86ISD::Wrapper ||
6354 Ptr->getOpcode() == X86ISD::WrapperRIP)
6355 Ptr = Ptr->getOperand(0);
6356
6357 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
6358 if (!CNode || CNode->isMachineConstantPoolEntry() ||
6359 CNode->getOffset() != 0)
6360 return false;
6361
6362 if (const Constant *C = CNode->getConstVal()) {
6363 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
6364 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6365
6366 APInt UndefSrcElts(NumSrcElts, 0);
6367 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
6368 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
6369 if (UndefSrcElts[0])
6370 UndefSrcElts.setBits(0, NumSrcElts);
6371 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
6372 return CastBitData(UndefSrcElts, SrcEltBits);
6373 }
6374 }
6375 }
6376
6377 // Extract constant bits from a subvector broadcast.
6378 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
6379 SmallVector<APInt, 16> SubEltBits;
6380 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6381 UndefElts, SubEltBits, AllowWholeUndefs,
6382 AllowPartialUndefs)) {
6383 UndefElts = APInt::getSplat(NumElts, UndefElts);
6384 while (EltBits.size() < NumElts)
6385 EltBits.append(SubEltBits.begin(), SubEltBits.end());
6386 return true;
6387 }
6388 }
6389
6390 // Extract a rematerialized scalar constant insertion.
6391 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
6392 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
6393 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
6394 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6395 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6396
6397 APInt UndefSrcElts(NumSrcElts, 0);
6398 SmallVector<APInt, 64> SrcEltBits;
6399 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
6400 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
6401 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
6402 return CastBitData(UndefSrcElts, SrcEltBits);
6403 }
6404
6405 // Insert constant bits from base and subvector sources.
6406 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
6407 isa<ConstantSDNode>(Op.getOperand(2))) {
6408 // TODO - support insert_subvector through bitcasts.
6409 if (EltSizeInBits != VT.getScalarSizeInBits())
6410 return false;
6411
6412 APInt UndefSubElts;
6413 SmallVector<APInt, 32> EltSubBits;
6414 if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
6415 UndefSubElts, EltSubBits,
6416 AllowWholeUndefs, AllowPartialUndefs) &&
6417 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6418 UndefElts, EltBits, AllowWholeUndefs,
6419 AllowPartialUndefs)) {
6420 unsigned BaseIdx = Op.getConstantOperandVal(2);
6421 UndefElts.insertBits(UndefSubElts, BaseIdx);
6422 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
6423 EltBits[BaseIdx + i] = EltSubBits[i];
6424 return true;
6425 }
6426 }
6427
6428 // Extract constant bits from a subvector's source.
6429 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6430 isa<ConstantSDNode>(Op.getOperand(1))) {
6431 // TODO - support extract_subvector through bitcasts.
6432 if (EltSizeInBits != VT.getScalarSizeInBits())
6433 return false;
6434
6435 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6436 UndefElts, EltBits, AllowWholeUndefs,
6437 AllowPartialUndefs)) {
6438 EVT SrcVT = Op.getOperand(0).getValueType();
6439 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6440 unsigned NumSubElts = VT.getVectorNumElements();
6441 unsigned BaseIdx = Op.getConstantOperandVal(1);
6442 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
6443 if ((BaseIdx + NumSubElts) != NumSrcElts)
6444 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
6445 if (BaseIdx != 0)
6446 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
6447 return true;
6448 }
6449 }
6450
6451 // Extract constant bits from shuffle node sources.
6452 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
6453 // TODO - support shuffle through bitcasts.
6454 if (EltSizeInBits != VT.getScalarSizeInBits())
6455 return false;
6456
6457 ArrayRef<int> Mask = SVN->getMask();
6458 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
6459 llvm::any_of(Mask, [](int M) { return M < 0; }))
6460 return false;
6461
6462 APInt UndefElts0, UndefElts1;
6463 SmallVector<APInt, 32> EltBits0, EltBits1;
6464 if (isAnyInRange(Mask, 0, NumElts) &&
6465 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6466 UndefElts0, EltBits0, AllowWholeUndefs,
6467 AllowPartialUndefs))
6468 return false;
6469 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
6470 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
6471 UndefElts1, EltBits1, AllowWholeUndefs,
6472 AllowPartialUndefs))
6473 return false;
6474
6475 UndefElts = APInt::getNullValue(NumElts);
6476 for (int i = 0; i != (int)NumElts; ++i) {
6477 int M = Mask[i];
6478 if (M < 0) {
6479 UndefElts.setBit(i);
6480 EltBits.push_back(APInt::getNullValue(EltSizeInBits));
6481 } else if (M < (int)NumElts) {
6482 if (UndefElts0[M])
6483 UndefElts.setBit(i);
6484 EltBits.push_back(EltBits0[M]);
6485 } else {
6486 if (UndefElts1[M - NumElts])
6487 UndefElts.setBit(i);
6488 EltBits.push_back(EltBits1[M - NumElts]);
6489 }
6490 }
6491 return true;
6492 }
6493
6494 return false;
6495}
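A minimal standalone sketch of the bit repacking performed by the CastBitData lambda above, with undef tracking omitted and uint64_t standing in for APInt (repackBits is an illustrative name):

#include <cstdint>
#include <cstdio>
#include <vector>

// Repack SrcBits-wide constants into DstBits-wide elements by concatenating
// them into one little-endian bit string and re-slicing it.
static std::vector<uint64_t> repackBits(const std::vector<uint64_t> &Src,
                                        unsigned SrcBits, unsigned DstBits) {
  unsigned TotalBits = SrcBits * (unsigned)Src.size();
  std::vector<uint64_t> Dst(TotalBits / DstBits, 0);
  for (unsigned i = 0; i != TotalBits; ++i) {
    uint64_t Bit = (Src[i / SrcBits] >> (i % SrcBits)) & 1;
    Dst[i / DstBits] |= Bit << (i % DstBits);
  }
  return Dst;
}

int main() {
  // <4 x i16> {0x1111, 0x2222, 0x3333, 0x4444} viewed as <2 x i32>.
  for (uint64_t V : repackBits({0x1111, 0x2222, 0x3333, 0x4444}, 16, 32))
    std::printf("0x%08llX\n", (unsigned long long)V); // 0x22221111, 0x44443333
  return 0;
}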
6496
6497namespace llvm {
6498namespace X86 {
6499bool isConstantSplat(SDValue Op, APInt &SplatVal) {
6500 APInt UndefElts;
6501 SmallVector<APInt, 16> EltBits;
6502 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
6503 UndefElts, EltBits, true, false)) {
6504 int SplatIndex = -1;
6505 for (int i = 0, e = EltBits.size(); i != e; ++i) {
6506 if (UndefElts[i])
6507 continue;
6508 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
6509 SplatIndex = -1;
6510 break;
6511 }
6512 SplatIndex = i;
6513 }
6514 if (0 <= SplatIndex) {
6515 SplatVal = EltBits[SplatIndex];
6516 return true;
6517 }
6518 }
6519
6520 return false;
6521}
6522} // namespace X86
6523} // namespace llvm
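A small standalone sketch of the splat scan in X86::isConstantSplat, with plain vectors standing in for the UndefElts/EltBits APInts (findSplat is an illustrative name):

#include <cstdint>
#include <cstdio>
#include <vector>

// Every non-undef element must carry the same bits; undef elements are
// ignored. Returns false if no defined element exists or the values differ.
static bool findSplat(const std::vector<bool> &Undef,
                      const std::vector<uint64_t> &Bits, uint64_t &SplatVal) {
  int SplatIndex = -1;
  for (int i = 0, e = (int)Bits.size(); i != e; ++i) {
    if (Undef[(unsigned)i])
      continue;
    if (SplatIndex >= 0 && Bits[(unsigned)i] != Bits[(unsigned)SplatIndex])
      return false;
    SplatIndex = i;
  }
  if (SplatIndex < 0)
    return false;
  SplatVal = Bits[(unsigned)SplatIndex];
  return true;
}

int main() {
  uint64_t V = 0;
  bool Ok = findSplat({false, true, false}, {7, 0, 7}, V); // undef in the middle
  std::printf("%d %llu\n", (int)Ok, (unsigned long long)V); // 1 7
  return 0;
}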
6524
6525static bool getTargetShuffleMaskIndices(SDValue MaskNode,
6526 unsigned MaskEltSizeInBits,
6527 SmallVectorImpl<uint64_t> &RawMask,
6528 APInt &UndefElts) {
6529 // Extract the raw target constant bits.
6530 SmallVector<APInt, 64> EltBits;
6531 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
6532 EltBits, /* AllowWholeUndefs */ true,
6533 /* AllowPartialUndefs */ false))
6534 return false;
6535
6536 // Insert the extracted elements into the mask.
6537 for (APInt Elt : EltBits)
6538 RawMask.push_back(Elt.getZExtValue());
6539
6540 return true;
6541}
6542
6543/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
6544/// Note: This ignores saturation, so inputs must be checked first.
6545static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6546 bool Unary) {
6547 assert(Mask.empty() && "Expected an empty shuffle mask vector");
6548 unsigned NumElts = VT.getVectorNumElements();
6549 unsigned NumLanes = VT.getSizeInBits() / 128;
6550 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
6551 unsigned Offset = Unary ? 0 : NumElts;
6552
6553 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
6554 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
6555 Mask.push_back(Elt + (Lane * NumEltsPerLane));
6556 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
6557 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
6558 }
6559}
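A short standalone sketch reproducing the mask layout above for a single-lane pack, e.g. a v8i16 result built from two v4i32 inputs (packMask is an illustrative name; wider vectors repeat the pattern per 128-bit lane):

#include <cstdio>
#include <vector>

// NumElts result elements, EltsPerLane of them per 128-bit lane: take the even
// (low) halves of the first operand's lane, then of the second operand's lane.
static std::vector<int> packMask(unsigned NumElts, unsigned EltsPerLane,
                                 bool Unary) {
  std::vector<int> Mask;
  unsigned NumLanes = NumElts / EltsPerLane;
  unsigned Offset = Unary ? 0 : NumElts;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != EltsPerLane; Elt += 2)
      Mask.push_back((int)(Elt + Lane * EltsPerLane));
    for (unsigned Elt = 0; Elt != EltsPerLane; Elt += 2)
      Mask.push_back((int)(Elt + Lane * EltsPerLane + Offset));
  }
  return Mask;
}

int main() {
  // Two distinct inputs: 0 2 4 6 8 10 12 14.  Unary: 0 2 4 6 0 2 4 6.
  for (int M : packMask(8, 8, /*Unary=*/false)) std::printf("%d ", M);
  std::printf("\n");
  return 0;
}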
6560
6561// Split the demanded elts of a PACKSS/PACKUS node between its operands.
6562static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
6563 APInt &DemandedLHS, APInt &DemandedRHS) {
6564 int NumLanes = VT.getSizeInBits() / 128;
6565 int NumElts = DemandedElts.getBitWidth();
6566 int NumInnerElts = NumElts / 2;
6567 int NumEltsPerLane = NumElts / NumLanes;
6568 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
6569
6570 DemandedLHS = APInt::getNullValue(NumInnerElts);
6571 DemandedRHS = APInt::getNullValue(NumInnerElts);
6572
6573 // Map DemandedElts to the packed operands.
6574 for (int Lane = 0; Lane != NumLanes; ++Lane) {
6575 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
6576 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
6577 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
6578 if (DemandedElts[OuterIdx])
6579 DemandedLHS.setBit(InnerIdx);
6580 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
6581 DemandedRHS.setBit(InnerIdx);
6582 }
6583 }
6584}
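A quick standalone illustration of that mapping for a single 128-bit lane (mapPackElt is an illustrative name):

#include <cstdio>

// For a pack result with NumElts elements in one lane, the first half of the
// lane reads from the LHS operand and the second half from the RHS operand.
static void mapPackElt(unsigned NumElts, unsigned OuterIdx) {
  unsigned NumInnerElts = NumElts / 2;
  bool FromRHS = OuterIdx >= NumInnerElts;
  unsigned InnerIdx = FromRHS ? OuterIdx - NumInnerElts : OuterIdx;
  std::printf("result[%u] <- %s[%u]\n", OuterIdx, FromRHS ? "RHS" : "LHS",
              InnerIdx);
}

int main() {
  mapPackElt(16, 3);  // result[3]  <- LHS[3]
  mapPackElt(16, 10); // result[10] <- RHS[2]
  return 0;
}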
6585
6586// Split the demanded elts of a HADD/HSUB node between its operands.
6587static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
6588 APInt &DemandedLHS, APInt &DemandedRHS) {
6589 int NumLanes = VT.getSizeInBits() / 128;
6590 int NumElts = DemandedElts.getBitWidth();
6591 int NumEltsPerLane = NumElts / NumLanes;
6592 int HalfEltsPerLane = NumEltsPerLane / 2;
6593
6594 DemandedLHS = APInt::getNullValue(NumElts);
6595 DemandedRHS = APInt::getNullValue(NumElts);
6596
6597 // Map DemandedElts to the horizontal operands.
6598 for (int Idx = 0; Idx != NumElts; ++Idx) {
6599 if (!DemandedElts[Idx])
6600 continue;
6601 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
6602 int LocalIdx = Idx % NumEltsPerLane;
6603 if (LocalIdx < HalfEltsPerLane) {
6604 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6605 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6606 } else {
6607 LocalIdx -= HalfEltsPerLane;
6608 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6609 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6610 }
6611 }
6612}
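And a similar standalone illustration for the horizontal-op mapping, assuming 4 elements per 128-bit lane as in a v8i32 HADD (mapHorizElt is an illustrative name):

#include <cstdio>

// Each result element sums one adjacent pair: the first half of a lane reads
// pairs from LHS, the second half reads pairs from RHS, always within the
// same 128-bit lane.
static void mapHorizElt(unsigned EltsPerLane, unsigned Idx) {
  unsigned Half = EltsPerLane / 2;
  unsigned LaneIdx = (Idx / EltsPerLane) * EltsPerLane;
  unsigned LocalIdx = Idx % EltsPerLane;
  const char *Op = LocalIdx < Half ? "LHS" : "RHS";
  if (LocalIdx >= Half)
    LocalIdx -= Half;
  std::printf("result[%u] <- %s[%u] + %s[%u]\n", Idx, Op,
              LaneIdx + 2 * LocalIdx, Op, LaneIdx + 2 * LocalIdx + 1);
}

int main() {
  mapHorizElt(4, 1); // result[1] <- LHS[2] + LHS[3]
  mapHorizElt(4, 5); // result[5] <- LHS[6] + LHS[7]  (second lane)
  return 0;
}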
6613
6614/// Calculates the shuffle mask corresponding to the target-specific opcode.
6615/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
6616/// operands in \p Ops, and returns true.
6617/// Sets \p IsUnary to true if only one source is used. Note that this will set
6618/// IsUnary for shuffles which use a single input multiple times, and in those
6619/// cases it will adjust the mask to only have indices within that single input.
6620/// It is an error to call this with non-empty Mask/Ops vectors.
6621static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
6622 SmallVectorImpl<SDValue> &Ops,
6623 SmallVectorImpl<int> &Mask, bool &IsUnary) {
6624 unsigned NumElems = VT.getVectorNumElements();
6625 unsigned MaskEltSize = VT.getScalarSizeInBits();
6626 SmallVector<uint64_t, 32> RawMask;
6627 APInt RawUndefs;
6628 uint64_t ImmN;
6629
6630 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
6631 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
6632
6633 IsUnary = false;
6634 bool IsFakeUnary = false;
6635 switch (N->getOpcode()) {
6636 case X86ISD::BLENDI:
6637 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6638 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6639 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6640 DecodeBLENDMask(NumElems, ImmN, Mask);
6641 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6642 break;
6643 case X86ISD::SHUFP:
6644 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6645 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6646 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6647 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
6648 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6649 break;
6650 case X86ISD::INSERTPS:
6651 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6652 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6653 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6654 DecodeINSERTPSMask(ImmN, Mask);
6655 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6656 break;
6657 case X86ISD::EXTRQI:
6658 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6659 if (isa<ConstantSDNode>(N->getOperand(1)) &&
6660 isa<ConstantSDNode>(N->getOperand(2))) {
6661 int BitLen = N->getConstantOperandVal(1);
6662 int BitIdx = N->getConstantOperandVal(2);
6663 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
6664 IsUnary = true;
6665 }
6666 break;
6667 case X86ISD::INSERTQI:
6668 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6669 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6670 if (isa<ConstantSDNode>(N->getOperand(2)) &&
6671 isa<ConstantSDNode>(N->getOperand(3))) {
6672 int BitLen = N->getConstantOperandVal(2);
6673 int BitIdx = N->getConstantOperandVal(3);
6674 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
6675 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6676 }
6677 break;
6678 case X86ISD::UNPCKH:
6679 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6680 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6681 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
6682 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6683 break;
6684 case X86ISD::UNPCKL:
6685 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6686 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6687 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
6688 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6689 break;
6690 case X86ISD::MOVHLPS:
6691 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6692 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6693 DecodeMOVHLPSMask(NumElems, Mask);
6694 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6695 break;
6696 case X86ISD::MOVLHPS:
6697 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6698 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6699 DecodeMOVLHPSMask(NumElems, Mask);
6700 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6701 break;
6702 case X86ISD::PALIGNR:
6703 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6704 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6705 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6706 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6707 DecodePALIGNRMask(NumElems, ImmN, Mask);
6708 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6709 Ops.push_back(N->getOperand(1));
6710 Ops.push_back(N->getOperand(0));
6711 break;
6712 case X86ISD::VSHLDQ:
6713 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6714 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6715 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6716 DecodePSLLDQMask(NumElems, ImmN, Mask);
6717 IsUnary = true;
6718 break;
6719 case X86ISD::VSRLDQ:
6720 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6721 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6722 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6723 DecodePSRLDQMask(NumElems, ImmN, Mask);
6724 IsUnary = true;
6725 break;
6726 case X86ISD::PSHUFD:
6727 case X86ISD::VPERMILPI:
6728 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6729 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6730 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
6731 IsUnary = true;
6732 break;
6733 case X86ISD::PSHUFHW:
6734 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6735 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6736 DecodePSHUFHWMask(NumElems, ImmN, Mask);
6737 IsUnary = true;
6738 break;
6739 case X86ISD::PSHUFLW:
6740 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6741 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6742 DecodePSHUFLWMask(NumElems, ImmN, Mask);
6743 IsUnary = true;
6744 break;
6745 case X86ISD::VZEXT_MOVL:
6746 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6747 DecodeZeroMoveLowMask(NumElems, Mask);
6748 IsUnary = true;
6749 break;
6750 case X86ISD::VBROADCAST: {
6751 SDValue N0 = N->getOperand(0);
6752 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
6753 // add the pre-extracted value to the Ops vector.
6754 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6755 N0.getOperand(0).getValueType() == VT &&
6756 N0.getConstantOperandVal(1) == 0)
6757 Ops.push_back(N0.getOperand(0));
6758
6759 // We only decode broadcasts of same-sized vectors, unless the broadcast
6760 // came from an extract from the original width. If we found one, we
6761 // pushed it into the Ops vector above.
6762 if (N0.getValueType() == VT || !Ops.empty()) {
6763 DecodeVectorBroadcast(NumElems, Mask);
6764 IsUnary = true;
6765 break;
6766 }
6767 return false;
6768 }
6769 case X86ISD::VPERMILPV: {
6770 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6771 IsUnary = true;
6772 SDValue MaskNode = N->getOperand(1);
6773 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6774 RawUndefs)) {
6775 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
6776 break;
6777 }
6778 return false;
6779 }
6780 case X86ISD::PSHUFB: {
6781 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6782 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6783 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6784 IsUnary = true;
6785 SDValue MaskNode = N->getOperand(1);
6786 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
6787 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
6788 break;
6789 }
6790 return false;
6791 }
6792 case X86ISD::VPERMI:
6793 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6794 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6795 DecodeVPERMMask(NumElems, ImmN, Mask);
6796 IsUnary = true;
6797 break;
6798 case X86ISD::MOVSS:
6799 case X86ISD::MOVSD:
6800 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6801 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6802 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
6803 break;
6804 case X86ISD::VPERM2X128:
6805 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6806 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6807 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6808 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
6809 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6810 break;
6811 case X86ISD::SHUF128:
6812 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6813 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6814 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6815 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
6816 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6817 break;
6818 case X86ISD::MOVSLDUP:
6819 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6820 DecodeMOVSLDUPMask(NumElems, Mask);
6821 IsUnary = true;
6822 break;
6823 case X86ISD::MOVSHDUP:
6824 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6825 DecodeMOVSHDUPMask(NumElems, Mask);
6826 IsUnary = true;
6827 break;
6828 case X86ISD::MOVDDUP:
6829 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6830 DecodeMOVDDUPMask(NumElems, Mask);
6831 IsUnary = true;
6832 break;
6833 case X86ISD::VPERMIL2: {
6834 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6835 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6836 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6837 SDValue MaskNode = N->getOperand(2);
6838 SDValue CtrlNode = N->getOperand(3);
6839 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
6840 unsigned CtrlImm = CtrlOp->getZExtValue();
6841 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6842 RawUndefs)) {
6843 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
6844 Mask);
6845 break;
6846 }
6847 }
6848 return false;
6849 }
6850 case X86ISD::VPPERM: {
6851 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6852 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6853 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6854 SDValue MaskNode = N->getOperand(2);
6855 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
6856 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
6857 break;
6858 }
6859 return false;
6860 }
6861 case X86ISD::VPERMV: {
6862 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6863 IsUnary = true;
6864 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
6865 Ops.push_back(N->getOperand(1));
6866 SDValue MaskNode = N->getOperand(0);
6867 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6868 RawUndefs)) {
6869 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
6870 break;
6871 }
6872 return false;
6873 }
6874 case X86ISD::VPERMV3: {
6875 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6876 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
6877 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
6878 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
6879 Ops.push_back(N->getOperand(0));
6880 Ops.push_back(N->getOperand(2));
6881 SDValue MaskNode = N->getOperand(1);
6882 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6883 RawUndefs)) {
6884 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
6885 break;
6886 }
6887 return false;
6888 }
6889 default: llvm_unreachable("unknown target shuffle node");
6890 }
6891
6892 // Empty mask indicates the decode failed.
6893 if (Mask.empty())
6894 return false;
6895
6896 // Check if we're getting a shuffle mask with zero'd elements.
6897 if (!AllowSentinelZero)
6898 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
6899 return false;
6900
6901 // If we have a fake unary shuffle, the shuffle mask is spread across two
6902 // inputs that are actually the same node. Re-map the mask to always point
6903 // into the first input.
6904 if (IsFakeUnary)
6905 for (int &M : Mask)
6906 if (M >= (int)Mask.size())
6907 M -= Mask.size();
6908
6909 // If we didn't already add operands in the opcode-specific code, default to
6910 // adding 1 or 2 operands starting at 0.
6911 if (Ops.empty()) {
6912 Ops.push_back(N->getOperand(0));
6913 if (!IsUnary || IsFakeUnary)
6914 Ops.push_back(N->getOperand(1));
6915 }
6916
6917 return true;
6918}
6919
6920/// Compute whether each element of a shuffle is zeroable.
6921///
6922/// A "zeroable" vector shuffle element is one which can be lowered to zero.
6923/// Either it is an undef element in the shuffle mask, the element of the input
6924/// referenced is undef, or the element of the input referenced is known to be
6925/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
6926/// as many lanes with this technique as possible to simplify the remaining
6927/// shuffle.
6928static void computeZeroableShuffleElements(ArrayRef<int> Mask,
6929 SDValue V1, SDValue V2,
6930 APInt &KnownUndef, APInt &KnownZero) {
6931 int Size = Mask.size();
6932 KnownUndef = KnownZero = APInt::getNullValue(Size);
6933
6934 V1 = peekThroughBitcasts(V1);
6935 V2 = peekThroughBitcasts(V2);
6936
6937 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
6938 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
6939
6940 int VectorSizeInBits = V1.getValueSizeInBits();
6941 int ScalarSizeInBits = VectorSizeInBits / Size;
6942 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
6943
6944 for (int i = 0; i < Size; ++i) {
6945 int M = Mask[i];
6946 // Handle the easy cases.
6947 if (M < 0) {
6948 KnownUndef.setBit(i);
6949 continue;
6950 }
6951 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
6952 KnownZero.setBit(i);
6953 continue;
6954 }
6955
6956 // Determine shuffle input and normalize the mask.
6957 SDValue V = M < Size ? V1 : V2;
6958 M %= Size;
6959
6960 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
6961 if (V.getOpcode() != ISD::BUILD_VECTOR)
6962 continue;
6963
6964 // If the BUILD_VECTOR has fewer elements than the bitcasted portion of
6965 // the (larger) source element must be UNDEF/ZERO.
6966 if ((Size % V.getNumOperands()) == 0) {
6967 int Scale = Size / V->getNumOperands();
6968 SDValue Op = V.getOperand(M / Scale);
6969 if (Op.isUndef())
6970 KnownUndef.setBit(i);
6971 if (X86::isZeroNode(Op))
6972 KnownZero.setBit(i);
6973 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
6974 APInt Val = Cst->getAPIntValue();
6975 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
6976 if (Val == 0)
6977 KnownZero.setBit(i);
6978 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
6979 APInt Val = Cst->getValueAPF().bitcastToAPInt();
6980 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
6981 if (Val == 0)
6982 KnownZero.setBit(i);
6983 }
6984 continue;
6985 }
6986
6987 // If the BUILD_VECTOR has more elements than all the (smaller) source
6988 // elements must be UNDEF or ZERO.
6989 if ((V.getNumOperands() % Size) == 0) {
6990 int Scale = V->getNumOperands() / Size;
6991 bool AllUndef = true;
6992 bool AllZero = true;
6993 for (int j = 0; j < Scale; ++j) {
6994 SDValue Op = V.getOperand((M * Scale) + j);
6995 AllUndef &= Op.isUndef();
6996 AllZero &= X86::isZeroNode(Op);
6997 }
6998 if (AllUndef)
6999 KnownUndef.setBit(i);
7000 if (AllZero)
7001 KnownZero.setBit(i);
7002 continue;
7003 }
7004 }
7005}
7006
7007/// Decode a target shuffle mask and inputs and see if any values are
7008/// known to be undef or zero from their inputs.
7009/// Returns true if the target shuffle mask was decoded.
7010/// FIXME: Merge this with computeZeroableShuffleElements?
7011static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
7012 SmallVectorImpl<SDValue> &Ops,
7013 APInt &KnownUndef, APInt &KnownZero) {
7014 bool IsUnary;
7015 if (!isTargetShuffle(N.getOpcode()))
7016 return false;
7017
7018 MVT VT = N.getSimpleValueType();
7019 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
7020 return false;
7021
7022 int Size = Mask.size();
7023 SDValue V1 = Ops[0];
7024 SDValue V2 = IsUnary ? V1 : Ops[1];
7025 KnownUndef = KnownZero = APInt::getNullValue(Size);
7026
7027 V1 = peekThroughBitcasts(V1);
7028 V2 = peekThroughBitcasts(V2);
7029
7030 assert((VT.getSizeInBits() % Size) == 0 &&
7031 "Illegal split of shuffle value type");
7032 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
7033
7034 // Extract known constant input data.
7035 APInt UndefSrcElts[2];
7036 SmallVector<APInt, 32> SrcEltBits[2];
7037 bool IsSrcConstant[2] = {
7038 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
7039 SrcEltBits[0], true, false),
7040 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
7041 SrcEltBits[1], true, false)};
7042
7043 for (int i = 0; i < Size; ++i) {
7044 int M = Mask[i];
7045
7046 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
7047 if (M < 0) {
7048 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
7049 if (SM_SentinelUndef == M)
7050 KnownUndef.setBit(i);
7051 if (SM_SentinelZero == M)
7052 KnownZero.setBit(i);
7053 continue;
7054 }
7055
7056 // Determine shuffle input and normalize the mask.
7057 unsigned SrcIdx = M / Size;
7058 SDValue V = M < Size ? V1 : V2;
7059 M %= Size;
7060
7061 // We are referencing an UNDEF input.
7062 if (V.isUndef()) {
7063 KnownUndef.setBit(i);
7064 continue;
7065 }
7066
7067 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
7068 // TODO: We currently only set UNDEF for integer types - floats use the same
7069 // registers as vectors and many of the scalar folded loads rely on the
7070 // SCALAR_TO_VECTOR pattern.
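// Illustrative (hypothetical) example: for a v4i32 SCALAR_TO_VECTOR only
// element 0 is defined, so a mask entry mapping to lanes 1-3 of it is treated
// as undef (integer types only, per the TODO above), while a mask entry
// mapping to lane 0 of SCALAR_TO_VECTOR(0) is treated as zero.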
7071 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
7072 (Size % V.getValueType().getVectorNumElements()) == 0) {
7073 int Scale = Size / V.getValueType().getVectorNumElements();
7074 int Idx = M / Scale;
7075 if (Idx != 0 && !VT.isFloatingPoint())
7076 KnownUndef.setBit(i);
7077 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
7078 KnownZero.setBit(i);
7079 continue;
7080 }
7081
7082 // Attempt to extract from the source's constant bits.
7083 if (IsSrcConstant[SrcIdx]) {
7084 if (UndefSrcElts[SrcIdx][M])
7085 KnownUndef.setBit(i);
7086 else if (SrcEltBits[SrcIdx][M] == 0)
7087 KnownZero.setBit(i);
7088 }
7089 }
7090
7091 assert(VT.getVectorNumElements() == (unsigned)Size &&
7092 "Different mask size from vector size!");
7093 return true;
7094}
7095
7096// Replace target shuffle mask elements with known undef/zero sentinels.
7097static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
7098 const APInt &KnownUndef,
7099 const APInt &KnownZero,
7100 bool ResolveKnownZeros = true) {
7101 unsigned NumElts = Mask.size();
7102 assert(KnownUndef.getBitWidth() == NumElts &&
Step 36: Assuming the condition is true
Step 37: Assuming the condition is true
Step 38: '?' condition is true
7103 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
7104
7105 for (unsigned i = 0; i != NumElts; ++i) {
Step 39: Assuming 'i' is equal to 'NumElts'
Step 40: Loop condition is false. Execution continues on line 7105
7106 if (KnownUndef[i])
7107 Mask[i] = SM_SentinelUndef;
7108 else if (ResolveKnownZeros && KnownZero[i])
7109 Mask[i] = SM_SentinelZero;
7110 }
7111}
Step 41: Returning without writing to 'Mask.Size'
7112
7113// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
7114static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
7115 APInt &KnownUndef,
7116 APInt &KnownZero) {
7117 unsigned NumElts = Mask.size();
7118 KnownUndef = KnownZero = APInt::getNullValue(NumElts);
7119
7120 for (unsigned i = 0; i != NumElts; ++i) {
7121 int M = Mask[i];
7122 if (SM_SentinelUndef == M)
7123 KnownUndef.setBit(i);
7124 if (SM_SentinelZero == M)
7125 KnownZero.setBit(i);
7126 }
7127}
7128
7129// Forward declaration (for getFauxShuffleMask recursive check).
7130// TODO: Use DemandedElts variant.
7131static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7132 SmallVectorImpl<int> &Mask,
7133 SelectionDAG &DAG, unsigned Depth,
7134 bool ResolveKnownElts);
7135
7136// Attempt to decode ops that could be represented as a shuffle mask.
7137// The decoded shuffle mask may contain a different number of elements to the
7138// destination value type.
7139static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
7140 SmallVectorImpl<int> &Mask,
7141 SmallVectorImpl<SDValue> &Ops,
7142 SelectionDAG &DAG, unsigned Depth,
7143 bool ResolveKnownElts) {
7144 Mask.clear();
7145 Ops.clear();
7146
7147 MVT VT = N.getSimpleValueType();
7148 unsigned NumElts = VT.getVectorNumElements();
7149 unsigned NumSizeInBits = VT.getSizeInBits();
7150 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
7151 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
7152 return false;
7153 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
7154
7155 unsigned Opcode = N.getOpcode();
7156 switch (Opcode) {
7157 case ISD::VECTOR_SHUFFLE: {
7158 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
7159 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
7160 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
7161 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
7162 Ops.push_back(N.getOperand(0));
7163 Ops.push_back(N.getOperand(1));
7164 return true;
7165 }
7166 return false;
7167 }
7168 case ISD::AND:
7169 case X86ISD::ANDNP: {
7170 // Attempt to decode as a per-byte mask.
7171 APInt UndefElts;
7172 SmallVector<APInt, 32> EltBits;
7173 SDValue N0 = N.getOperand(0);
7174 SDValue N1 = N.getOperand(1);
7175 bool IsAndN = (X86ISD::ANDNP == Opcode);
7176 uint64_t ZeroMask = IsAndN ? 255 : 0;
7177 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
7178 return false;
7179 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
7180 if (UndefElts[i]) {
7181 Mask.push_back(SM_SentinelUndef);
7182 continue;
7183 }
7184 const APInt &ByteBits = EltBits[i];
7185 if (ByteBits != 0 && ByteBits != 255)
7186 return false;
7187 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
7188 }
7189 Ops.push_back(IsAndN ? N1 : N0);
7190 return true;
7191 }
7192 case ISD::OR: {
7193 // Inspect each operand at the byte level. We can merge these into a
7194 // blend shuffle mask if for each byte at least one is masked out (zero).
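// Illustrative (hypothetical) example: OR(X, Y) where every odd byte of X is
// known to be zero and every even byte of Y is known to be zero decodes to a
// per-byte blend that takes even bytes from X and odd bytes from Y.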
7195 KnownBits Known0 =
7196 DAG.computeKnownBits(N.getOperand(0), DemandedElts, Depth + 1);
7197 KnownBits Known1 =
7198 DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1);
7199 if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
7200 bool IsByteMask = true;
7201 unsigned NumSizeInBytes = NumSizeInBits / 8;
7202 unsigned NumBytesPerElt = NumBitsPerElt / 8;
7203 APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
7204 APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
7205 for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
7206 unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue();
7207 unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue();
7208 if (LHS == 255 && RHS == 0)
7209 SelectMask.setBit(i);
7210 else if (LHS == 255 && RHS == 255)
7211 ZeroMask.setBit(i);
7212 else if (!(LHS == 0 && RHS == 255))
7213 IsByteMask = false;
7214 }
7215 if (IsByteMask) {
7216 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) {
7217 for (unsigned j = 0; j != NumBytesPerElt; ++j) {
7218 unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0);
7219 int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs));
7220 Mask.push_back(Idx);
7221 }
7222 }
7223 Ops.push_back(N.getOperand(0));
7224 Ops.push_back(N.getOperand(1));
7225 return true;
7226 }
7227 }
7228
7229 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
7230 // is a valid shuffle index.
7231 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
7232 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
7233 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
7234 return false;
7235 SmallVector<int, 64> SrcMask0, SrcMask1;
7236 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
7237 if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
7238 true) ||
7239 !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
7240 true))
7241 return false;
7242 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
7243 SmallVector<int, 64> Mask0, Mask1;
7244 scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
7245 scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
7246 for (size_t i = 0; i != MaskSize; ++i) {
7247 if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
7248 Mask.push_back(SM_SentinelUndef);
7249 else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
7250 Mask.push_back(SM_SentinelZero);
7251 else if (Mask1[i] == SM_SentinelZero)
7252 Mask.push_back(Mask0[i]);
7253 else if (Mask0[i] == SM_SentinelZero)
7254 Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size()));
7255 else
7256 return false;
7257 }
7258 Ops.append(SrcInputs0.begin(), SrcInputs0.end());
7259 Ops.append(SrcInputs1.begin(), SrcInputs1.end());
7260 return true;
7261 }
7262 case ISD::INSERT_SUBVECTOR: {
7263 SDValue Src = N.getOperand(0);
7264 SDValue Sub = N.getOperand(1);
7265 EVT SubVT = Sub.getValueType();
7266 unsigned NumSubElts = SubVT.getVectorNumElements();
7267 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
7268 !N->isOnlyUserOf(Sub.getNode()))
7269 return false;
7270 uint64_t InsertIdx = N.getConstantOperandVal(2);
7271 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
7272 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7273 Sub.getOperand(0).getValueType() == VT &&
7274 isa<ConstantSDNode>(Sub.getOperand(1))) {
7275 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
7276 for (int i = 0; i != (int)NumElts; ++i)
7277 Mask.push_back(i);
7278 for (int i = 0; i != (int)NumSubElts; ++i)
7279 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
7280 Ops.push_back(Src);
7281 Ops.push_back(Sub.getOperand(0));
7282 return true;
7283 }
7284 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
7285 SmallVector<int, 64> SubMask;
7286 SmallVector<SDValue, 2> SubInputs;
7287 if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
7288 SubMask, DAG, Depth + 1, ResolveKnownElts))
7289 return false;
7290 if (SubMask.size() != NumSubElts) {
7291 assert(((SubMask.size() % NumSubElts) == 0 ||
7292 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
7293 if ((NumSubElts % SubMask.size()) == 0) {
7294 int Scale = NumSubElts / SubMask.size();
7295 SmallVector<int,64> ScaledSubMask;
7296 scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask);
7297 SubMask = ScaledSubMask;
7298 } else {
7299 int Scale = SubMask.size() / NumSubElts;
7300 NumSubElts = SubMask.size();
7301 NumElts *= Scale;
7302 InsertIdx *= Scale;
7303 }
7304 }
7305 Ops.push_back(Src);
7306 for (SDValue &SubInput : SubInputs) {
7307 EVT SubSVT = SubInput.getValueType().getScalarType();
7308 EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT,
7309 NumSizeInBits / SubSVT.getSizeInBits());
7310 Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT,
7311 DAG.getUNDEF(AltVT), SubInput,
7312 DAG.getIntPtrConstant(0, SDLoc(N))));
7313 }
7314 for (int i = 0; i != (int)NumElts; ++i)
7315 Mask.push_back(i);
7316 for (int i = 0; i != (int)NumSubElts; ++i) {
7317 int M = SubMask[i];
7318 if (0 <= M) {
7319 int InputIdx = M / NumSubElts;
7320 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
7321 }
7322 Mask[i + InsertIdx] = M;
7323 }
7324 return true;
7325 }
7326 case ISD::SCALAR_TO_VECTOR: {
7327 // Match against a scalar_to_vector of an extract from a vector,
7328 // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
7329 SDValue N0 = N.getOperand(0);
7330 SDValue SrcExtract;
7331
7332 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7333 N0.getOperand(0).getValueType() == VT) ||
7334 (N0.getOpcode() == X86ISD::PEXTRW &&
7335 N0.getOperand(0).getValueType() == MVT::v8i16) ||
7336 (N0.getOpcode() == X86ISD::PEXTRB &&
7337 N0.getOperand(0).getValueType() == MVT::v16i8)) {
7338 SrcExtract = N0;
7339 }
7340
7341 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
7342 return false;
7343
7344 SDValue SrcVec = SrcExtract.getOperand(0);
7345 EVT SrcVT = SrcVec.getValueType();
7346 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7347 unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
7348
7349 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
7350 if (NumSrcElts <= SrcIdx)
7351 return false;
7352
7353 Ops.push_back(SrcVec);
7354 Mask.push_back(SrcIdx);
7355 Mask.append(NumZeros, SM_SentinelZero);
7356 Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
7357 return true;
7358 }
7359 case X86ISD::PINSRB:
7360 case X86ISD::PINSRW: {
7361 SDValue InVec = N.getOperand(0);
7362 SDValue InScl = N.getOperand(1);
7363 SDValue InIndex = N.getOperand(2);
7364 if (!isa<ConstantSDNode>(InIndex) ||
7365 cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
7366 return false;
7367 uint64_t InIdx = N.getConstantOperandVal(2);
7368
7369 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
7370 if (X86::isZeroNode(InScl)) {
7371 Ops.push_back(InVec);
7372 for (unsigned i = 0; i != NumElts; ++i)
7373 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
7374 return true;
7375 }
7376
7377 // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
7378 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
7379 unsigned ExOp =
7380 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
7381 if (InScl.getOpcode() != ExOp)
7382 return false;
7383
7384 SDValue ExVec = InScl.getOperand(0);
7385 SDValue ExIndex = InScl.getOperand(1);
7386 if (!isa<ConstantSDNode>(ExIndex) ||
7387 cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
7388 return false;
7389 uint64_t ExIdx = InScl.getConstantOperandVal(1);
7390
7391 Ops.push_back(InVec);
7392 Ops.push_back(ExVec);
7393 for (unsigned i = 0; i != NumElts; ++i)
7394 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
7395 return true;
7396 }
7397 case X86ISD::PACKSS:
7398 case X86ISD::PACKUS: {
7399 SDValue N0 = N.getOperand(0);
7400 SDValue N1 = N.getOperand(1);
7401 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
7402 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
7403 "Unexpected input value type");
7404
7405 APInt EltsLHS, EltsRHS;
7406 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
7407
7408 // If we know input saturation won't happen we can treat this
7409 // as a truncation shuffle.
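// Illustrative (hypothetical) example: PACKUS(v8i16 A, v8i16 B) where the
// upper 8 bits of every A and B element are known zero behaves like a byte
// truncation of the concatenation (A, B), so createPackShuffleMask can model
// it as a fixed shuffle of the two inputs.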
7410 if (Opcode == X86ISD::PACKSS) {
7411 if ((!N0.isUndef() &&
7412 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
7413 (!N1.isUndef() &&
7414 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
7415 return false;
7416 } else {
7417 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
7418 if ((!N0.isUndef() &&
7419 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
7420 (!N1.isUndef() &&
7421 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
7422 return false;
7423 }
7424
7425 bool IsUnary = (N0 == N1);
7426
7427 Ops.push_back(N0);
7428 if (!IsUnary)
7429 Ops.push_back(N1);
7430
7431 createPackShuffleMask(VT, Mask, IsUnary);
7432 return true;
7433 }
7434 case X86ISD::VSHLI:
7435 case X86ISD::VSRLI: {
7436 uint64_t ShiftVal = N.getConstantOperandVal(1);
7437 // Out of range bit shifts are guaranteed to be zero.
7438 if (NumBitsPerElt <= ShiftVal) {
7439 Mask.append(NumElts, SM_SentinelZero);
7440 return true;
7441 }
7442
7443 // We can only decode 'whole byte' bit shifts as shuffles.
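// Illustrative (hypothetical) example: VSHLI of a v2i64 by 16 bits shifts each
// element left by two bytes, i.e. the byte shuffle
// < Z, Z, 0, 1, 2, 3, 4, 5, Z, Z, 8, 9, 10, 11, 12, 13 > (Z = zero) built below.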
7444 if ((ShiftVal % 8) != 0)
7445 break;
7446
7447 uint64_t ByteShift = ShiftVal / 8;
7448 unsigned NumBytes = NumSizeInBits / 8;
7449 unsigned NumBytesPerElt = NumBitsPerElt / 8;
7450 Ops.push_back(N.getOperand(0));
7451
7452 // Clear mask to all zeros and insert the shifted byte indices.
7453 Mask.append(NumBytes, SM_SentinelZero);
7454
7455 if (X86ISD::VSHLI == Opcode) {
7456 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
7457 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7458 Mask[i + j] = i + j - ByteShift;
7459 } else {
7460 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
7461 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7462 Mask[i + j - ByteShift] = i + j;
7463 }
7464 return true;
7465 }
7466 case X86ISD::VROTLI:
7467 case X86ISD::VROTRI: {
7468 // We can only decode 'whole byte' bit rotates as shuffles.
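// Illustrative (hypothetical) example: VROTLI of a v4i32 by 8 bits rotates each
// element's bytes left by one, i.e. the per-element byte order <3, 0, 1, 2>
// produced by the loop below.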
7469 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
7470 if ((RotateVal % 8) != 0)
7471 return false;
7472 Ops.push_back(N.getOperand(0));
7473 int NumBytesPerElt = NumBitsPerElt / 8;
7474 int Offset = RotateVal / 8;
7475 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
7476 for (int i = 0; i != (int)NumElts; ++i) {
7477 int BaseIdx = i * NumBytesPerElt;
7478 for (int j = 0; j != NumBytesPerElt; ++j) {
7479 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
7480 }
7481 }
7482 return true;
7483 }
7484 case X86ISD::VBROADCAST: {
7485 SDValue Src = N.getOperand(0);
7486 MVT SrcVT = Src.getSimpleValueType();
7487 if (!SrcVT.isVector())
7488 return false;
7489
7490 if (NumSizeInBits != SrcVT.getSizeInBits()) {
7491 assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
7492 "Illegal broadcast type");
7493 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
7494 NumSizeInBits / SrcVT.getScalarSizeInBits());
7495 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
7496 DAG.getUNDEF(SrcVT), Src,
7497 DAG.getIntPtrConstant(0, SDLoc(N)));
7498 }
7499
7500 Ops.push_back(Src);
7501 Mask.append(NumElts, 0);
7502 return true;
7503 }
7504 case ISD::ZERO_EXTEND:
7505 case ISD::ANY_EXTEND:
7506 case ISD::ZERO_EXTEND_VECTOR_INREG:
7507 case ISD::ANY_EXTEND_VECTOR_INREG: {
7508 SDValue Src = N.getOperand(0);
7509 EVT SrcVT = Src.getValueType();
7510
7511 // Extended source must be a simple vector.
7512 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7513 (SrcVT.getScalarSizeInBits() % 8) != 0)
7514 return false;
7515
7516 unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();
7517 bool IsAnyExtend =
7518 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
7519 DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend,
7520 Mask);
7521
7522 if (NumSizeInBits != SrcVT.getSizeInBits()) {
7523 assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
7524 "Illegal zero-extension type");
7525 SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(),
7526 NumSizeInBits / NumSrcBitsPerElt);
7527 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
7528 DAG.getUNDEF(SrcVT), Src,
7529 DAG.getIntPtrConstant(0, SDLoc(N)));
7530 }
7531
7532 Ops.push_back(Src);
7533 return true;
7534 }
7535 }
7536
7537 return false;
7538}
7539
7540/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
7541static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
7542 SmallVectorImpl<int> &Mask) {
7543 int MaskWidth = Mask.size();
7544 SmallVector<SDValue, 16> UsedInputs;
7545 for (int i = 0, e = Inputs.size(); i < e; ++i) {
7546 int lo = UsedInputs.size() * MaskWidth;
7547 int hi = lo + MaskWidth;
7548
7549 // Strip UNDEF input usage.
7550 if (Inputs[i].isUndef())
7551 for (int &M : Mask)
7552 if ((lo <= M) && (M < hi))
7553 M = SM_SentinelUndef;
7554
7555 // Check for unused inputs.
7556 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
7557 for (int &M : Mask)
7558 if (lo <= M)
7559 M -= MaskWidth;
7560 continue;
7561 }
7562
7563 // Check for repeated inputs.
7564 bool IsRepeat = false;
7565 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
7566 if (UsedInputs[j] != Inputs[i])
7567 continue;
7568 for (int &M : Mask)
7569 if (lo <= M)
7570 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
7571 IsRepeat = true;
7572 break;
7573 }
7574 if (IsRepeat)
7575 continue;
7576
7577 UsedInputs.push_back(Inputs[i]);
7578 }
7579 Inputs = UsedInputs;
7580}
7581
7582/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
7583/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
7584/// Returns true if the target shuffle mask was decoded.
7585static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
7586 SmallVectorImpl<SDValue> &Inputs,
7587 SmallVectorImpl<int> &Mask,
7588 APInt &KnownUndef, APInt &KnownZero,
7589 SelectionDAG &DAG, unsigned Depth,
7590 bool ResolveKnownElts) {
7591 EVT VT = Op.getValueType();
7592 if (!VT.isSimple() || !VT.isVector())
Step 15: Calling 'EVT::isSimple'
Step 17: Returning from 'EVT::isSimple'
Step 18: Calling 'EVT::isVector'
Step 24: Returning from 'EVT::isVector'
Step 25: Taking false branch
7593 return false;
7594
7595 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
Step 26: Value assigned to 'OpMask.Size'
Step 27: Assuming the condition is true
Step 28: Taking true branch
7596 if (ResolveKnownElts)
Step 28.1: 'ResolveKnownElts' is false
Step 29: Taking false branch
7597 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
7598 return true;
Step 30: Returning the value 1, which participates in a condition later
7599 }
7600 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
7601 ResolveKnownElts)) {
7602 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
7603 return true;
7604 }
7605 return false;
7606}
7607
7608static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7609 SmallVectorImpl<int> &Mask,
7610 SelectionDAG &DAG, unsigned Depth = 0,
7611 bool ResolveKnownElts = true) {
7612 EVT VT = Op.getValueType();
7613 if (!VT.isSimple() || !VT.isVector())
7614 return false;
7615
7616 APInt KnownUndef, KnownZero;
7617 unsigned NumElts = Op.getValueType().getVectorNumElements();
7618 APInt DemandedElts = APInt::getAllOnesValue(NumElts);
7619 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
7620 KnownZero, DAG, Depth, ResolveKnownElts);
7621}
7622
7623/// Returns the scalar element that will make up the i'th
7624/// element of the result of the vector shuffle.
7625static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
7626 unsigned Depth) {
7627 if (Depth == 6)
7628 return SDValue(); // Limit search depth.
7629
7630 SDValue V = SDValue(N, 0);
7631 EVT VT = V.getValueType();
7632 unsigned Opcode = V.getOpcode();
7633
7634 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
7635 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
7636 int Elt = SV->getMaskElt(Index);
7637
7638 if (Elt < 0)
7639 return DAG.getUNDEF(VT.getVectorElementType());
7640
7641 unsigned NumElems = VT.getVectorNumElements();
7642 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
7643 : SV->getOperand(1);
7644 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
7645 }
7646
7647 // Recurse into target specific vector shuffles to find scalars.
7648 if (isTargetShuffle(Opcode)) {
7649 MVT ShufVT = V.getSimpleValueType();
7650 MVT ShufSVT = ShufVT.getVectorElementType();
7651 int NumElems = (int)ShufVT.getVectorNumElements();
7652 SmallVector<int, 16> ShuffleMask;
7653 SmallVector<SDValue, 16> ShuffleOps;
7654 bool IsUnary;
7655
7656 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
7657 return SDValue();
7658
7659 int Elt = ShuffleMask[Index];
7660 if (Elt == SM_SentinelZero)
7661 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
7662 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
7663 if (Elt == SM_SentinelUndef)
7664 return DAG.getUNDEF(ShufSVT);
7665
7666 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
7667 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
7668 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
7669 Depth+1);
7670 }
7671
7672 // Recurse into insert_subvector base/sub vector to find scalars.
7673 if (Opcode == ISD::INSERT_SUBVECTOR &&
7674 isa<ConstantSDNode>(N->getOperand(2))) {
7675 SDValue Vec = N->getOperand(0);
7676 SDValue Sub = N->getOperand(1);
7677 EVT SubVT = Sub.getValueType();
7678 unsigned NumSubElts = SubVT.getVectorNumElements();
7679 uint64_t SubIdx = N->getConstantOperandVal(2);
7680
7681 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
7682 return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1);
7683 return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1);
7684 }
7685
7686 // Recurse into extract_subvector src vector to find scalars.
7687 if (Opcode == ISD::EXTRACT_SUBVECTOR &&
7688 isa<ConstantSDNode>(N->getOperand(1))) {
7689 SDValue Src = N->getOperand(0);
7690 uint64_t SrcIdx = N->getConstantOperandVal(1);
7691 return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1);
7692 }
7693
7694 // Actual nodes that may contain scalar elements
7695 if (Opcode == ISD::BITCAST) {
7696 V = V.getOperand(0);
7697 EVT SrcVT = V.getValueType();
7698 unsigned NumElems = VT.getVectorNumElements();
7699
7700 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
7701 return SDValue();
7702 }
7703
7704 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
7705 return (Index == 0) ? V.getOperand(0)
7706 : DAG.getUNDEF(VT.getVectorElementType());
7707
7708 if (V.getOpcode() == ISD::BUILD_VECTOR)
7709 return V.getOperand(Index);
7710
7711 return SDValue();
7712}
7713
7714// Use PINSRB/PINSRW/PINSRD to create a build vector.
7715static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
7716 unsigned NumNonZero, unsigned NumZero,
7717 SelectionDAG &DAG,
7718 const X86Subtarget &Subtarget) {
7719 MVT VT = Op.getSimpleValueType();
7720 unsigned NumElts = VT.getVectorNumElements();
7721 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
7722 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
7723 "Illegal vector insertion");
7724
7725 SDLoc dl(Op);
7726 SDValue V;
7727 bool First = true;
7728
7729 for (unsigned i = 0; i < NumElts; ++i) {
7730 bool IsNonZero = (NonZeros & (1 << i)) != 0;
7731 if (!IsNonZero)
7732 continue;
7733
7734 // If the build vector contains zeros or our first insertion is not the
7735 // first index then insert into zero vector to break any register
7736 // dependency else use SCALAR_TO_VECTOR.
7737 if (First) {
7738 First = false;
7739 if (NumZero || 0 != i)
7740 V = getZeroVector(VT, Subtarget, DAG, dl);
7741 else {
7742 assert(0 == i && "Expected insertion into zero-index");
7743 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
7744 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
7745 V = DAG.getBitcast(VT, V);
7746 continue;
7747 }
7748 }
7749 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
7750 DAG.getIntPtrConstant(i, dl));
7751 }
7752
7753 return V;
7754}
7755
7756/// Custom lower build_vector of v16i8.
7757static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
7758 unsigned NumNonZero, unsigned NumZero,
7759 SelectionDAG &DAG,
7760 const X86Subtarget &Subtarget) {
7761 if (NumNonZero > 8 && !Subtarget.hasSSE41())
7762 return SDValue();
7763
7764 // SSE4.1 - use PINSRB to insert each byte directly.
7765 if (Subtarget.hasSSE41())
7766 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
7767 Subtarget);
7768
7769 SDLoc dl(Op);
7770 SDValue V;
7771
7772 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
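// Illustrative (hypothetical) example: the two bytes (lo, hi) of a pair are
// combined as lo | (hi << 8) and inserted as a single 16-bit element via PINSRW.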
7773 for (unsigned i = 0; i < 16; i += 2) {
7774 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
7775 bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0;
7776 if (!ThisIsNonZero && !NextIsNonZero)
7777 continue;
7778
7779 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
7780 SDValue Elt;
7781 if (ThisIsNonZero) {
7782 if (NumZero || NextIsNonZero)
7783 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
7784 else
7785 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
7786 }
7787
7788 if (NextIsNonZero) {
7789 SDValue NextElt = Op.getOperand(i + 1);
7790 if (i == 0 && NumZero)
7791 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
7792 else
7793 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
7794 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
7795 DAG.getConstant(8, dl, MVT::i8));
7796 if (ThisIsNonZero)
7797 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
7798 else
7799 Elt = NextElt;
7800 }
7801
7802 // If our first insertion is not the first index then insert into zero
7803 // vector to break any register dependency else use SCALAR_TO_VECTOR.
7804 if (!V) {
7805 if (i != 0)
7806 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
7807 else {
7808 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
7809 V = DAG.getBitcast(MVT::v8i16, V);
7810 continue;
7811 }
7812 }
7813 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
7814 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
7815 DAG.getIntPtrConstant(i / 2, dl));
7816 }
7817
7818 return DAG.getBitcast(MVT::v16i8, V);
7819}
7820
7821/// Custom lower build_vector of v8i16.
7822static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
7823 unsigned NumNonZero, unsigned NumZero,
7824 SelectionDAG &DAG,
7825 const X86Subtarget &Subtarget) {
7826 if (NumNonZero > 4 && !Subtarget.hasSSE41())
7827 return SDValue();
7828
7829 // Use PINSRW to insert each element directly.
7830 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
7831 Subtarget);
7832}
7833
7834/// Custom lower build_vector of v4i32 or v4f32.
7835static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
7836 const X86Subtarget &Subtarget) {
7837 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
7838 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
7839 // Because we're creating a less complicated build vector here, we may enable
7840 // further folding of the MOVDDUP via shuffle transforms.
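// Illustrative (hypothetical) example: build_vector (a, b, a, b) becomes
// build_vector (a, b, undef, undef), bitcast to v2f64, MOVDDUP to duplicate
// the low 64-bit half, then bitcast back to the original type.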
7841 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
7842 Op.getOperand(0) == Op.getOperand(2) &&
7843 Op.getOperand(1) == Op.getOperand(3) &&
7844 Op.getOperand(0) != Op.getOperand(1)) {
7845 SDLoc DL(Op);
7846 MVT VT = Op.getSimpleValueType();
7847 MVT EltVT = VT.getVectorElementType();
7848 // Create a new build vector with the first 2 elements followed by undef
7849 // padding, bitcast to v2f64, duplicate, and bitcast back.
7850 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7851 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7852 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7853 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7854 return DAG.getBitcast(VT, Dup);
7855 }
7856
7857 // Find all zeroable elements.
7858 std::bitset<4> Zeroable, Undefs;
7859 for (int i = 0; i < 4; ++i) {
7860 SDValue Elt = Op.getOperand(i);
7861 Undefs[i] = Elt.isUndef();
7862 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7863 }
7864 assert(Zeroable.size() - Zeroable.count() > 1 &&
7865 "We expect at least two non-zero elements!");
7866
7867 // We only know how to deal with build_vector nodes where elements are either
7868 // zeroable or extract_vector_elt with constant index.
7869 SDValue FirstNonZero;
7870 unsigned FirstNonZeroIdx;
7871 for (unsigned i = 0; i < 4; ++i) {
7872 if (Zeroable[i])
7873 continue;
7874 SDValue Elt = Op.getOperand(i);
7875 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7876 !isa<ConstantSDNode>(Elt.getOperand(1)))
7877 return SDValue();
7878 // Make sure that this node is extracting from a 128-bit vector.
7879 MVT VT = Elt.getOperand(0).getSimpleValueType();
7880 if (!VT.is128BitVector())
7881 return SDValue();
7882 if (!FirstNonZero.getNode()) {
7883 FirstNonZero = Elt;
7884 FirstNonZeroIdx = i;
7885 }
7886 }
7887
7888 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7889 SDValue V1 = FirstNonZero.getOperand(0);
7890 MVT VT = V1.getSimpleValueType();
7891
7892 // See if this build_vector can be lowered as a blend with zero.
7893 SDValue Elt;
7894 unsigned EltMaskIdx, EltIdx;
7895 int Mask[4];
7896 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7897 if (Zeroable[EltIdx]) {
7898 // The zero vector will be on the right hand side.
7899 Mask[EltIdx] = EltIdx+4;
7900 continue;
7901 }
7902
7903 Elt = Op->getOperand(EltIdx);
7904 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
7905 EltMaskIdx = Elt.getConstantOperandVal(1);
7906 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7907 break;
7908 Mask[EltIdx] = EltIdx;
7909 }
7910
7911 if (EltIdx == 4) {
7912 // Let the shuffle legalizer deal with blend operations.
7913 SDValue VZeroOrUndef = (Zeroable == Undefs)
7914 ? DAG.getUNDEF(VT)
7915 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
7916 if (V1.getSimpleValueType() != VT)
7917 V1 = DAG.getBitcast(VT, V1);
7918 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7919 }
7920
7921 // See if we can lower this build_vector to an INSERTPS.
7922 if (!Subtarget.hasSSE41())
7923 return SDValue();
7924
7925 SDValue V2 = Elt.getOperand(0);
7926 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7927 V1 = SDValue();
7928
7929 bool CanFold = true;
7930 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7931 if (Zeroable[i])
7932 continue;
7933
7934 SDValue Current = Op->getOperand(i);
7935 SDValue SrcVector = Current->getOperand(0);
7936 if (!V1.getNode())
7937 V1 = SrcVector;
7938 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7939 }
7940
7941 if (!CanFold)
7942 return SDValue();
7943
7944 assert(V1.getNode() && "Expected at least two non-zero elements!");
7945 if (V1.getSimpleValueType() != MVT::v4f32)
7946 V1 = DAG.getBitcast(MVT::v4f32, V1);
7947 if (V2.getSimpleValueType() != MVT::v4f32)
7948 V2 = DAG.getBitcast(MVT::v4f32, V2);
7949
7950 // Ok, we can emit an INSERTPS instruction.
7951 unsigned ZMask = Zeroable.to_ulong();
7952
7953 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7954 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7955 SDLoc DL(Op);
7956 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7957 DAG.getIntPtrConstant(InsertPSMask, DL, true));
7958 return DAG.getBitcast(VT, Result);
7959}
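The INSERTPS path above packs three fields into a single 8-bit immediate. As a minimal standalone sketch (not part of the LLVM source; the helper name is hypothetical), the encoding looks like this:

#include <cassert>

// Mirrors "EltMaskIdx << 6 | EltIdx << 4 | ZMask" above: bits [7:6] select the
// source lane, bits [5:4] the destination lane, bits [3:0] the lanes to zero.
unsigned makeInsertPSImm(unsigned SrcLane, unsigned DstLane, unsigned ZeroMask) {
  assert(SrcLane < 4 && DstLane < 4 && ZeroMask <= 0xF && "lanes are 0..3");
  return (SrcLane << 6) | (DstLane << 4) | ZeroMask;
}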
7960
7961/// Return a vector logical shift node.
7962static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7963 SelectionDAG &DAG, const TargetLowering &TLI,
7964 const SDLoc &dl) {
7965 assert(VT.is128BitVector() && "Unknown type for VShift");
7966 MVT ShVT = MVT::v16i8;
7967 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7968 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7969 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7970 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7971 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7972}
7973
7974static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
7975 SelectionDAG &DAG) {
7976
7977 // Check if the scalar load can be widened into a vector load, and if the
7978 // address is "base + cst", see if the cst can be "absorbed" into the
7979 // shuffle mask.
7980 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7981 SDValue Ptr = LD->getBasePtr();
7982 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7983 return SDValue();
7984 EVT PVT = LD->getValueType(0);
7985 if (PVT != MVT::i32 && PVT != MVT::f32)
7986 return SDValue();
7987
7988 int FI = -1;
7989 int64_t Offset = 0;
7990 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7991 FI = FINode->getIndex();
7992 Offset = 0;
7993 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7994 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7995 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7996 Offset = Ptr.getConstantOperandVal(1);
7997 Ptr = Ptr.getOperand(0);
7998 } else {
7999 return SDValue();
8000 }
8001
8002 // FIXME: 256-bit vector instructions don't require a strict alignment,
8003 // improve this code to support it better.
8004 unsigned RequiredAlign = VT.getSizeInBits()/8;
8005 SDValue Chain = LD->getChain();
8006 // Make sure the stack object alignment is at least 16 or 32.
8007 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8008 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
8009 if (MFI.isFixedObjectIndex(FI)) {
8010 // Can't change the alignment. FIXME: It's possible to compute the exact
8011 // stack offset and reference FI + adjust offset instead. If someone
8012 // *really* cares about this, that's the way to implement it.
8013 return SDValue();
8014 } else {
8015 MFI.setObjectAlignment(FI, RequiredAlign);
8016 }
8017 }
8018
8019 // (Offset % 16 or 32) must be a multiple of 4. The address is then
8020 // Ptr + (Offset & ~15).
8021 if (Offset < 0)
8022 return SDValue();
8023 if ((Offset % RequiredAlign) & 3)
8024 return SDValue();
8025 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
8026 if (StartOffset) {
8027 SDLoc DL(Ptr);
8028 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8029 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
8030 }
8031
8032 int EltNo = (Offset - StartOffset) >> 2;
8033 unsigned NumElems = VT.getVectorNumElements();
8034
8035 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
8036 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
8037 LD->getPointerInfo().getWithOffset(StartOffset));
8038
8039 SmallVector<int, 8> Mask(NumElems, EltNo);
8040
8041 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
8042 }
8043
8044 return SDValue();
8045}
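To make the offset arithmetic above concrete, here is a small scalar model (assumed names, not LLVM API) of how the constant offset is split into a new base offset and a splatted element index:

#include <cassert>
#include <cstdint>
#include <utility>

// The widened load starts at Base + StartOffset; the remaining 4-byte offset
// selects which element of that vector load gets splatted.
std::pair<int64_t, int> splitSplatLoadOffset(int64_t Offset,
                                             unsigned RequiredAlign) {
  assert(Offset >= 0 && (Offset % RequiredAlign) % 4 == 0);
  int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
  int EltNo = int((Offset - StartOffset) >> 2); // i32/f32 elements
  return {StartOffset, EltNo};
}
// Example: Offset = 20, RequiredAlign = 16 -> StartOffset = 16, EltNo = 1.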
8046
8047// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
8048static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
8049 if (ISD::isNON_EXTLoad(Elt.getNode())) {
8050 auto *BaseLd = cast<LoadSDNode>(Elt);
8051 if (!BaseLd->isSimple())
8052 return false;
8053 Ld = BaseLd;
8054 ByteOffset = 0;
8055 return true;
8056 }
8057
8058 switch (Elt.getOpcode()) {
8059 case ISD::BITCAST:
8060 case ISD::TRUNCATE:
8061 case ISD::SCALAR_TO_VECTOR:
8062 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
8063 case ISD::SRL:
8064 if (isa<ConstantSDNode>(Elt.getOperand(1))) {
8065 uint64_t Idx = Elt.getConstantOperandVal(1);
8066 if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
8067 ByteOffset += Idx / 8;
8068 return true;
8069 }
8070 }
8071 break;
8072 case ISD::EXTRACT_VECTOR_ELT:
8073 if (isa<ConstantSDNode>(Elt.getOperand(1))) {
8074 SDValue Src = Elt.getOperand(0);
8075 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
8076 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
8077 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
8078 findEltLoadSrc(Src, Ld, ByteOffset)) {
8079 uint64_t Idx = Elt.getConstantOperandVal(1);
8080 ByteOffset += Idx * (SrcSizeInBits / 8);
8081 return true;
8082 }
8083 }
8084 break;
8085 }
8086
8087 return false;
8088}
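The recursion above only has to track how many bytes each wrapper skips past the start of the underlying load. A scalar sketch of the two offset contributions (hypothetical helper names, little-endian layout as assumed by the code):

#include <cstdint>

// A right shift by a constant bit amount skips ShiftAmtBits / 8 bytes;
// extracting vector element EltIdx skips EltIdx * (element size in bytes).
int64_t offsetFromSrl(uint64_t ShiftAmtBits) { return ShiftAmtBits / 8; }
int64_t offsetFromExtract(uint64_t EltIdx, unsigned EltSizeInBits) {
  return EltIdx * (EltSizeInBits / 8);
}
// Example: (srl (load i64 p), 16) reads data starting at byte p + 2, which is
// exactly the "ByteOffset += Idx / 8" step above.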
8089
8090/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
8091/// elements can be replaced by a single large load which has the same value as
8092/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
8093///
8094/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
8095static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
8096 const SDLoc &DL, SelectionDAG &DAG,
8097 const X86Subtarget &Subtarget,
8098 bool isAfterLegalize) {
8099 if ((VT.getScalarSizeInBits() % 8) != 0)
8100 return SDValue();
8101
8102 unsigned NumElems = Elts.size();
8103
8104 int LastLoadedElt = -1;
8105 APInt LoadMask = APInt::getNullValue(NumElems);
8106 APInt ZeroMask = APInt::getNullValue(NumElems);
8107 APInt UndefMask = APInt::getNullValue(NumElems);
8108
8109 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
8110 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
8111
8112 // For each element in the initializer, see if we've found a load, zero or an
8113 // undef.
8114 for (unsigned i = 0; i < NumElems; ++i) {
8115 SDValue Elt = peekThroughBitcasts(Elts[i]);
8116 if (!Elt.getNode())
8117 return SDValue();
8118 if (Elt.isUndef()) {
8119 UndefMask.setBit(i);
8120 continue;
8121 }
8122 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
8123 ZeroMask.setBit(i);
8124 continue;
8125 }
8126
8127 // Each loaded element must be the correct fractional portion of the
8128 // requested vector load.
8129 unsigned EltSizeInBits = Elt.getValueSizeInBits();
8130 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
8131 return SDValue();
8132
8133 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
8134 return SDValue();
8135 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
8136 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
8137 return SDValue();
8138
8139 LoadMask.setBit(i);
8140 LastLoadedElt = i;
8141 }
8142 assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
8143 LoadMask.countPopulation()) == NumElems &&
8144 "Incomplete element masks");
8145
8146 // Handle Special Cases - all undef or undef/zero.
8147 if (UndefMask.countPopulation() == NumElems)
8148 return DAG.getUNDEF(VT);
8149
8150 // FIXME: Should we return this as a BUILD_VECTOR instead?
8151 if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
8152 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
8153 : DAG.getConstantFP(0.0, DL, VT);
8154
8155 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8156 int FirstLoadedElt = LoadMask.countTrailingZeros();
8157 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
8158 EVT EltBaseVT = EltBase.getValueType();
8159 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
8160 "Register/Memory size mismatch");
8161 LoadSDNode *LDBase = Loads[FirstLoadedElt];
8162 assert(LDBase && "Did not find base load for merging consecutive loads");
8163 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
8164 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
8165 int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
8166 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
8167
8168 // TODO: Support offsetting the base load.
8169 if (ByteOffsets[FirstLoadedElt] != 0)
8170 return SDValue();
8171
8172 // Check to see if the element's load is consecutive to the base load
8173 // or offset from a previous (already checked) load.
8174 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
8175 LoadSDNode *Ld = Loads[EltIdx];
8176 int64_t ByteOffset = ByteOffsets[EltIdx];
8177 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
8178 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
8179 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
8180 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
8181 }
8182 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
8183 EltIdx - FirstLoadedElt);
8184 };
8185
8186 // Consecutive loads can contain UNDEFs but not ZERO elements.
8187 // Consecutive loads with UNDEF and ZERO elements require an additional
8188 // shuffle stage to clear the ZERO elements.
8189 bool IsConsecutiveLoad = true;
8190 bool IsConsecutiveLoadWithZeros = true;
8191 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
8192 if (LoadMask[i]) {
8193 if (!CheckConsecutiveLoad(LDBase, i)) {
8194 IsConsecutiveLoad = false;
8195 IsConsecutiveLoadWithZeros = false;
8196 break;
8197 }
8198 } else if (ZeroMask[i]) {
8199 IsConsecutiveLoad = false;
8200 }
8201 }
8202
8203 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
8204 auto MMOFlags = LDBase->getMemOperand()->getFlags();
8205 assert(LDBase->isSimple() &&
8206 "Cannot merge volatile or atomic loads.");
8207 SDValue NewLd =
8208 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
8209 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
8210 for (auto *LD : Loads)
8211 if (LD)
8212 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
8213 return NewLd;
8214 };
8215
8216 // Check if the base load is entirely dereferenceable.
8217 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
8218 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
8219
8220 // LOAD - all consecutive load/undefs (must start/end with a load or be
8221 // entirely dereferenceable). If we have found an entire vector of loads and
8222 // undefs, then return a large load of the entire vector width starting at the
8223 // base pointer. If the vector contains zeros, then attempt to shuffle those
8224 // elements.
8225 if (FirstLoadedElt == 0 &&
8226 (LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) &&
8227 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
8228 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
8229 return SDValue();
8230
8231 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
8232 // will lower to regular temporal loads and use the cache.
8233 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
8234 VT.is256BitVector() && !Subtarget.hasInt256())
8235 return SDValue();
8236
8237 if (NumElems == 1)
8238 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
8239
8240 if (!ZeroMask)
8241 return CreateLoad(VT, LDBase);
8242
8243 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
8244 // vector and a zero vector to clear out the zero elements.
8245 if (!isAfterLegalize && VT.isVector()) {
8246 unsigned NumMaskElts = VT.getVectorNumElements();
8247 if ((NumMaskElts % NumElems) == 0) {
8248 unsigned Scale = NumMaskElts / NumElems;
8249 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
8250 for (unsigned i = 0; i < NumElems; ++i) {
8251 if (UndefMask[i])
8252 continue;
8253 int Offset = ZeroMask[i] ? NumMaskElts : 0;
8254 for (unsigned j = 0; j != Scale; ++j)
8255 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
8256 }
8257 SDValue V = CreateLoad(VT, LDBase);
8258 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
8259 : DAG.getConstantFP(0.0, DL, VT);
8260 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
8261 }
8262 }
8263 }
8264
8265 // If the upper half of a ymm/zmm load is undef then just load the lower half.
8266 if (VT.is256BitVector() || VT.is512BitVector()) {
8267 unsigned HalfNumElems = NumElems / 2;
8268 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
8269 EVT HalfVT =
8270 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
8271 SDValue HalfLD =
8272 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
8273 DAG, Subtarget, isAfterLegalize);
8274 if (HalfLD)
8275 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
8276 HalfLD, DAG.getIntPtrConstant(0, DL));
8277 }
8278 }
8279
8280 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
8281 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
8282 (LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
8283 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
8284 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
8285 : MVT::getIntegerVT(LoadSizeInBits);
8286 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
8287 // Allow v4f32 on SSE1 only targets.
8288 // FIXME: Add more isel patterns so we can just use VT directly.
8289 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
8290 VecVT = MVT::v4f32;
8291 if (TLI.isTypeLegal(VecVT)) {
8292 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
8293 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
8294 SDValue ResNode =
8295 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
8296 LDBase->getPointerInfo(),
8297 LDBase->getAlignment(),
8298 MachineMemOperand::MOLoad);
8299 for (auto *LD : Loads)
8300 if (LD)
8301 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
8302 return DAG.getBitcast(VT, ResNode);
8303 }
8304 }
8305
8306 // BROADCAST - match the smallest possible repetition pattern, load that
8307 // scalar/subvector element and then broadcast to the entire vector.
8308 if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
8309 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
8310 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
8311 unsigned RepeatSize = SubElems * BaseSizeInBits;
8312 unsigned ScalarSize = std::min(RepeatSize, 64u);
8313 if (!Subtarget.hasAVX2() && ScalarSize < 32)
8314 continue;
8315
8316 bool Match = true;
8317 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
8318 for (unsigned i = 0; i != NumElems && Match; ++i) {
8319 if (!LoadMask[i])
8320 continue;
8321 SDValue Elt = peekThroughBitcasts(Elts[i]);
8322 if (RepeatedLoads[i % SubElems].isUndef())
8323 RepeatedLoads[i % SubElems] = Elt;
8324 else
8325 Match &= (RepeatedLoads[i % SubElems] == Elt);
8326 }
8327
8328 // We must have loads at both ends of the repetition.
8329 Match &= !RepeatedLoads.front().isUndef();
8330 Match &= !RepeatedLoads.back().isUndef();
8331 if (!Match)
8332 continue;
8333
8334 EVT RepeatVT =
8335 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
8336 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
8337 : EVT::getFloatingPointVT(ScalarSize);
8338 if (RepeatSize > ScalarSize)
8339 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
8340 RepeatSize / ScalarSize);
8341 EVT BroadcastVT =
8342 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
8343 VT.getSizeInBits() / ScalarSize);
8344 if (TLI.isTypeLegal(BroadcastVT)) {
8345 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
8346 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
8347 unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
8348 : X86ISD::VBROADCAST;
8349 SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
8350 return DAG.getBitcast(VT, Broadcast);
8351 }
8352 }
8353 }
8354 }
8355
8356 return SDValue();
8357}
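One step in the function above that is easy to miss is the ClearMask used when consecutive loads are interleaved with zero elements. A standalone sketch of that mask construction (plain std::vector instead of SelectionDAG types):

#include <vector>

// Each source element expands to Scale shuffle slots; slots for zeroable
// elements index into the all-zeros operand (offset by NumMaskElts), and
// undef elements stay at the -1 "don't care" sentinel.
std::vector<int> buildClearMask(const std::vector<bool> &ZeroMask,
                                const std::vector<bool> &UndefMask,
                                unsigned Scale) {
  unsigned NumElems = (unsigned)ZeroMask.size();
  unsigned NumMaskElts = NumElems * Scale;
  std::vector<int> ClearMask(NumMaskElts, -1);
  for (unsigned i = 0; i < NumElems; ++i) {
    if (UndefMask[i])
      continue;
    int Offset = ZeroMask[i] ? (int)NumMaskElts : 0;
    for (unsigned j = 0; j != Scale; ++j)
      ClearMask[i * Scale + j] = (int)(i * Scale + j) + Offset;
  }
  return ClearMask;
}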
8358
8359// Combine a vector op (shuffles etc.) that is equal to build_vector load1,
8360// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
8361// are consecutive, non-overlapping, and in the right order.
8362static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL,
8363 SelectionDAG &DAG,
8364 const X86Subtarget &Subtarget,
8365 bool isAfterLegalize) {
8366 SmallVector<SDValue, 64> Elts;
8367 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
8368 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
8369 Elts.push_back(Elt);
8370 continue;
8371 }
8372 return SDValue();
8373 }
8374 assert(Elts.size() == VT.getVectorNumElements());
8375 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
8376 isAfterLegalize);
8377}
8378
8379static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
8380 unsigned SplatBitSize, LLVMContext &C) {
8381 unsigned ScalarSize = VT.getScalarSizeInBits();
8382 unsigned NumElm = SplatBitSize / ScalarSize;
8383
8384 SmallVector<Constant *, 32> ConstantVec;
8385 for (unsigned i = 0; i < NumElm; i++) {
8386 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
8387 Constant *Const;
8388 if (VT.isFloatingPoint()) {
8389 if (ScalarSize == 32) {
8390 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
8391 } else {
8392 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
8393 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
8394 }
8395 } else
8396 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
8397 ConstantVec.push_back(Const);
8398 }
8399 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
8400}
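For the common case where the repeated pattern fits in 64 bits, the splitting done above reduces to shifts and masks. A simplified model (64-bit only, hypothetical helper) of how the splat value is cut into per-element constants:

#include <cstdint>
#include <vector>

// Element i is the ScalarSize-bit field starting at bit ScalarSize * i,
// mirroring SplatValue.extractBits(ScalarSize, ScalarSize * i) above.
std::vector<uint64_t> splitSplat(uint64_t SplatValue, unsigned SplatBitSize,
                                 unsigned ScalarSize) {
  std::vector<uint64_t> Elts;
  uint64_t Mask = ScalarSize == 64 ? ~0ULL : ((1ULL << ScalarSize) - 1);
  for (unsigned i = 0; i < SplatBitSize / ScalarSize; ++i)
    Elts.push_back((SplatValue >> (ScalarSize * i)) & Mask);
  return Elts;
}
// Example: SplatValue = 0x00010002, SplatBitSize = 32, ScalarSize = 16
// yields elements {0x0002, 0x0001}.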
8401
8402static bool isFoldableUseOfShuffle(SDNode *N) {
8403 for (auto *U : N->uses()) {
8404 unsigned Opc = U->getOpcode();
8405 // VPERMV/VPERMV3 shuffles can never fold their index operands.
8406 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
8407 return false;
8408 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
8409 return false;
8410 if (isTargetShuffle(Opc))
8411 return true;
8412 if (Opc == ISD::BITCAST) // Ignore bitcasts
8413 return isFoldableUseOfShuffle(U);
8414 if (N->hasOneUse())
8415 return true;
8416 }
8417 return false;
8418}
8419
8420// Check if the current node of a build vector is a zero-extended vector.
8421// If so, return the extended value.
8422// For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
8423// NumElt - return the number of zero-extended identical values.
8424// EltType - return the type of the value including the zero extend.
8425static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
8426 unsigned &NumElt, MVT &EltType) {
8427 SDValue ExtValue = Op->getOperand(0);
8428 unsigned NumElts = Op->getNumOperands();
8429 unsigned Delta = NumElts;
8430
8431 for (unsigned i = 1; i < NumElts; i++) {
8432 if (Op->getOperand(i) == ExtValue) {
8433 Delta = i;
8434 break;
8435 }
8436 if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
8437 return SDValue();
8438 }
8439 if (!isPowerOf2_32(Delta) || Delta == 1)
8440 return SDValue();
8441
8442 for (unsigned i = Delta; i < NumElts; i++) {
8443 if (i % Delta == 0) {
8444 if (Op->getOperand(i) != ExtValue)
8445 return SDValue();
8446 } else if (!(isNullConstant(Op->getOperand(i)) ||
8447 Op->getOperand(i).isUndef()))
8448 return SDValue();
8449 }
8450 unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
8451 unsigned ExtVTSize = EltSize * Delta;
8452 EltType = MVT::getIntegerVT(ExtVTSize);
8453 NumElt = NumElts / Delta;
8454 return ExtValue;
8455}
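A short sketch of the result computation above, assuming the v16i8 example from the comment (values are illustrative only):

// For (a,0,0,0, a,0,0,0, a,0,0,0, a,0,0,0): Delta = 4, so the zero-extended
// element type is i(8 * 4) = i32 and there are 16 / 4 = 4 such elements.
struct ZextSplatInfo { unsigned ExtBits; unsigned NumElt; };
ZextSplatInfo zextSplatInfo(unsigned EltSizeBits, unsigned NumElts,
                            unsigned Delta) {
  return {EltSizeBits * Delta, NumElts / Delta};
}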
8456
8457/// Attempt to use the vbroadcast instruction to generate a splat value
8458/// from a splat BUILD_VECTOR which uses:
8459/// a. A single scalar load, or a constant.
8460/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
8461///
8462/// The VBROADCAST node is returned when a pattern is found,
8463/// or SDValue() otherwise.
8464static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
8465 const X86Subtarget &Subtarget,
8466 SelectionDAG &DAG) {
8467 // VBROADCAST requires AVX.
8468 // TODO: Splats could be generated for non-AVX CPUs using SSE
8469 // instructions, but there's less potential gain for only 128-bit vectors.
8470 if (!Subtarget.hasAVX())
8471 return SDValue();
8472
8473 MVT VT = BVOp->getSimpleValueType(0);
8474 SDLoc dl(BVOp);
8475
8476 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
8477 "Unsupported vector type for broadcast.");
8478
8479 BitVector UndefElements;
8480 SDValue Ld = BVOp->getSplatValue(&UndefElements);
8481
8482 // Attempt to use VBROADCASTM
8483 // From this pattern:
8484 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
8485 // b. t1 = (build_vector t0 t0)
8486 //
8487 // Create (VBROADCASTM v2i1 X)
8488 if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
8489 MVT EltType = VT.getScalarType();
8490 unsigned NumElts = VT.getVectorNumElements();
8491 SDValue BOperand;
8492 SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
8493 if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
8494 (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
8495 Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
8496 if (ZeroExtended)
8497 BOperand = ZeroExtended.getOperand(0);
8498 else
8499 BOperand = Ld.getOperand(0).getOperand(0);
8500 MVT MaskVT = BOperand.getSimpleValueType();
8501 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
8502 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
8503 SDValue Brdcst =
8504 DAG.getNode(X86ISD::VBROADCASTM, dl,
8505 MVT::getVectorVT(EltType, NumElts), BOperand);
8506 return DAG.getBitcast(VT, Brdcst);
8507 }
8508 }
8509 }
8510
8511 unsigned NumElts = VT.getVectorNumElements();
8512 unsigned NumUndefElts = UndefElements.count();
8513 if (!Ld || (NumElts - NumUndefElts) <= 1) {
8514 APInt SplatValue, Undef;
8515 unsigned SplatBitSize;
8516 bool HasUndef;
8517 // Check if this is a repeated constant pattern suitable for broadcasting.
8518 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
8519 SplatBitSize > VT.getScalarSizeInBits() &&
8520 SplatBitSize < VT.getSizeInBits()) {
8521 // Avoid replacing with broadcast when it's a use of a shuffle
8522 // instruction to preserve the present custom lowering of shuffles.
8523 if (isFoldableUseOfShuffle(BVOp))
8524 return SDValue();
8525 // Replace BUILD_VECTOR with a broadcast of the repeated constants.
8526 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8527 LLVMContext *Ctx = DAG.getContext();
8528 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
8529 if (Subtarget.hasAVX()) {
8530 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
8531 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
8532 // Splatted value can fit in one INTEGER constant in constant pool.
8533 // Load the constant and broadcast it.
8534 MVT CVT = MVT::getIntegerVT(SplatBitSize);
8535 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
8536 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
8537 SDValue CP = DAG.getConstantPool(C, PVT);
8538 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
8539
8540 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
8541 Ld = DAG.getLoad(
8542 CVT, dl, DAG.getEntryNode(), CP,
8543 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
8544 Alignment);
8545 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
8546 MVT::getVectorVT(CVT, Repeat), Ld);
8547 return DAG.getBitcast(VT, Brdcst);
8548 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
8549 // Splatted value can fit in one FLOAT constant in constant pool.
8550 // Load the constant and broadcast it.
8551 // AVX has support for 32- and 64-bit broadcasts for floats only.
8552 // There is no 64-bit integer broadcast on a 32-bit subtarget.
8553 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
8554 // Lower the splat via APFloat directly, to avoid any conversion.
8555 Constant *C =
8556 SplatBitSize == 32
8557 ? ConstantFP::get(*Ctx,
8558 APFloat(APFloat::IEEEsingle(), SplatValue))
8559 : ConstantFP::get(*Ctx,
8560 APFloat(APFloat::IEEEdouble(), SplatValue));
8561 SDValue CP = DAG.getConstantPool(C, PVT);
8562 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
8563
8564 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
8565 Ld = DAG.getLoad(
8566 CVT, dl, DAG.getEntryNode(), CP,
8567 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
8568 Alignment);
8569 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
8570 MVT::getVectorVT(CVT, Repeat), Ld);
8571 return DAG.getBitcast(VT, Brdcst);
8572 } else if (SplatBitSize > 64) {
8573 // Load the vector of constants and broadcast it.
8574 MVT CVT = VT.getScalarType();
8575 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
8576 *Ctx);
8577 SDValue VCP = DAG.getConstantPool(VecC, PVT);
8578 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
8579 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
8580 Ld = DAG.getLoad(
8581 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
8582 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
8583 Alignment);
8584 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
8585 return DAG.getBitcast(VT, Brdcst);
8586 }
8587 }
8588 }
8589
8590 // If we are moving a scalar into a vector (Ld must be set and all elements
8591 // but 1 are undef) and that operation is not obviously supported by
8592 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
8593 // That's better than general shuffling and may eliminate a load to GPR and
8594 // move from scalar to vector register.
8595 if (!Ld || NumElts - NumUndefElts != 1)
8596 return SDValue();
8597 unsigned ScalarSize = Ld.getValueSizeInBits();
8598 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
8599 return SDValue();
8600 }
8601
8602 bool ConstSplatVal =
8603 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
8604
8605 // Make sure that all of the users of a non-constant load are from the
8606 // BUILD_VECTOR node.
8607 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
8608 return SDValue();
8609
8610 unsigned ScalarSize = Ld.getValueSizeInBits();
8611 bool IsGE256 = (VT.getSizeInBits() >= 256);
8612
8613 // When optimizing for size, generate up to 5 extra bytes for a broadcast
8614 // instruction to save 8 or more bytes of constant pool data.
8615 // TODO: If multiple splats are generated to load the same constant,
8616 // it may be detrimental to overall size. There needs to be a way to detect
8617 // that condition to know if this is truly a size win.
8618 bool OptForSize = DAG.shouldOptForSize();
8619
8620 // Handle broadcasting a single constant scalar from the constant pool
8621 // into a vector.
8622 // On Sandybridge (no AVX2), it is still better to load a constant vector
8623 // from the constant pool and not to broadcast it from a scalar.
8624 // But override that restriction when optimizing for size.
8625 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
8626 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
8627 EVT CVT = Ld.getValueType();
8628 assert(!CVT.isVector() && "Must not broadcast a vector type");
8629
8630 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
8631 // For size optimization, also splat v2f64 and v2i64, and for size opt
8632 // with AVX2, also splat i8 and i16.
8633 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
8634 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
8635 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
8636 const Constant *C = nullptr;
8637 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
8638 C = CI->getConstantIntValue();
8639 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
8640 C = CF->getConstantFPValue();
8641
8642 assert(C && "Invalid constant type");
8643
8644 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8645 SDValue CP =
8646 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
8647 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
8648 Ld = DAG.getLoad(
8649 CVT, dl, DAG.getEntryNode(), CP,
8650 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
8651 Alignment);
8652
8653 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8654 }
8655 }
8656
8657 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
8658
8659 // Handle AVX2 in-register broadcasts.
8660 if (!IsLoad && Subtarget.hasInt256() &&
8661 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
8662 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8663
8664 // The scalar source must be a normal load.
8665 if (!IsLoad)
8666 return SDValue();
8667
8668 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
8669 (Subtarget.hasVLX() && ScalarSize == 64))
8670 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8671
8672 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
8673 // match double, since there is no vbroadcastsd xmm instruction.
8674 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
8675 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
8676 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8677 }
8678
8679 // Unsupported broadcast.
8680 return SDValue();
8681}
8682
8683/// For an EXTRACT_VECTOR_ELT with a constant index return the real
8684/// underlying vector and index.
8685///
8686/// Modifies \p ExtractedFromVec to the real vector and returns the real
8687/// index.
8688static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
8689 SDValue ExtIdx) {
8690 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
8691 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
8692 return Idx;
8693
8694 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
8695 // lowered this:
8696 // (extract_vector_elt (v8f32 %1), Constant<6>)
8697 // to:
8698 // (extract_vector_elt (vector_shuffle<2,u,u,u>
8699 // (extract_subvector (v8f32 %0), Constant<4>),
8700 // undef)
8701 // Constant<0>)
8702 // In this case the vector is the extract_subvector expression and the index
8703 // is 2, as specified by the shuffle.
8704 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
8705 SDValue ShuffleVec = SVOp->getOperand(0);
8706 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
8707 assert(ShuffleVecVT.getVectorElementType() ==
8708 ExtractedFromVec.getSimpleValueType().getVectorElementType());
8709
8710 int ShuffleIdx = SVOp->getMaskElt(Idx);
8711 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
8712 ExtractedFromVec = ShuffleVec;
8713 return ShuffleIdx;
8714 }
8715 return Idx;
8716}
8717
8718static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
8719 MVT VT = Op.getSimpleValueType();
8720
8721 // Skip if insert_vec_elt is not supported.
8722 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8723 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
8724 return SDValue();
8725
8726 SDLoc DL(Op);
8727 unsigned NumElems = Op.getNumOperands();
8728
8729 SDValue VecIn1;
8730 SDValue VecIn2;
8731 SmallVector<unsigned, 4> InsertIndices;
8732 SmallVector<int, 8> Mask(NumElems, -1);
8733
8734 for (unsigned i = 0; i != NumElems; ++i) {
8735 unsigned Opc = Op.getOperand(i).getOpcode();
8736
8737 if (Opc == ISD::UNDEF)
8738 continue;
8739
8740 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
8741 // Quit if more than 1 element needs inserting.
8742 if (InsertIndices.size() > 1)
8743 return SDValue();
8744
8745 InsertIndices.push_back(i);
8746 continue;
8747 }
8748
8749 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
8750 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
8751
8752 // Quit if non-constant index.
8753 if (!isa<ConstantSDNode>(ExtIdx))
8754 return SDValue();
8755 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
8756
8757 // Quit if extracted from vector of different type.
8758 if (ExtractedFromVec.getValueType() != VT)
8759 return SDValue();
8760
8761 if (!VecIn1.getNode())
8762 VecIn1 = ExtractedFromVec;
8763 else if (VecIn1 != ExtractedFromVec) {
8764 if (!VecIn2.getNode())
8765 VecIn2 = ExtractedFromVec;
8766 else if (VecIn2 != ExtractedFromVec)
8767 // Quit if more than 2 vectors to shuffle
8768 return SDValue();
8769 }
8770
8771 if (ExtractedFromVec == VecIn1)
8772 Mask[i] = Idx;
8773 else if (ExtractedFromVec == VecIn2)
8774 Mask[i] = Idx + NumElems;
8775 }
8776
8777 if (!VecIn1.getNode())
8778 return SDValue();
8779
8780 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
8781 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
8782
8783 for (unsigned Idx : InsertIndices)
8784 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
8785 DAG.getIntPtrConstant(Idx, DL));
8786
8787 return NV;
8788}
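As an illustration of the mask convention used above (hypothetical lane values, comment only):

// For a 4-element build_vector taking {V1[2], V1[0], V2[1], <non-extract>},
// the shuffle mask becomes {2, 0, 1 + 4, -1}: lanes from VecIn1 keep their
// index, lanes from VecIn2 are offset by NumElems, and each non-extract
// element is added with an insert_vector_elt node afterwards.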
8789
8790// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
8791static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
8792 const X86Subtarget &Subtarget) {
8793
8794 MVT VT = Op.getSimpleValueType();
8795 assert((VT.getVectorElementType() == MVT::i1) &&
8796 "Unexpected type in LowerBUILD_VECTORvXi1!");
8797
8798 SDLoc dl(Op);
8799 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8800 ISD::isBuildVectorAllOnes(Op.getNode()))
8801 return Op;
8802
8803 uint64_t Immediate = 0;
8804 SmallVector<unsigned, 16> NonConstIdx;
8805 bool IsSplat = true;
8806 bool HasConstElts = false;
8807 int SplatIdx = -1;
8808 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8809 SDValue In = Op.getOperand(idx);
8810 if (In.isUndef())
8811 continue;
8812 if (!isa<ConstantSDNode>(In))
8813 NonConstIdx.push_back(idx);
8814 else {
8815 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
8816 HasConstElts = true;
8817 }
8818 if (SplatIdx < 0)
8819 SplatIdx = idx;
8820 else if (In != Op.getOperand(SplatIdx))
8821 IsSplat = false;
8822 }
8823
8824 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
8825 if (IsSplat) {
8826 // The build_vector allows the scalar element to be larger than the vector
8827 // element type. We need to mask it to use as a condition unless we know
8828 // the upper bits are zero.
8829 // FIXME: Use computeKnownBits instead of checking specific opcode?
8830 SDValue Cond = Op.getOperand(SplatIdx);
8831 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8832 if (Cond.getOpcode() != ISD::SETCC)
8833 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8834 DAG.getConstant(1, dl, MVT::i8));
8835
8836 // Perform the select in the scalar domain so we can use cmov.
8837 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8838 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8839 DAG.getAllOnesConstant(dl, MVT::i32),
8840 DAG.getConstant(0, dl, MVT::i32));
8841 Select = DAG.getBitcast(MVT::v32i1, Select);
8842 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8843 } else {
8844 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8845 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8846 DAG.getAllOnesConstant(dl, ImmVT),
8847 DAG.getConstant(0, dl, ImmVT));
8848 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8849 Select = DAG.getBitcast(VecVT, Select);
8850 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8851 DAG.getIntPtrConstant(0, dl));
8852 }
8853 }
8854
8855 // insert elements one by one
8856 SDValue DstVec;
8857 if (HasConstElts) {
8858 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8859 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8860 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8861 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8862 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8863 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8864 } else {
8865 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8866 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8867 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8868 DstVec = DAG.getBitcast(VecVT, Imm);
8869 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8870 DAG.getIntPtrConstant(0, dl));
8871 }
8872 } else
8873 DstVec = DAG.getUNDEF(VT);
8874
8875 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
8876 unsigned InsertIdx = NonConstIdx[i];
8877 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8878 Op.getOperand(InsertIdx),
8879 DAG.getIntPtrConstant(InsertIdx, dl));
8880 }
8881 return DstVec;
8882}
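The constant part of the mask vector above is accumulated bit by bit into an integer immediate. A standalone sketch (assumed representation: -1 marks a non-constant element):

#include <cstdint>
#include <vector>

// Each constant i1 element contributes one bit at its index; non-constant
// elements are skipped here and inserted individually afterwards.
uint64_t packMaskImmediate(const std::vector<int> &Elts) {
  uint64_t Immediate = 0;
  for (unsigned idx = 0; idx < Elts.size(); ++idx)
    if (Elts[idx] >= 0)
      Immediate |= uint64_t(Elts[idx] & 0x1) << idx;
  return Immediate;
}
// Example: {1, 0, -1, 1} packs to 0b1001; element 2 is inserted separately.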
8883
8884/// This is a helper function of LowerToHorizontalOp().
8885/// This function checks that the build_vector \p N in input implements a
8886/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8887/// may not match the layout of an x86 256-bit horizontal instruction.
8888/// In other words, if this returns true, then some extraction/insertion will
8889/// be required to produce a valid horizontal instruction.
8890///
8891/// Parameter \p Opcode defines the kind of horizontal operation to match.
8892/// For example, if \p Opcode is equal to ISD::ADD, then this function
8893/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8894/// is equal to ISD::SUB, then this function checks if this is a horizontal
8895/// arithmetic sub.
8896///
8897/// This function only analyzes elements of \p N whose indices are
8898/// in range [BaseIdx, LastIdx).
8899///
8900/// TODO: This function was originally used to match both real and fake partial
8901/// horizontal operations, but the index-matching logic is incorrect for that.
8902/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8903/// code because it is only used for partial h-op matching now?
8904static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8905 SelectionDAG &DAG,
8906 unsigned BaseIdx, unsigned LastIdx,
8907 SDValue &V0, SDValue &V1) {
8908 EVT VT = N->getValueType(0);
8909 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8910 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8911 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8912 "Invalid Vector in input!");
8913
8914 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8915 bool CanFold = true;
8916 unsigned ExpectedVExtractIdx = BaseIdx;
8917 unsigned NumElts = LastIdx - BaseIdx;
8918 V0 = DAG.getUNDEF(VT);
8919 V1 = DAG.getUNDEF(VT);
8920
8921 // Check if N implements a horizontal binop.
8922 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8923 SDValue Op = N->getOperand(i + BaseIdx);
8924
8925 // Skip UNDEFs.
8926 if (Op->isUndef()) {
8927 // Update the expected vector extract index.
8928 if (i * 2 == NumElts)
8929 ExpectedVExtractIdx = BaseIdx;
8930 ExpectedVExtractIdx += 2;
8931 continue;
8932 }
8933
8934 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8935
8936 if (!CanFold)
8937 break;
8938
8939 SDValue Op0 = Op.getOperand(0);
8940 SDValue Op1 = Op.getOperand(1);
8941
8942 // Try to match the following pattern:
8943 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8944 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8945 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8946 Op0.getOperand(0) == Op1.getOperand(0) &&
8947 isa<ConstantSDNode>(Op0.getOperand(1)) &&
8948 isa<ConstantSDNode>(Op1.getOperand(1)));
8949 if (!CanFold)
8950 break;
8951
8952 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
8953 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
8954
8955 if (i * 2 < NumElts) {
8956 if (V0.isUndef()) {
8957 V0 = Op0.getOperand(0);
8958 if (V0.getValueType() != VT)
8959 return false;
8960 }
8961 } else {
8962 if (V1.isUndef()) {
8963 V1 = Op0.getOperand(0);
8964 if (V1.getValueType() != VT)
8965 return false;
8966 }
8967 if (i * 2 == NumElts)
8968 ExpectedVExtractIdx = BaseIdx;
8969 }
8970
8971 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8972 if (I0 == ExpectedVExtractIdx)
8973 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8974 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8975 // Try to match the following dag sequence:
8976 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8977 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8978 } else
8979 CanFold = false;
8980
8981 ExpectedVExtractIdx += 2;
8982 }
8983
8984 return CanFold;
8985}
8986
8987/// Emit a sequence of two 128-bit horizontal add/sub followed by
8988/// a concat_vector.
8989///
8990/// This is a helper function of LowerToHorizontalOp().
8991/// This function expects two 256-bit vectors called V0 and V1.
8992/// At first, each vector is split into two separate 128-bit vectors.
8993/// Then, the resulting 128-bit vectors are used to implement two
8994/// horizontal binary operations.
8995///
8996/// The kind of horizontal binary operation is defined by \p X86Opcode.
8997///
8998/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8999/// the two new horizontal binops.
9000/// When Mode is set, the first horizontal binop dag node would take as input
9001/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
9002/// horizontal binop dag node would take as input the lower 128-bit of V1
9003/// and the upper 128-bit of V1.
9004/// Example:
9005/// HADD V0_LO, V0_HI
9006/// HADD V1_LO, V1_HI
9007///
9008/// Otherwise, the first horizontal binop dag node takes as input the lower
9009/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
9010/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
9011/// Example:
9012/// HADD V0_LO, V1_LO
9013/// HADD V0_HI, V1_HI
9014///
9015/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
9016/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
9017/// the upper 128-bits of the result.
9018static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
9019 const SDLoc &DL, SelectionDAG &DAG,
9020 unsigned X86Opcode, bool Mode,
9021 bool isUndefLO, bool isUndefHI) {
9022 MVT VT = V0.getSimpleValueType();
9023 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
9024 "Invalid nodes in input!");
9025
9026 unsigned NumElts = VT.getVectorNumElements();
9027 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
9028 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
9029 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
9030 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
9031 MVT NewVT = V0_LO.getSimpleValueType();
9032
9033 SDValue LO = DAG.getUNDEF(NewVT);
9034 SDValue HI = DAG.getUNDEF(NewVT);
9035
9036 if (Mode) {
9037 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9038 if (!isUndefLO && !V0->isUndef())
9039 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
9040 if (!isUndefHI && !V1->isUndef())
9041 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
9042 } else {
9043 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9044 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
9045 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
9046
9047 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
9048 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
9049 }
9050
9051 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
9052}
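A comment-only illustration of the two splitting modes implemented above (pseudo-operands, not real SDValues):

// Mode == true :  LO = HOP(V0_LO, V0_HI)   HI = HOP(V1_LO, V1_HI)
// Mode == false:  LO = HOP(V0_LO, V1_LO)   HI = HOP(V0_HI, V1_HI)
// The result is concat_vectors(LO, HI), with UNDEF kept in whichever half has
// its isUndefLO/isUndefHI flag set.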
9053
9054/// Returns true iff \p BV builds a vector with the result equivalent to
9055/// the result of an ADDSUB/SUBADD operation.
9056/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
9057/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
9058/// \p Opnd0 and \p Opnd1.
9059static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
9060 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9061 SDValue &Opnd0, SDValue &Opnd1,
9062 unsigned &NumExtracts,
9063 bool &IsSubAdd) {
9064
9065 MVT VT = BV->getSimpleValueType(0);
9066 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
9067 return false;
9068
9069 unsigned NumElts = VT.getVectorNumElements();
9070 SDValue InVec0 = DAG.getUNDEF(VT);
9071 SDValue InVec1 = DAG.getUNDEF(VT);
9072
9073 NumExtracts = 0;
9074
9075 // Odd-numbered elements in the input build vector are obtained from
9076 // adding/subtracting two integer/float elements.
9077 // Even-numbered elements in the input build vector are obtained from
9078 // subtracting/adding two integer/float elements.
9079 unsigned Opc[2] = {0, 0};
9080 for (unsigned i = 0, e = NumElts; i != e; ++i) {
9081 SDValue Op = BV->getOperand(i);
9082
9083 // Skip 'undef' values.
9084 unsigned Opcode = Op.getOpcode();
9085 if (Opcode == ISD::UNDEF)
9086 continue;
9087
9088 // Early exit if we found an unexpected opcode.
9089 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
9090 return false;
9091
9092 SDValue Op0 = Op.getOperand(0);
9093 SDValue Op1 = Op.getOperand(1);
9094
9095 // Try to match the following pattern:
9096 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
9097 // Early exit if we cannot match that sequence.
9098 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9099 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9100 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9101 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
9102 Op0.getOperand(1) != Op1.getOperand(1))
9103 return false;
9104
9105 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
9106 if (I0 != i)
9107 return false;
9108
9109 // We found a valid add/sub node; make sure it's the same opcode as previous
9110 // elements for this parity.
9111 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
9112 return false;
9113 Opc[i % 2] = Opcode;
9114
9115 // Update InVec0 and InVec1.
9116 if (InVec0.isUndef()) {
9117 InVec0 = Op0.getOperand(0);
9118 if (InVec0.getSimpleValueType() != VT)
9119 return false;
9120 }
9121 if (InVec1.isUndef()) {
9122 InVec1 = Op1.getOperand(0);
9123 if (InVec1.getSimpleValueType() != VT)
9124 return false;
9125 }
9126
9127 // Make sure that the operands of each add/sub node always
9128 // come from the same pair of vectors.
9129 if (InVec0 != Op0.getOperand(0)) {
9130 if (Opcode == ISD::FSUB)
9131 return false;
9132
9133 // FADD is commutable. Try to commute the operands
9134 // and then test again.
9135 std::swap(Op0, Op1);
9136 if (InVec0 != Op0.getOperand(0))
9137 return false;
9138 }
9139
9140 if (InVec1 != Op1.getOperand(0))
9141 return false;
9142
9143 // Increment the number of extractions done.
9144 ++NumExtracts;
9145 }
9146
9147 // Ensure we have found an opcode for both parities and that they are
9148 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
9149 // inputs are undef.
9150 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
9151 InVec0.isUndef() || InVec1.isUndef())
9152 return false;
9153
9154 IsSubAdd = Opc[0] == ISD::FADD;
9155
9156 Opnd0 = InVec0;
9157 Opnd1 = InVec1;
9158 return true;
9159}
9160
9161/// Returns true if it is possible to fold MUL and an idiom that has already been
9162/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
9163/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
9164/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
9165///
9166/// Prior to calling this function it should be known that there is some
9167/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
9168/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
9169/// before replacement of such SDNode with ADDSUB operation. Thus the number
9170/// of \p Opnd0 uses is expected to be equal to 2.
9171/// For example, this function may be called for the following IR:
9172/// %AB = fmul fast <2 x double> %A, %B
9173/// %Sub = fsub fast <2 x double> %AB, %C
9174/// %Add = fadd fast <2 x double> %AB, %C
9175/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
9176/// <2 x i32> <i32 0, i32 3>
9177/// There is a def for %Addsub here, which potentially can be replaced by
9178/// X86ISD::ADDSUB operation:
9179/// %Addsub = X86ISD::ADDSUB %AB, %C
9180/// and such ADDSUB can further be replaced with FMADDSUB:
9181/// %Addsub = FMADDSUB %A, %B, %C.
9182///
9183/// The main reason why this method is called before the replacement of the
9184/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
9185/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
9186/// FMADDSUB is.
9187static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
9188 SelectionDAG &DAG,
9189 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
9190 unsigned ExpectedUses) {
9191 if (Opnd0.getOpcode() != ISD::FMUL ||
9192 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
9193 return false;
9194
9195 // FIXME: These checks must match the similar ones in
9196 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
9197 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
9198 // or MUL + ADDSUB to FMADDSUB.
9199 const TargetOptions &Options = DAG.getTarget().Options;
9200 bool AllowFusion =
9201 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
9202 if (!AllowFusion)
9203 return false;
9204
9205 Opnd2 = Opnd1;
9206 Opnd1 = Opnd0.getOperand(1);
9207 Opnd0 = Opnd0.getOperand(0);
9208
9209 return true;
9210}
9211
9212/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub', or
9213/// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB, or
9214/// X86ISD::FMSUBADD node, respectively.
9215static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
9216 const X86Subtarget &Subtarget,
9217 SelectionDAG &DAG) {
9218 SDValue Opnd0, Opnd1;
9219 unsigned NumExtracts;
9220 bool IsSubAdd;
9221 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
9222 IsSubAdd))
9223 return SDValue();
9224
9225 MVT VT = BV->getSimpleValueType(0);
9226 SDLoc DL(BV);
9227
9228 // Try to generate X86ISD::FMADDSUB node here.
9229 SDValue Opnd2;
9230 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
9231 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
9232 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
9233 }
9234
9235 // We only support ADDSUB.
9236 if (IsSubAdd)
9237 return SDValue();
9238
9239 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
9240 // the ADDSUB idiom has been successfully recognized. There are no known
9241 // X86 targets with 512-bit ADDSUB instructions!
9242 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
9243 // recognition.
9244 if (VT.is512BitVector())
9245 return SDValue();
9246
9247 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
9248}
9249
9250static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
9251 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
9252 // Initialize outputs to known values.
9253 MVT VT = BV->getSimpleValueType(0);
9254 HOpcode = ISD::DELETED_NODE;
9255 V0 = DAG.getUNDEF(VT);
9256 V1 = DAG.getUNDEF(VT);
9257
9258 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
9259 // half of the result is calculated independently from the 128-bit halves of
9260 // the inputs, so that makes the index-checking logic below more complicated.
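 // As an illustration, with AVX2 a 256-bit v8i32 HADD(A, B) computes:
 //   { A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3],
 //     A[4]+A[5], A[6]+A[7], B[4]+B[5], B[6]+B[7] }
 // so each 64-bit quarter of a 128-bit half draws from a single source vector.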
9261 unsigned NumElts = VT.getVectorNumElements();
9262 unsigned GenericOpcode = ISD::DELETED_NODE;
9263 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
9264 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
9265 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
9266 for (unsigned i = 0; i != Num128BitChunks; ++i) {
9267 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
9268 // Ignore undef elements.
9269 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
9270 if (Op.isUndef())
9271 continue;
9272
9273 // If there's an opcode mismatch, we're done.
9274 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
9275 return false;
9276
9277 // Initialize horizontal opcode.
9278 if (HOpcode == ISD::DELETED_NODE) {
9279 GenericOpcode = Op.getOpcode();
9280 switch (GenericOpcode) {
9281 case ISD::ADD: HOpcode = X86ISD::HADD; break;
9282 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
9283 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
9284 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
9285 default: return false;
9286 }
9287 }
9288
9289 SDValue Op0 = Op.getOperand(0);
9290 SDValue Op1 = Op.getOperand(1);
9291 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9292 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9293 Op0.getOperand(0) != Op1.getOperand(0) ||
9294 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9295 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
9296 return false;
9297
9298 // The source vector is chosen based on which 64-bit half of the
9299 // destination vector is being calculated.
9300 if (j < NumEltsIn64Bits) {
9301 if (V0.isUndef())
9302 V0 = Op0.getOperand(0);
9303 } else {
9304 if (V1.isUndef())
9305 V1 = Op0.getOperand(0);
9306 }
9307
9308 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
9309 if (SourceVec != Op0.getOperand(0))
9310 return false;
9311
9312 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
9313 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
9314 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
9315 unsigned ExpectedIndex = i * NumEltsIn128Bits +
9316 (j % NumEltsIn64Bits) * 2;
9317 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
9318 continue;
9319
9320 // If this is not a commutative op, this does not match.
9321 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
9322 return false;
9323
9324 // Addition is commutative, so try swapping the extract indexes.
9325 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
9326 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
9327 continue;
9328
9329 // Extract indexes do not match horizontal requirement.
9330 return false;
9331 }
9332 }
9333 // We matched. Opcode and operands are returned by reference as arguments.
9334 return true;
9335}
9336
9337static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
9338 SelectionDAG &DAG, unsigned HOpcode,
9339 SDValue V0, SDValue V1) {
9340 // If either input vector is not the same size as the build vector,
9341 // extract/insert the low bits to the correct size.
9342 // This is free (examples: zmm --> xmm, xmm --> ymm).
9343 MVT VT = BV->getSimpleValueType(0);
9344 unsigned Width = VT.getSizeInBits();
9345 if (V0.getValueSizeInBits() > Width)
9346 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
9347 else if (V0.getValueSizeInBits() < Width)
9348 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
9349
9350 if (V1.getValueSizeInBits() > Width)
9351 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
9352 else if (V1.getValueSizeInBits() < Width)
9353 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
9354
9355 unsigned NumElts = VT.getVectorNumElements();
9356 APInt DemandedElts = APInt::getAllOnesValue(NumElts);
9357 for (unsigned i = 0; i != NumElts; ++i)
9358 if (BV->getOperand(i).isUndef())
9359 DemandedElts.clearBit(i);
9360
9361 // If we don't need the upper xmm, then perform as an xmm hop.
9362 unsigned HalfNumElts = NumElts / 2;
9363 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
9364 MVT HalfVT = VT.getHalfNumVectorElementsVT();
9365 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
9366 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
9367 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
9368 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
9369 }
9370
9371 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
9372}
9373
9374/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
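/// For example (a sketch for v4i32 with SSSE3, A and B being arbitrary vectors):
///   (build_vector (add (extractelt A, 0), (extractelt A, 1)),
///                 (add (extractelt A, 2), (extractelt A, 3)),
///                 (add (extractelt B, 0), (extractelt B, 1)),
///                 (add (extractelt B, 2), (extractelt B, 3)))
/// can be lowered to a single X86ISD::HADD of A and B.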
9375static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
9376 const X86Subtarget &Subtarget,
9377 SelectionDAG &DAG) {
9378 // We need at least 2 non-undef elements to make this worthwhile by default.
9379 unsigned NumNonUndefs =
9380 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
9381 if (NumNonUndefs < 2)
9382 return SDValue();
9383
9384 // There are 4 sets of horizontal math operations distinguished by type:
9385 // int/FP at 128-bit/256-bit. Each type was introduced with a different
9386 // subtarget feature. Try to match those "native" patterns first.
9387 MVT VT = BV->getSimpleValueType(0);
9388 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
9389 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
9390 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
9391 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
9392 unsigned HOpcode;
9393 SDValue V0, V1;
9394 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
9395 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
9396 }
9397
9398 // Try harder to match 256-bit ops by using extract/concat.
9399 if (!Subtarget.hasAVX() || !VT.is256BitVector())
9400 return SDValue();
9401
9402 // Count the number of UNDEF operands in the input build_vector.
9403 unsigned NumElts = VT.getVectorNumElements();
9404 unsigned Half = NumElts / 2;
9405 unsigned NumUndefsLO = 0;
9406 unsigned NumUndefsHI = 0;
9407 for (unsigned i = 0, e = Half; i != e; ++i)
9408 if (BV->getOperand(i)->isUndef())
9409 NumUndefsLO++;
9410
9411 for (unsigned i = Half, e = NumElts; i != e; ++i)
9412 if (BV->getOperand(i)->isUndef())
9413 NumUndefsHI++;
9414
9415 SDLoc DL(BV);
9416 SDValue InVec0, InVec1;
9417 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
9418 SDValue InVec2, InVec3;
9419 unsigned X86Opcode;
9420 bool CanFold = true;
9421
9422 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
9423 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
9424 InVec3) &&
9425 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9426 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9427 X86Opcode = X86ISD::HADD;
9428 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
9429 InVec1) &&
9430 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
9431 InVec3) &&
9432 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9433 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9434 X86Opcode = X86ISD::HSUB;
9435 else
9436 CanFold = false;
9437
9438 if (CanFold) {
9439 // Do not try to expand this build_vector into a pair of horizontal
9440 // add/sub if we can emit a pair of scalar add/sub.
9441 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9442 return SDValue();
9443
9444 // Convert this build_vector into a pair of horizontal binops followed by
9445 // a concat vector. We must adjust the outputs from the partial horizontal
9446 // matching calls above to account for undefined vector halves.
9447 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
9448 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
9449 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
9450 bool isUndefLO = NumUndefsLO == Half;
9451 bool isUndefHI = NumUndefsHI == Half;
9452 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
9453 isUndefHI);
9454 }
9455 }
9456
9457 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
9458 VT == MVT::v16i16) {
9459 unsigned X86Opcode;
9460 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
9461 X86Opcode = X86ISD::HADD;
9462 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
9463 InVec1))
9464 X86Opcode = X86ISD::HSUB;
9465 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
9466 InVec1))
9467 X86Opcode = X86ISD::FHADD;
9468 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
9469 InVec1))
9470 X86Opcode = X86ISD::FHSUB;
9471 else
9472 return SDValue();
9473
9474 // Don't try to expand this build_vector into a pair of horizontal add/sub
9475 // if we can simply emit a pair of scalar add/sub.
9476 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9477 return SDValue();
9478
9479 // Convert this build_vector into two horizontal add/sub followed by
9480 // a concat vector.
9481 bool isUndefLO = NumUndefsLO == Half;
9482 bool isUndefHI = NumUndefsHI == Half;
9483 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
9484 isUndefLO, isUndefHI);
9485 }
9486
9487 return SDValue();
9488}
9489
9490/// If a BUILD_VECTOR's source elements all apply the same bit operation and
9491/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
9492/// just apply the bit operation to the vectors.
9493/// NOTE: It's not in our interest to start making a general-purpose vectorizer
9494/// from this, but enough scalar bit operations are created by the later
9495/// legalization + scalarization stages to need basic support.
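/// As a sketch (with hypothetical scalars x0..x3), a build_vector such as
///   (build_vector (and x0, 1), (and x1, 3), (and x2, 7), (and x3, 15))
/// becomes
///   (and (build_vector x0, x1, x2, x3), (build_vector 1, 3, 7, 15))
/// so the scalar bit operations are folded into a single vector operation.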
9496static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
9497 SelectionDAG &DAG) {
9498 SDLoc DL(Op);
9499 MVT VT = Op->getSimpleValueType(0);
9500 unsigned NumElems = VT.getVectorNumElements();
9501 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9502
9503 // Check that all elements have the same opcode.
9504 // TODO: Should we allow UNDEFS and if so how many?
9505 unsigned Opcode = Op->getOperand(0).getOpcode();
9506 for (unsigned i = 1; i < NumElems; ++i)
9507 if (Opcode != Op->getOperand(i).getOpcode())
9508 return SDValue();
9509
9510 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
9511 bool IsShift = false;
9512 switch (Opcode) {
9513 default:
9514 return SDValue();
9515 case ISD::SHL:
9516 case ISD::SRL:
9517 case ISD::SRA:
9518 IsShift = true;
9519 break;
9520 case ISD::AND:
9521 case ISD::XOR:
9522 case ISD::OR:
9523 // Don't do this if the buildvector is a splat - we'd replace one
9524 // constant with an entire vector.
9525 if (Op->getSplatValue())
9526 return SDValue();
9527 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
9528 return SDValue();
9529 break;
9530 }
9531
9532 SmallVector<SDValue, 4> LHSElts, RHSElts;
9533 for (SDValue Elt : Op->ops()) {
9534 SDValue LHS = Elt.getOperand(0);
9535 SDValue RHS = Elt.getOperand(1);
9536
9537 // We expect the canonicalized RHS operand to be the constant.
9538 if (!isa<ConstantSDNode>(RHS))
9539 return SDValue();
9540
9541 // Extend shift amounts.
9542 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
9543 if (!IsShift)
9544 return SDValue();
9545 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
9546 }
9547
9548 LHSElts.push_back(LHS);
9549 RHSElts.push_back(RHS);
9550 }
9551
9552 // Limit to shifts by uniform immediates.
9553 // TODO: Only accept vXi8/vXi64 special cases?
9554 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
9555 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
9556 return SDValue();
9557
9558 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
9559 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
9560 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
9561}
9562
9563/// Create a vector constant without a load. SSE/AVX provide the bare minimum
9564/// functionality to do this, so it's all zeros, all ones, or some derivation
9565/// that is cheap to calculate.
9566static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
9567 const X86Subtarget &Subtarget) {
9568 SDLoc DL(Op);
9569 MVT VT = Op.getSimpleValueType();
9570
9571 // Vectors containing all zeros can be matched by pxor and xorps.
9572 if (ISD::isBuildVectorAllZeros(Op.getNode()))
9573 return Op;
9574
9575 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
9576 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
9577 // vpcmpeqd on 256-bit vectors.
9578 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
9579 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
9580 return Op;
9581
9582 return getOnesVector(VT, DAG, DL);
9583 }
9584
9585 return SDValue();
9586}
9587
9588/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
9589/// from a vector of source values and a vector of extraction indices.
9590/// The vectors might be manipulated to match the type of the permute op.
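/// For instance, a v4i32 permute on AVX is performed as a float shuffle:
///   (bitcast v4i32 (X86ISD::VPERMILPV (bitcast v4f32 SrcVec), IndicesVec))
/// while SSSE3-only targets scale the indices to bytes and use PSHUFB instead.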
9591static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
9592 SDLoc &DL, SelectionDAG &DAG,
9593 const X86Subtarget &Subtarget) {
9594 MVT ShuffleVT = VT;
9595 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9596 unsigned NumElts = VT.getVectorNumElements();
9597 unsigned SizeInBits = VT.getSizeInBits();
9598
9599 // Adjust IndicesVec to match VT size.
9600 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
9601        "Illegal variable permute mask size");
9602 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
9603 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
9604 NumElts * VT.getScalarSizeInBits());
9605 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
9606
9607 // Handle a SrcVec whose size doesn't match VT.
9608 if (SrcVec.getValueSizeInBits() != SizeInBits) {
9609 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
9610 // Handle larger SrcVec by treating it as a larger permute.
9611 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
9612 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
9613 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9614 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
9615 Subtarget, DAG, SDLoc(IndicesVec));
9616 return extractSubVector(
9617 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
9618 DAG, DL, SizeInBits);
9619 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
9620 // Widen smaller SrcVec to match VT.
9621 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
9622 } else
9623 return SDValue();
9624 }
9625
9626 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
9627 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
9628 EVT SrcVT = Idx.getValueType();
9629 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
9630 uint64_t IndexScale = 0;
9631 uint64_t IndexOffset = 0;
9632
9633 // If we're scaling a smaller permute op, then we need to repeat the
9634 // indices, scaling and offsetting them as well.
9635 // e.g. v4i32 -> v16i8 (Scale = 4)
9636 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
9637 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
9638 for (uint64_t i = 0; i != Scale; ++i) {
9639 IndexScale |= Scale << (i * NumDstBits);
9640 IndexOffset |= i << (i * NumDstBits);
9641 }
9642
9643 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
9644 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
9645 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
9646 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
9647 return Idx;
9648 };
9649
9650 unsigned Opcode = 0;
9651 switch (VT.SimpleTy) {
9652 default:
9653 break;
9654 case MVT::v16i8:
9655 if (Subtarget.hasSSSE3())
9656 Opcode = X86ISD::PSHUFB;
9657 break;
9658 case MVT::v8i16:
9659 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9660 Opcode = X86ISD::VPERMV;
9661 else if (Subtarget.hasSSSE3()) {
9662 Opcode = X86ISD::PSHUFB;
9663 ShuffleVT = MVT::v16i8;
9664 }
9665 break;
9666 case MVT::v4f32:
9667 case MVT::v4i32:
9668 if (Subtarget.hasAVX()) {
9669 Opcode = X86ISD::VPERMILPV;
9670 ShuffleVT = MVT::v4f32;
9671 } else if (Subtarget.hasSSSE3()) {
9672 Opcode = X86ISD::PSHUFB;
9673 ShuffleVT = MVT::v16i8;
9674 }
9675 break;
9676 case MVT::v2f64:
9677 case MVT::v2i64:
9678 if (Subtarget.hasAVX()) {
9679 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
9680 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9681 Opcode = X86ISD::VPERMILPV;
9682 ShuffleVT = MVT::v2f64;
9683 } else if (Subtarget.hasSSE41()) {
9684 // SSE41 can compare v2i64 - select between indices 0 and 1.
9685 return DAG.getSelectCC(
9686 DL, IndicesVec,
9687 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
9688 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
9689 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
9690 ISD::CondCode::SETEQ);
9691 }
9692 break;
9693 case MVT::v32i8:
9694 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
9695 Opcode = X86ISD::VPERMV;
9696 else if (Subtarget.hasXOP()) {
9697 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
9698 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
9699 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
9700 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
9701 return DAG.getNode(
9702 ISD::CONCAT_VECTORS, DL, VT,
9703 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9704 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9705 } else if (Subtarget.hasAVX()) {
9706 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
9707 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
9708 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9709 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9710 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9711 ArrayRef<SDValue> Ops) {
9712 // Permute Lo and Hi and then select based on index range.
9713 // This works because PSHUFB uses bits[3:0] to permute elements and we don't
9714 // care about bit[7], as it's just an index vector.
9715 SDValue Idx = Ops[2];
9716 EVT VT = Idx.getValueType();
9717 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9718 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9719 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9720 ISD::CondCode::SETGT);
9721 };
9722 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9723 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9724 PSHUFBBuilder);
9725 }
9726 break;
9727 case MVT::v16i16:
9728 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9729 Opcode = X86ISD::VPERMV;
9730 else if (Subtarget.hasAVX()) {
9731 // Scale to v32i8 and perform as v32i8.
9732 IndicesVec = ScaleIndices(IndicesVec, 2);
9733 return DAG.getBitcast(
9734 VT, createVariablePermute(
9735 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9736 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9737 }
9738 break;
9739 case MVT::v8f32:
9740 case MVT::v8i32:
9741 if (Subtarget.hasAVX2())
9742 Opcode = X86ISD::VPERMV;
9743 else if (Subtarget.hasAVX()) {
9744 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9745 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9746 {0, 1, 2, 3, 0, 1, 2, 3});
9747 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9748 {4, 5, 6, 7, 4, 5, 6, 7});
9749 if (Subtarget.hasXOP())
9750 return DAG.getBitcast(
9751 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9752 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9753 // Permute Lo and Hi and then select based on index range.
9754 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9755 SDValue Res = DAG.getSelectCC(
9756 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9757 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9758 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9759 ISD::CondCode::SETGT);
9760 return DAG.getBitcast(VT, Res);
9761 }
9762 break;
9763 case MVT::v4i64:
9764 case MVT::v4f64:
9765 if (Subtarget.hasAVX512()) {
9766 if (!Subtarget.hasVLX()) {
9767 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9768 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9769 SDLoc(SrcVec));
9770 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9771 DAG, SDLoc(IndicesVec));
9772 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9773 DAG, Subtarget);
9774 return extract256BitVector(Res, 0, DAG, DL);
9775 }
9776 Opcode = X86ISD::VPERMV;
9777 } else if (Subtarget.hasAVX()) {
9778 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9779 SDValue LoLo =
9780 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9781 SDValue HiHi =
9782 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9783 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9784 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9785 if (Subtarget.hasXOP())
9786 return DAG.getBitcast(
9787 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9788 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9789 // Permute Lo and Hi and then select based on index range.
9790 // This works as VPERMILPD only uses index bit[1] to permute elements.
9791 SDValue Res = DAG.getSelectCC(
9792 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9793 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9794 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9795 ISD::CondCode::SETGT);
9796 return DAG.getBitcast(VT, Res);
9797 }
9798 break;
9799 case MVT::v64i8:
9800 if (Subtarget.hasVBMI())
9801 Opcode = X86ISD::VPERMV;
9802 break;
9803 case MVT::v32i16:
9804 if (Subtarget.hasBWI())
9805 Opcode = X86ISD::VPERMV;
9806 break;
9807 case MVT::v16f32:
9808 case MVT::v16i32:
9809 case MVT::v8f64:
9810 case MVT::v8i64:
9811 if (Subtarget.hasAVX512())
9812 Opcode = X86ISD::VPERMV;
9813 break;
9814 }
9815 if (!Opcode)
9816 return SDValue();
9817
9818 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9819        (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9820        "Illegal variable permute shuffle type");
9821
9822 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9823 if (Scale > 1)
9824 IndicesVec = ScaleIndices(IndicesVec, Scale);
9825
9826 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9827 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9828
9829 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9830 SDValue Res = Opcode == X86ISD::VPERMV
9831 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9832 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9833 return DAG.getBitcast(VT, Res);
9834}
9835
9836// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9837// recognized as a permutation of a vector by indices in a non-constant vector.
9838// (build_vector (extract_elt V, (extract_elt I, 0)),
9839// (extract_elt V, (extract_elt I, 1)),
9840// ...
9841// ->
9842// (vpermv I, V)
9843//
9844// TODO: Handle undefs
9845// TODO: Utilize pshufb and zero mask blending to support more efficient
9846// construction of vectors with constant-0 elements.
9847static SDValue
9848LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
9849 const X86Subtarget &Subtarget) {
9850 SDValue SrcVec, IndicesVec;
9851 // Check for a match of the permute source vector and permute index elements.
9852 // This is done by checking that the i-th build_vector operand is of the form:
9853 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9854 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9855 SDValue Op = V.getOperand(Idx);
9856 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9857 return SDValue();
9858
9859 // If this is the first extract encountered in V, set the source vector,
9860 // otherwise verify the extract is from the previously defined source
9861 // vector.
9862 if (!SrcVec)
9863 SrcVec = Op.getOperand(0);
9864 else if (SrcVec != Op.getOperand(0))
9865 return SDValue();
9866 SDValue ExtractedIndex = Op->getOperand(1);
9867 // Peek through extends.
9868 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9869 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9870 ExtractedIndex = ExtractedIndex.getOperand(0);
9871 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9872 return SDValue();
9873
9874 // If this is the first extract from the index vector candidate, set the
9875 // indices vector, otherwise verify the extract is from the previously
9876 // defined indices vector.
9877 if (!IndicesVec)
9878 IndicesVec = ExtractedIndex.getOperand(0);
9879 else if (IndicesVec != ExtractedIndex.getOperand(0))
9880 return SDValue();
9881
9882 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9883 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9884 return SDValue();
9885 }
9886
9887 SDLoc DL(V);
9888 MVT VT = V.getSimpleValueType();
9889 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9890}
9891
9892SDValue
9893X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9894 SDLoc dl(Op);
9895
9896 MVT VT = Op.getSimpleValueType();
9897 MVT EltVT = VT.getVectorElementType();
9898 unsigned NumElems = Op.getNumOperands();
9899
9900 // Generate vectors for predicate vectors.
9901 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9902 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
9903
9904 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
9905 return VectorConstant;
9906
9907 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9908 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
9909 return AddSub;
9910 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
9911 return HorizontalOp;
9912 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
9913 return Broadcast;
9914 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
9915 return BitOp;
9916
9917 unsigned EVTBits = EltVT.getSizeInBits();
9918
9919 unsigned NumZero = 0;
9920 unsigned NumNonZero = 0;
9921 uint64_t NonZeros = 0;
9922 bool IsAllConstants = true;
9923 SmallSet<SDValue, 8> Values;
9924 unsigned NumConstants = NumElems;
9925 for (unsigned i = 0; i < NumElems; ++i) {
9926 SDValue Elt = Op.getOperand(i);
9927 if (Elt.isUndef())
9928 continue;
9929 Values.insert(Elt);
9930 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
9931 IsAllConstants = false;
9932 NumConstants--;
9933 }
9934 if (X86::isZeroNode(Elt))
9935 NumZero++;
9936 else {
9937 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
9938 NonZeros |= ((uint64_t)1 << i);
9939 NumNonZero++;
9940 }
9941 }
9942
9943 // All undef vector. Return an UNDEF. All zero vectors were handled above.
9944 if (NumNonZero == 0)
9945 return DAG.getUNDEF(VT);
9946
9947 // If we are inserting one variable into a vector of non-zero constants, try
9948 // to avoid loading each constant element as a scalar. Load the constants as a
9949 // vector and then insert the variable scalar element. If insertion is not
9950 // supported, fall back to a shuffle to get the scalar blended with the
9951 // constants. Insertion into a zero vector is handled as a special-case
9952 // somewhere below here.
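 // For example, a hypothetical <4 x i32> <i32 1, i32 2, i32 %x, i32 4> would be
 // lowered to a constant-pool load of <1, 2, undef, 4> followed by an
 // INSERT_VECTOR_ELT of %x at index 2 (or a blending shuffle when the insertion
 // lands in the high half of a wider vector).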
9953 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9954 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9955 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9956 // Create an all-constant vector. The variable element in the old
9957 // build vector is replaced by undef in the constant vector. Save the
9958 // variable scalar element and its index for use in the insertelement.
9959 LLVMContext &Context = *DAG.getContext();
9960 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9961 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9962 SDValue VarElt;
9963 SDValue InsIndex;
9964 for (unsigned i = 0; i != NumElems; ++i) {
9965 SDValue Elt = Op.getOperand(i);
9966 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9967 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9968 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9969 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9970 else if (!Elt.isUndef()) {
9971 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9972        "Expected one variable element in this vector");
9973 VarElt = Elt;
9974 InsIndex = DAG.getVectorIdxConstant(i, dl);
9975 }
9976 }
9977 Constant *CV = ConstantVector::get(ConstVecOps);
9978 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9979
9980 // The constants we just created may not be legal (e.g., floating point). We
9981 // must lower the vector right here because we cannot guarantee that we'll
9982 // legalize it before loading it. This is also why we could not just create
9983 // a new build vector here. If the build vector contains illegal constants,
9984 // it could get split back up into a series of insert elements.
9985 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9986 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9987 MachineFunction &MF = DAG.getMachineFunction();
9988 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9989 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9990 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
9991 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9992 if (InsertC < NumEltsInLow128Bits)
9993 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9994
9995 // There's no good way to insert into the high elements of a >128-bit
9996 // vector, so use shuffles to avoid an extract/insert sequence.
9997 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9998 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9999 SmallVector<int, 8> ShuffleMask;
10000 unsigned NumElts = VT.getVectorNumElements();
10001 for (unsigned i = 0; i != NumElts; ++i)
10002 ShuffleMask.push_back(i == InsertC ? NumElts : i);
10003 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
10004 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
10005 }
10006
10007 // Special case for a single non-zero, non-undef element.
10008 if (NumNonZero == 1) {
10009 unsigned Idx = countTrailingZeros(NonZeros);
10010 SDValue Item = Op.getOperand(Idx);
10011
10012 // If we have a constant or non-constant insertion into the low element of
10013 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
10014 // the rest of the elements. This will be matched as movd/movq/movss/movsd
10015 // depending on what the source datatype is.
10016 if (Idx == 0) {
10017 if (NumZero == 0)
10018 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10019
10020 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
10021 (EltVT == MVT::i64 && Subtarget.is64Bit())) {
10022 assert((VT.is128BitVector() || VT.is256BitVector() ||
10023         VT.is512BitVector()) &&
10024        "Expected an SSE value type!");
10025 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10026 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
10027 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10028 }
10029
10030 // We can't directly insert an i8 or i16 into a vector, so zero extend
10031 // it to i32 first.
10032 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
10033 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
10034 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
10035 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
10036 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10037 return DAG.getBitcast(VT, Item);
10038 }
10039 }
10040
10041 // Is it a vector logical left shift?
10042 if (NumElems == 2 && Idx == 1 &&
10043 X86::isZeroNode(Op.getOperand(0)) &&
10044 !X86::isZeroNode(Op.getOperand(1))) {
10045 unsigned NumBits = VT.getSizeInBits();
10046 return getVShift(true, VT,
10047 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
10048 VT, Op.getOperand(1)),
10049 NumBits/2, DAG, *this, dl);
10050 }
10051
10052 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
10053 return SDValue();
10054
10055 // Otherwise, if this is a vector with i32 or f32 elements, and the element
10056 // is a non-constant being inserted into an element other than the low one,
10057 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
10058 // movd/movss) to move this into the low element, then shuffle it into
10059 // place.
10060 if (EVTBits == 32) {
10061 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10062 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
10063 }
10064 }
10065
10066 // Splat is obviously ok. Let legalizer expand it to a shuffle.
10067 if (Values.size() == 1) {
10068 if (EVTBits == 32) {
10069 // Instead of a shuffle like this:
10070 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
10071 // Check if it's possible to issue this instead.
10072 // shuffle (vload ptr), undef, <1, 1, 1, 1>
10073 unsigned Idx = countTrailingZeros(NonZeros);
10074 SDValue Item = Op.getOperand(Idx);
10075 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
10076 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
10077 }
10078 return SDValue();
10079 }
10080
10081 // A vector full of immediates; various special cases are already
10082 // handled, so this is best done with a single constant-pool load.
10083 if (IsAllConstants)
10084 return SDValue();
10085
10086 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
10087 return V;
10088
10089 // See if we can use a vector load to get all of the elements.
10090 {
10091 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
10092 if (SDValue LD =
10093 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
10094 return LD;
10095 }
10096
10097 // If this is a splat of pairs of 32-bit elements, we can use a narrower
10098 // build_vector and broadcast it.
10099 // TODO: We could probably generalize this more.
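 // As an illustration, a v8i32 <a,b,a,b,a,b,a,b> build_vector becomes a v4i32
 // build_vector <a,b,undef,undef>, which is bitcast to v2i64, broadcast to
 // v4i64 with VBROADCAST, and finally bitcast back to v8i32.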
10100 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
10101 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
10102 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
10103 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
10104 // Make sure all the even/odd operands match.
10105 for (unsigned i = 2; i != NumElems; ++i)
10106 if (Ops[i % 2] != Op.getOperand(i))
10107 return false;
10108 return true;
10109 };
10110 if (CanSplat(Op, NumElems, Ops)) {
10111 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
10112 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
10113 // Create a new build vector and cast to v2i64/v2f64.
10114 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
10115 DAG.getBuildVector(NarrowVT, dl, Ops));
10116 // Broadcast from v2i64/v2f64 and cast to final VT.
10117 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
10118 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
10119 NewBV));
10120 }
10121 }
10122
10123 // For AVX-length vectors, build the individual 128-bit pieces and use
10124 // shuffles to put them in place.
10125 if (VT.getSizeInBits() > 128) {
10126 MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
10127
10128 // Build both the lower and upper subvector.
10129 SDValue Lower =
10130 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
10131 SDValue Upper = DAG.getBuildVector(
10132 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
10133
10134 // Recreate the wider vector with the lower and upper part.
10135 return concatSubVectors(Lower, Upper, DAG, dl);
10136 }
10137
10138 // Let legalizer expand 2-wide build_vectors.
10139 if (EVTBits == 64) {
10140 if (NumNonZero == 1) {
10141 // One half is zero or undef.
10142 unsigned Idx = countTrailingZeros(NonZeros);
10143 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
10144 Op.getOperand(Idx));
10145 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
10146 }
10147 return SDValue();
10148 }
10149
10150 // If element VT is < 32 bits, convert it to inserts into a zero vector.
10151 if (EVTBits == 8 && NumElems == 16)
10152 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
10153 DAG, Subtarget))
10154 return V;
10155
10156 if (EVTBits == 16 && NumElems == 8)
10157 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
10158 DAG, Subtarget))
10159 return V;
10160
10161 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
10162 if (EVTBits == 32 && NumElems == 4)
10163 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
10164 return V;
10165
10166 // If element VT is == 32 bits, turn it into a number of shuffles.
10167 if (NumElems == 4 && NumZero > 0) {
10168 SmallVector<SDValue, 8> Ops(NumElems);
10169 for (unsigned i = 0; i < 4; ++i) {
10170 bool isZero = !(NonZeros & (1ULL << i));
10171 if (isZero)
10172 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
10173 else
10174 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10175 }
10176
10177 for (unsigned i = 0; i < 2; ++i) {
10178 switch ((NonZeros >> (i*2)) & 0x3) {
10179 default: llvm_unreachable("Unexpected NonZero count");
10180 case 0:
10181 Ops[i] = Ops[i*2]; // Must be a zero vector.
10182 break;
10183 case 1:
10184 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
10185 break;
10186 case 2:
10187 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10188 break;
10189 case 3:
10190 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10191 break;
10192 }
10193 }
10194
10195 bool Reverse1 = (NonZeros & 0x3) == 2;
10196 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
10197 int MaskVec[] = {
10198 Reverse1 ? 1 : 0,
10199 Reverse1 ? 0 : 1,
10200 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
10201 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
10202 };
10203 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
10204 }
10205
10206 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
10207
10208 // Check for a build vector that is mostly a shuffle plus a few insertions.
10209 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
10210 return Sh;
10211
10212 // For SSE 4.1, use insertps to put the high elements into the low element.
10213 if (Subtarget.hasSSE41()) {
10214 SDValue Result;
10215 if (!Op.getOperand(0).isUndef())
10216 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
10217 else
10218 Result = DAG.getUNDEF(VT);
10219
10220 for (unsigned i = 1; i < NumElems; ++i) {
10221 if (Op.getOperand(i).isUndef()) continue;
10222 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
10223 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
10224 }
10225 return Result;
10226 }
10227
10228 // Otherwise, expand into a number of unpckl*, start by extending each of
10229 // our (non-undef) elements to the full vector width with the element in the
10230 // bottom slot of the vector (which generates no code for SSE).
10231 SmallVector<SDValue, 8> Ops(NumElems);
10232 for (unsigned i = 0; i < NumElems; ++i) {
10233 if (!Op.getOperand(i).isUndef())
10234 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10235 else
10236 Ops[i] = DAG.getUNDEF(VT);
10237 }
10238
10239 // Next, we iteratively mix elements, e.g. for v4f32:
10240 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
10241 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
10242 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
10243 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
10244 // Generate scaled UNPCKL shuffle mask.
10245 SmallVector<int, 16> Mask;
10246 for(unsigned i = 0; i != Scale; ++i)
10247 Mask.push_back(i);
10248 for (unsigned i = 0; i != Scale; ++i)
10249 Mask.push_back(NumElems+i);
10250 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
10251
10252 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
10253 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
10254 }
10255 return Ops[0];
10256}
10257
10258// 256-bit AVX can use the vinsertf128 instruction
10259// to create 256-bit vectors from two other 128-bit ones.
10260// TODO: Detect subvector broadcast here instead of DAG combine?
10261static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
10262 const X86Subtarget &Subtarget) {
10263 SDLoc dl(Op);
10264 MVT ResVT = Op.getSimpleValueType();
10265
10266 assert((ResVT.is256BitVector() ||
10267         ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
10268
10269 unsigned NumOperands = Op.getNumOperands();
10270 unsigned NumZero = 0;
10271 unsigned NumNonZero = 0;
10272 unsigned NonZeros = 0;
10273 for (unsigned i = 0; i != NumOperands; ++i) {
10274 SDValue SubVec = Op.getOperand(i);
10275 if (SubVec.isUndef())
10276 continue;
10277 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10278 ++NumZero;
10279 else {
10280 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10281 NonZeros |= 1 << i;
10282 ++NumNonZero;
10283 }
10284 }
10285
10286 // If we have more than 2 non-zeros, build each half separately.
10287 if (NumNonZero > 2) {
10288 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10289 ArrayRef<SDUse> Ops = Op->ops();
10290 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10291 Ops.slice(0, NumOperands/2));
10292 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10293 Ops.slice(NumOperands/2));
10294 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10295 }
10296
10297 // Otherwise, build it up through insert_subvectors.
10298 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
10299 : DAG.getUNDEF(ResVT);
10300
10301 MVT SubVT = Op.getOperand(0).getSimpleValueType();
10302 unsigned NumSubElems = SubVT.getVectorNumElements();
10303 for (unsigned i = 0; i != NumOperands; ++i) {
10304 if ((NonZeros & (1 << i)) == 0)
10305 continue;
10306
10307 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
10308 Op.getOperand(i),
10309 DAG.getIntPtrConstant(i * NumSubElems, dl));
10310 }
10311
10312 return Vec;
10313}
10314
10315// Returns true if the given node is a type promotion (by concatenating i1
10316// zeros) of the result of a node that already zeros all upper bits of
10317// a k-register.
10318// TODO: Merge this with LowerAVXCONCAT_VECTORS?
10319static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
10320 const X86Subtarget &Subtarget,
10321 SelectionDAG & DAG) {
10322 SDLoc dl(Op);
10323 MVT ResVT = Op.getSimpleValueType();
10324 unsigned NumOperands = Op.getNumOperands();
10325
10326 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
10327        "Unexpected number of operands in CONCAT_VECTORS");
10328
10329 uint64_t Zeros = 0;
10330 uint64_t NonZeros = 0;
10331 for (unsigned i = 0; i != NumOperands; ++i) {
10332 SDValue SubVec = Op.getOperand(i);
10333 if (SubVec.isUndef())
10334 continue;
10335 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10336 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10337 Zeros |= (uint64_t)1 << i;
10338 else
10339 NonZeros |= (uint64_t)1 << i;
10340 }
10341
10342 unsigned NumElems = ResVT.getVectorNumElements();
10343
10344 // If we are inserting a non-zero vector and there are zeros in the LSBs and undef
10345 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
10346 // insert_subvector will give us two kshifts.
10347 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
10348 Log2_64(NonZeros) != NumOperands - 1) {
10349 MVT ShiftVT = ResVT;
10350 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
10351 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
10352 unsigned Idx = Log2_64(NonZeros);
10353 SDValue SubVec = Op.getOperand(Idx);
10354 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10355 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
10356 DAG.getUNDEF(ShiftVT), SubVec,
10357 DAG.getIntPtrConstant(0, dl));
10358 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
10359 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
10360 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
10361 DAG.getIntPtrConstant(0, dl));
10362 }
10363
10364 // If there are zero or one non-zeros we can handle this very simply.
10365 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
10366 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
10367 if (!NonZeros)
10368 return Vec;
10369 unsigned Idx = Log2_64(NonZeros);
10370 SDValue SubVec = Op.getOperand(Idx);
10371 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10372 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
10373 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
10374 }
10375
10376 if (NumOperands > 2) {
10377 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10378 ArrayRef<SDUse> Ops = Op->ops();
10379 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10380 Ops.slice(0, NumOperands/2));
10381 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10382 Ops.slice(NumOperands/2));
10383 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10384 }
10385
10386 assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
10387
10388 if (ResVT.getVectorNumElements() >= 16)
10389 return Op; // The operation is legal with KUNPCK
10390
10391 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
10392 DAG.getUNDEF(ResVT), Op.getOperand(0),
10393 DAG.getIntPtrConstant(0, dl));
10394 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
10395 DAG.getIntPtrConstant(NumElems/2, dl));
10396}
10397
10398static SDValue LowerCONCAT_VECTORS(SDValue Op,
10399 const X86Subtarget &Subtarget,
10400 SelectionDAG &DAG) {
10401 MVT VT = Op.getSimpleValueType();
10402 if (VT.getVectorElementType() == MVT::i1)
10403 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
10404
10405 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
10406        (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
10407                                 Op.getNumOperands() == 4)));
10408
10409 // AVX can use the vinsertf128 instruction to create 256-bit vectors
10410 // from two other 128-bit ones.
10411
10412 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
10413 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
10414}
10415
10416//===----------------------------------------------------------------------===//
10417// Vector shuffle lowering
10418//
10419// This is an experimental code path for lowering vector shuffles on x86. It is
10420// designed to handle arbitrary vector shuffles and blends, gracefully
10421// degrading performance as necessary. It works hard to recognize idiomatic
10422// shuffles and lower them to optimal instruction patterns without leaving
10423// a framework that allows reasonably efficient handling of all vector shuffle
10424// patterns.
10425//===----------------------------------------------------------------------===//
10426
10427/// Tiny helper function to identify a no-op mask.
10428///
10429/// This is a somewhat boring predicate function. It checks whether the mask
10430/// array input, which is assumed to be a single-input shuffle mask of the kind
10431/// used by the X86 shuffle instructions (not a fully general
10432/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
10433/// in-place shuffle are 'no-op's.
10434static bool isNoopShuffleMask(ArrayRef<int> Mask) {
10435 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10436 assert(Mask[i] >= -1 && "Out of bound mask element!");
10437 if (Mask[i] >= 0 && Mask[i] != i)
10438 return false;
10439 }
10440 return true;
10441}
10442
10443/// Test whether there are elements crossing LaneSizeInBits lanes in this
10444/// shuffle mask.
10445///
10446/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
10447/// and we routinely test for these.
10448static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
10449 unsigned ScalarSizeInBits,
10450 ArrayRef<int> Mask) {
10451 assert(LaneSizeInBits && ScalarSizeInBits &&
10452        (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10453        "Illegal shuffle lane size");
10454 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
10455 int Size = Mask.size();
10456 for (int i = 0; i < Size; ++i)
10457 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10458 return true;
10459 return false;
10460}
10461
10462/// Test whether there are elements crossing 128-bit lanes in this
10463/// shuffle mask.
10464static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
10465 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
10466}
10467
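A concrete mask makes the lane-crossing test above easier to follow. The sketch below is a standalone illustration (not part of X86ISelLowering.cpp) that mirrors the arithmetic of isLaneCrossingShuffleMask / is128BitLaneCrossingShuffleMask for a hypothetical v8f32 shuffle, where 128-bit lanes of 32-bit elements give LaneSize = 4.

#include <cassert>
#include <vector>

// Same check as isLaneCrossingShuffleMask: an element crosses lanes when the
// lane of its source index differs from the lane of its destination index.
static bool crossesLanes(const std::vector<int> &Mask, int LaneSize) {
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}

int main() {
  // In-lane swap within each 128-bit half: no crossing.
  assert(!crossesLanes({1, 0, 3, 2, 5, 4, 7, 6}, 4));
  // Swapping the two 128-bit halves requires crossing lanes.
  assert(crossesLanes({4, 5, 6, 7, 0, 1, 2, 3}, 4));
}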
10468/// Test whether a shuffle mask is equivalent within each sub-lane.
10469///
10470/// This checks a shuffle mask to see if it is performing the same
10471/// lane-relative shuffle in each sub-lane. This trivially implies
10472/// that it is also not lane-crossing. It may however involve a blend from the
10473/// same lane of a second vector.
10474///
10475/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
10476/// non-trivial to compute in the face of undef lanes. The representation is
10477/// suitable for use with existing 128-bit shuffles as entries from the second
10478/// vector have been remapped to [LaneSize, 2*LaneSize).
10479static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
10480 ArrayRef<int> Mask,
10481 SmallVectorImpl<int> &RepeatedMask) {
10482 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10483 RepeatedMask.assign(LaneSize, -1);
10484 int Size = Mask.size();
10485 for (int i = 0; i < Size; ++i) {
10486 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10487 if (Mask[i] < 0)
10488 continue;
10489 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10490 // This entry crosses lanes, so there is no way to model this shuffle.
10491 return false;
10492
10493 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10494 // Adjust second vector indices to start at LaneSize instead of Size.
10495 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10496 : Mask[i] % LaneSize + LaneSize;
10497 if (RepeatedMask[i % LaneSize] < 0)
10498 // This is the first non-undef entry in this slot of a 128-bit lane.
10499 RepeatedMask[i % LaneSize] = LocalM;
10500 else if (RepeatedMask[i % LaneSize] != LocalM)
10501 // Found a mismatch with the repeated mask.
10502 return false;
10503 }
10504 return true;
10505}
10506
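The remapping of second-vector entries to [LaneSize, 2*LaneSize) is clearest with a worked example. The following is a standalone sketch (illustration only, not LLVM code) that applies the same arithmetic as isRepeatedShuffleMask to a hypothetical v8i32 mask with 128-bit lanes; the expected repeated mask is hand-computed.

#include <cassert>
#include <vector>

// Miniature of the repeated-mask construction: per-lane indices, with entries
// taken from the second vector moved into [LaneSize, 2*LaneSize).
static bool repeatedMask(const std::vector<int> &Mask, int LaneSize,
                         std::vector<int> &Repeated) {
  int Size = (int)Mask.size();
  Repeated.assign(LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      return false; // crosses a lane, cannot be modeled
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (Repeated[i % LaneSize] < 0)
      Repeated[i % LaneSize] = LocalM;
    else if (Repeated[i % LaneSize] != LocalM)
      return false; // the lanes disagree
  }
  return true;
}

int main() {
  std::vector<int> Rep;
  // Both 128-bit lanes blend the low element pairs of V1 and V2 identically.
  assert(repeatedMask({0, 1, 8, 9, 4, 5, 12, 13}, 4, Rep));
  assert((Rep == std::vector<int>{0, 1, 4, 5}));
}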
10507/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10508static bool
10509is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10510 SmallVectorImpl<int> &RepeatedMask) {
10511 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10512}
10513
10514static bool
10515is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10516 SmallVector<int, 32> RepeatedMask;
10517 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10518}
10519
10520/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10521static bool
10522is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10523 SmallVectorImpl<int> &RepeatedMask) {
10524 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10525}
10526
10527/// Test whether a target shuffle mask is equivalent within each sub-lane.
10528/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10529static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10530 ArrayRef<int> Mask,
10531 SmallVectorImpl<int> &RepeatedMask) {
10532 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10533 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10534 int Size = Mask.size();
10535 for (int i = 0; i < Size; ++i) {
10536 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10537 if (Mask[i] == SM_SentinelUndef)
10538 continue;
10539 if (Mask[i] == SM_SentinelZero) {
10540 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10541 return false;
10542 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10543 continue;
10544 }
10545 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10546 // This entry crosses lanes, so there is no way to model this shuffle.
10547 return false;
10548
10549 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10550 // Adjust second vector indices to start at LaneSize instead of Size.
10551 int LocalM =
10552 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
10553 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10554 // This is the first non-undef entry in this slot of a 128-bit lane.
10555 RepeatedMask[i % LaneSize] = LocalM;
10556 else if (RepeatedMask[i % LaneSize] != LocalM)
10557 // Found a mismatch with the repeated mask.
10558 return false;
10559 }
10560 return true;
10561}
10562
10563/// Checks whether a shuffle mask is equivalent to an explicit list of
10564/// arguments.
10565///
10566/// This is a fast way to test a shuffle mask against a fixed pattern:
10567///
10568/// if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
10569///
10570/// It returns true if the mask is exactly as wide as the argument list, and
10571/// each element of the mask is either -1 (signifying undef) or the value given
10572/// in the argument.
10573static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
10574 ArrayRef<int> ExpectedMask) {
10575 if (Mask.size() != ExpectedMask.size())
10576 return false;
10577
10578 int Size = Mask.size();
10579
10580 // If the values are build vectors, we can look through them to find
10581 // equivalent inputs that make the shuffles equivalent.
10582 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
10583 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
10584
10585 for (int i = 0; i < Size; ++i) {
10586 assert(Mask[i] >= -1 && "Out of bound mask element!");
10587 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
10588 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
10589 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
10590 if (!MaskBV || !ExpectedBV ||
10591 MaskBV->getOperand(Mask[i] % Size) !=
10592 ExpectedBV->getOperand(ExpectedMask[i] % Size))
10593 return false;
10594 }
10595 }
10596
10597 return true;
10598}
10599
10600/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10601///
10602/// The masks must be exactly the same width.
10603///
10604/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10605/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10606///
10607/// SM_SentinelZero is accepted as a valid negative index but must match in
10608/// both.
10609static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
10610 ArrayRef<int> ExpectedMask,
10611 SDValue V1 = SDValue(),
10612 SDValue V2 = SDValue()) {
10613 int Size = Mask.size();
10614 if (Size != (int)ExpectedMask.size())
10615 return false;
10616 assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
10617        "Illegal target shuffle mask");
10618
10619 // Check for out-of-range target shuffle mask indices.
10620 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10621 return false;
10622
10623 // If the values are build vectors, we can look through them to find
10624 // equivalent inputs that make the shuffles equivalent.
10625 auto *BV1 = dyn_cast_or_null<BuildVectorSDNode>(V1);
10626 auto *BV2 = dyn_cast_or_null<BuildVectorSDNode>(V2);
10627 BV1 = ((BV1 && Size != (int)BV1->getNumOperands()) ? nullptr : BV1);
10628 BV2 = ((BV2 && Size != (int)BV2->getNumOperands()) ? nullptr : BV2);
10629
10630 for (int i = 0; i < Size; ++i) {
10631 if (Mask[i] == SM_SentinelUndef || Mask[i] == ExpectedMask[i])
10632 continue;
10633 if (0 <= Mask[i] && 0 <= ExpectedMask[i]) {
10634 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
10635 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
10636 if (MaskBV && ExpectedBV &&
10637 MaskBV->getOperand(Mask[i] % Size) ==
10638 ExpectedBV->getOperand(ExpectedMask[i] % Size))
10639 continue;
10640 }
10641 // TODO - handle SM_Sentinel equivalences.
10642 return false;
10643 }
10644 return true;
10645}
10646
10647// Attempt to create a shuffle mask from a VSELECT condition mask.
10648static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
10649 SDValue Cond) {
10650 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
10651 return false;
10652
10653 unsigned Size = Cond.getValueType().getVectorNumElements();
10654 Mask.resize(Size, SM_SentinelUndef);
10655
10656 for (int i = 0; i != (int)Size; ++i) {
10657 SDValue CondElt = Cond.getOperand(i);
10658 Mask[i] = i;
10659 // Arbitrarily choose from the 2nd operand if the select condition element
10660 // is undef.
10661 // TODO: Can we do better by matching patterns such as even/odd?
10662 if (CondElt.isUndef() || isNullConstant(CondElt))
10663 Mask[i] += Size;
10664 }
10665
10666 return true;
10667}
10668
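The translation from a constant VSELECT condition to a shuffle mask over the concatenated operands can be checked by hand. The sketch below is a standalone illustration (not part of X86ISelLowering.cpp) of the rule used by createShuffleMaskFromVSELECT: a true lane keeps element i of the first operand, while a false (or undef) lane selects element i of the second operand, remapped to i + Size.

#include <cassert>
#include <vector>

int main() {
  // Hypothetical v4i32 condition <-1, 0, -1, 0>, i.e. true, false, true, false.
  std::vector<int> Cond = {-1, 0, -1, 0};
  int Size = (int)Cond.size();
  std::vector<int> Mask(Size);
  for (int i = 0; i != Size; ++i)
    Mask[i] = Cond[i] != 0 ? i : i + Size; // undef lanes also fall to i + Size
  assert((Mask == std::vector<int>{0, 5, 2, 7}));
}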
10669// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10670// instructions.
10671static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
10672 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10673 return false;
10674
10675 SmallVector<int, 8> Unpcklwd;
10676 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10677 /* Unary = */ false);
10678 SmallVector<int, 8> Unpckhwd;
10679 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10680 /* Unary = */ false);
10681 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
10682 isTargetShuffleEquivalent(Mask, Unpckhwd));
10683 return IsUnpackwdMask;
10684}
10685
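For reference, the interleave patterns that isUnpackWdShuffleMask compares against can be written out directly. The sketch below is a standalone illustration (not LLVM code) of the binary unpack-low/high masks for a single-128-bit-lane v8i16, which is the shape vpunpcklwd/vpunpckhwd expect.

#include <cassert>
#include <vector>

// Interleave the low (or high) half of two 8-element inputs, pairing element
// Base+i of the first input with element Base+i of the second (offset by 8).
static std::vector<int> unpackMask(int NumElts, bool Lo) {
  std::vector<int> Mask;
  int Base = Lo ? 0 : NumElts / 2;
  for (int i = 0; i != NumElts / 2; ++i) {
    Mask.push_back(Base + i);
    Mask.push_back(Base + i + NumElts);
  }
  return Mask;
}

int main() {
  assert((unpackMask(8, true) == std::vector<int>{0, 8, 1, 9, 2, 10, 3, 11}));
  assert((unpackMask(8, false) == std::vector<int>{4, 12, 5, 13, 6, 14, 7, 15}));
}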
10686static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
10687 // Create 128-bit vector type based on mask size.
10688 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10689 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10690
10691 // We can't assume a canonical shuffle mask, so try the commuted version too.
10692 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
10693 ShuffleVectorSDNode::commuteMask(CommutedMask);
10694
10695 // Match any of unary/binary or low/high.
10696 for (unsigned i = 0; i != 4; ++i) {
10697 SmallVector<int, 16> UnpackMask;
10698 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10699 if (isTargetShuffleEquivalent(Mask, UnpackMask) ||
10700 isTargetShuffleEquivalent(CommutedMask, UnpackMask))
10701 return true;
10702 }
10703 return false;
10704}
10705
10706/// Return true if a shuffle mask chooses elements identically in its top and
10707/// bottom halves. For example, any splat mask has the same top and bottom
10708/// halves. If an element is undefined in only one half of the mask, the halves
10709/// are not considered identical.
10710static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
10711 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10712 unsigned HalfSize = Mask.size() / 2;
10713 for (unsigned i = 0; i != HalfSize; ++i) {
10714 if (Mask[i] != Mask[i + HalfSize])
10715 return false;
10716 }
10717 return true;
10718}
10719
10720/// Get a 4-lane 8-bit shuffle immediate for a mask.
10721///
10722/// This helper function produces an 8-bit shuffle immediate corresponding to
10723/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10724/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10725/// example.
10726///
10727/// NB: We rely heavily on "undef" masks preserving the input lane.
10728static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10729 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10730 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10731 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10732 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10733 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10734
10735 unsigned Imm = 0;
10736 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10737 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10738 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10739 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10740 return Imm;
10741}
10742
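The 2-bits-per-lane encoding is easiest to see with concrete masks. The following standalone sketch (illustration only, not part of X86ISelLowering.cpp) mirrors the arithmetic of getV4X86ShuffleImm, including the rule that an undef (-1) lane defaults to keeping its own position; the expected immediates are hand-computed.

#include <cassert>

static unsigned v4Imm(int M0, int M1, int M2, int M3) {
  unsigned Imm = 0;
  Imm |= (M0 < 0 ? 0 : M0) << 0; // lane 0 selector in bits [1:0]
  Imm |= (M1 < 0 ? 1 : M1) << 2; // lane 1 selector in bits [3:2]
  Imm |= (M2 < 0 ? 2 : M2) << 4; // lane 2 selector in bits [5:4]
  Imm |= (M3 < 0 ? 3 : M3) << 6; // lane 3 selector in bits [7:6]
  return Imm;
}

int main() {
  assert(v4Imm(1, 0, 3, 2) == 0xB1);   // pairwise swap of adjacent elements
  assert(v4Imm(0, 0, 0, 0) == 0x00);   // broadcast of element 0
  assert(v4Imm(-1, 1, -1, 3) == 0xE4); // undef lanes keep their position (identity)
}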
10743static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10744 SelectionDAG &DAG) {
10745 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10746}
10747
10748// The shuffle result is as follows:
10749// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements are in ascending
10750// order. Each element of Zeroable corresponds to a particular element of Mask,
10751// as described in the computeZeroableShuffleElements function.
10752//
10753// The function looks for a sub-mask whose non-zero elements are in
10754// increasing order. If such a sub-mask exists, the function returns true.
10755static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10756 ArrayRef<int> Mask, const EVT &VectorType,
10757 bool &IsZeroSideLeft) {
10758 int NextElement = -1;
10759 // Check if the Mask's nonzero elements are in increasing order.
10760 for (int i = 0, e = Mask.size(); i < e; i++) {
10761 // Check that the mask's zero elements are built from only zeros.
10762 assert(Mask[i] >= -1 && "Out of bound mask element!");
10763 if (Mask[i] < 0)
10764 return false;
10765 if (Zeroable[i])
10766 continue;
10767 // Find the lowest non-zero element.
10768 if (NextElement < 0) {
10769 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10770 IsZeroSideLeft = NextElement != 0;
10771 }
10772 // Exit if the mask's non-zero elements are not in increasing order.
10773 if (NextElement != Mask[i])
10774 return false;
10775 NextElement++;
10776 }
10777 return true;
10778}
10779
10780/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10781static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10782 ArrayRef<int> Mask, SDValue V1,
10783 SDValue V2, const APInt &Zeroable,
10784 const X86Subtarget &Subtarget,
10785 SelectionDAG &DAG) {
10786 int Size = Mask.size();
10787 int LaneSize = 128 / VT.getScalarSizeInBits();
10788 const int NumBytes = VT.getSizeInBits() / 8;
10789 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10790
10791 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10792        (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10793        (Subtarget.hasBWI() && VT.is512BitVector()));
10794
10795 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10796 // Sign bit set in i8 mask means zero element.
10797 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10798
10799 SDValue V;
10800 for (int i = 0; i < NumBytes; ++i) {
10801 int M = Mask[i / NumEltBytes];
10802 if (M < 0) {
10803 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10804 continue;
10805 }
10806 if (Zeroable[i / NumEltBytes]) {
10807 PSHUFBMask[i] = ZeroMask;
10808 continue;
10809 }
10810
10811 // We can only use a single input of V1 or V2.
10812 SDValue SrcV = (M >= Size ? V2 : V1);
10813 if (V && V != SrcV)
10814 return SDValue();
10815 V = SrcV;
10816 M %= Size;
10817
10818 // PSHUFB can't cross lanes, ensure this doesn't happen.
10819 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10820 return SDValue();
10821
10822 M = M % LaneSize;
10823 M = M * NumEltBytes + (i % NumEltBytes);
10824 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10825 }
10826 assert(V && "Failed to find a source input")((V && "Failed to find a source input") ? static_cast
<void> (0) : __assert_fail ("V && \"Failed to find a source input\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 10826, __PRETTY_FUNCTION__))
;
10827
10828 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10829 return DAG.getBitcast(
10830 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10831 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10832}
10833
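The per-element to per-byte expansion that lowerShuffleWithPSHUFB performs is worth tracing once by hand. The sketch below is a standalone illustration (not LLVM code) for a hypothetical single-input v8i16 mask, where each i16 element expands to two byte selectors; a selector byte with the sign bit set (0x80) would instead zero that byte.

#include <cassert>
#include <vector>

int main() {
  std::vector<int> Mask = {0, 0, 1, 1, 2, 2, 3, 3}; // duplicate the low 4 words
  const int NumEltBytes = 2;                        // bytes per i16 element
  std::vector<int> Bytes;
  for (int i = 0; i < 16; ++i) {
    int M = Mask[i / NumEltBytes];
    Bytes.push_back(M * NumEltBytes + (i % NumEltBytes)); // byte within element
  }
  assert((Bytes == std::vector<int>{0, 1, 0, 1, 2, 3, 2, 3,
                                    4, 5, 4, 5, 6, 7, 6, 7}));
}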
10834static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10835 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10836 const SDLoc &dl);
10837
10838// X86 has a dedicated shuffle that can be lowered to VEXPAND.
10839static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
10840 const APInt &Zeroable,
10841 ArrayRef<int> Mask, SDValue &V1,
10842 SDValue &V2, SelectionDAG &DAG,
10843 const X86Subtarget &Subtarget) {
10844 bool IsLeftZeroSide = true;
10845 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10846 IsLeftZeroSide))
10847 return SDValue();
10848 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10849 MVT IntegerType =
10850 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10851 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10852 unsigned NumElts = VT.getVectorNumElements();
10853 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10854        "Unexpected number of vector elements");
10855 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10856 Subtarget, DAG, DL);
10857 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10858 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10859 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10860}
10861
10862static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10863 unsigned &UnpackOpcode, bool IsUnary,
10864 ArrayRef<int> TargetMask, const SDLoc &DL,
10865 SelectionDAG &DAG,
10866 const X86Subtarget &Subtarget) {
10867 int NumElts = VT.getVectorNumElements();
10868
10869 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10870 for (int i = 0; i != NumElts; i += 2) {
10871 int M1 = TargetMask[i + 0];
10872 int M2 = TargetMask[i + 1];
10873 Undef1 &= (SM_SentinelUndef == M1);
10874 Undef2 &= (SM_SentinelUndef == M2);
10875 Zero1 &= isUndefOrZero(M1);
10876 Zero2 &= isUndefOrZero(M2);
10877 }
10878 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10879        "Zeroable shuffle detected");
10880
10881 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10882 SmallVector<int, 64> Unpckl, Unpckh;
10883 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10884 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
10885 UnpackOpcode = X86ISD::UNPCKL;
10886 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10887 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10888 return true;
10889 }
10890
10891 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10892 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
10893 UnpackOpcode = X86ISD::UNPCKH;
10894 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10895 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10896 return true;
10897 }
10898
10899 // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
10900 if (IsUnary && (Zero1 || Zero2)) {
10901 // Don't bother if we can blend instead.
10902 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10903 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10904 return false;
10905
10906 bool MatchLo = true, MatchHi = true;
10907 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10908 int M = TargetMask[i];
10909
10910 // Ignore if the input is known to be zero or the index is undef.
10911 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10912 (M == SM_SentinelUndef))
10913 continue;
10914
10915 MatchLo &= (M == Unpckl[i]);
10916 MatchHi &= (M == Unpckh[i]);
10917 }
10918
10919 if (MatchLo || MatchHi) {
10920 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10921 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10922 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10923 return true;
10924 }
10925 }
10926
10927 // If a binary shuffle, commute and try again.
10928 if (!IsUnary) {
10929 ShuffleVectorSDNode::commuteMask(Unpckl);
10930 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
10931 UnpackOpcode = X86ISD::UNPCKL;
10932 std::swap(V1, V2);
10933 return true;
10934 }
10935
10936 ShuffleVectorSDNode::commuteMask(Unpckh);
10937 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
10938 UnpackOpcode = X86ISD::UNPCKH;
10939 std::swap(V1, V2);
10940 return true;
10941 }
10942 }
10943
10944 return false;
10945}
10946
10947// X86 has dedicated unpack instructions that can handle specific blend
10948// operations: UNPCKH and UNPCKL.
10949static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
10950 ArrayRef<int> Mask, SDValue V1, SDValue V2,
10951 SelectionDAG &DAG) {
10952 SmallVector<int, 8> Unpckl;
10953 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10954 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
10955 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10956
10957 SmallVector<int, 8> Unpckh;
10958 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10959 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
10960 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10961
10962 // Commute and try again.
10963 ShuffleVectorSDNode::commuteMask(Unpckl);
10964 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
10965 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10966
10967 ShuffleVectorSDNode::commuteMask(Unpckh);
10968 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
10969 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10970
10971 return SDValue();
10972}
10973
10974/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10975/// followed by unpack 256-bit.
10976static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
10977 ArrayRef<int> Mask, SDValue V1,
10978 SDValue V2, SelectionDAG &DAG) {
10979 SmallVector<int, 32> Unpckl, Unpckh;
10980 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10981 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10982
10983 unsigned UnpackOpcode;
10984 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
10985 UnpackOpcode = X86ISD::UNPCKL;
10986 else if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
10987 UnpackOpcode = X86ISD::UNPCKH;
10988 else
10989 return SDValue();
10990
10991 // This is a "natural" unpack operation (rather than the 128-bit sectored
10992 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10993 // input in order to use the x86 instruction.
10994 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10995 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10996 V1 = DAG.getBitcast(VT, V1);
10997 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10998}
10999
11000static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
11001 int Delta) {
11002 int Size = (int)Mask.size();
11003 int Split = Size / Delta;
11004 int TruncatedVectorStart = SwappedOps ? Size : 0;
11005
11006 // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
11007 if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
11008 return false;
11009
11010 // The rest of the mask should not refer to the truncated vector's elements.
11011 if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
11012 TruncatedVectorStart + Size))
11013 return false;
11014
11015 return true;
11016}
11017
11018// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
11019//
11020// An example is the following:
11021//
11022// t0: ch = EntryToken
11023// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
11024// t25: v4i32 = truncate t2
11025// t41: v8i16 = bitcast t25
11026// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
11027// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
11028// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
11029// t18: v2i64 = bitcast t51
11030//
11031// Without avx512vl, this is lowered to:
11032//
11033// vpmovqd %zmm0, %ymm0
11034// vpshufb {{.*#+}} xmm0 =
11035// xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
11036//
11037// But when avx512vl is available, one can just use a single vpmovdw
11038// instruction.
11039static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
11040 MVT VT, SDValue V1, SDValue V2,
11041 SelectionDAG &DAG,
11042 const X86Subtarget &Subtarget) {
11043 if (VT != MVT::v16i8 && VT != MVT::v8i16)
11044 return SDValue();
11045
11046 if (Mask.size() != VT.getVectorNumElements())
11047 return SDValue();
11048
11049 bool SwappedOps = false;
11050
11051 if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
11052 if (!ISD::isBuildVectorAllZeros(V1.getNode()))
11053 return SDValue();
11054
11055 std::swap(V1, V2);
11056 SwappedOps = true;
11057 }
11058
11059 // Look for:
11060 //
11061 // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
11062 // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
11063 //
11064 // and similar ones.
11065 if (V1.getOpcode() != ISD::BITCAST)
11066 return SDValue();
11067 if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
11068 return SDValue();
11069
11070 SDValue Src = V1.getOperand(0).getOperand(0);
11071 MVT SrcVT = Src.getSimpleValueType();
11072
11073 // The vptrunc** instructions truncating 128-bit and 256-bit vectors
11074 // are only available with avx512vl.
11075 if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
11076 return SDValue();
11077
11078 // Down Convert Word to Byte is only available with avx512bw. The case with
11079 // 256-bit output doesn't contain a shuffle and is therefore not handled here.
11080 if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
11081 !Subtarget.hasBWI())
11082 return SDValue();
11083
11084 // The first half/quarter of the mask should refer to every second/fourth
11085 // element of the vector truncated and bitcasted.
11086 if (!matchShuffleAsVPMOV(Mask, SwappedOps, 2) &&
11087 !matchShuffleAsVPMOV(Mask, SwappedOps, 4))
11088 return SDValue();
11089
11090 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
11091}
11092
11093// X86 has dedicated pack instructions that can handle specific truncation
11094// operations: PACKSS and PACKUS.
11095static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
11096 unsigned &PackOpcode, ArrayRef<int> TargetMask,
11097 SelectionDAG &DAG,
11098 const X86Subtarget &Subtarget) {
11099 unsigned NumElts = VT.getVectorNumElements();
11100 unsigned BitSize = VT.getScalarSizeInBits();
11101 MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
11102 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
11103
11104 auto MatchPACK = [&](SDValue N1, SDValue N2) {
11105 SDValue VV1 = DAG.getBitcast(PackVT, N1);
11106 SDValue VV2 = DAG.getBitcast(PackVT, N2);
11107 if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
11108 APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
11109 if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
11110 (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
11111 V1 = VV1;
11112 V2 = VV2;
11113 SrcVT = PackVT;
11114 PackOpcode = X86ISD::PACKUS;
11115 return true;
11116 }
11117 }
11118 if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
11119 (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
11120 V1 = VV1;
11121 V2 = VV2;
11122 SrcVT = PackVT;
11123 PackOpcode = X86ISD::PACKSS;
11124 return true;
11125 }
11126 return false;
11127 };
11128
11129 // Try binary shuffle.
11130 SmallVector<int, 32> BinaryMask;
11131 createPackShuffleMask(VT, BinaryMask, false);
11132 if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2))
11133 if (MatchPACK(V1, V2))
11134 return true;
11135
11136 // Try unary shuffle.
11137 SmallVector<int, 32> UnaryMask;
11138 createPackShuffleMask(VT, UnaryMask, true);
11139 if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1))
11140 if (MatchPACK(V1, V1))
11141 return true;
11142
11143 return false;
11144}
11145
11146static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
11147 SDValue V1, SDValue V2, SelectionDAG &DAG,
11148 const X86Subtarget &Subtarget) {
11149 MVT PackVT;
11150 unsigned PackOpcode;
11151 if (matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
11152 Subtarget))
11153 return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
11154 DAG.getBitcast(PackVT, V2));
11155
11156 return SDValue();
11157}
11158
11159/// Try to emit a bitmask instruction for a shuffle.
11160///
11161/// This handles cases where we can model a blend exactly as a bitmask due to
11162/// one of the inputs being zeroable.
11163static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
11164 SDValue V2, ArrayRef<int> Mask,
11165 const APInt &Zeroable,
11166 const X86Subtarget &Subtarget,
11167 SelectionDAG &DAG) {
11168 MVT MaskVT = VT;
11169 MVT EltVT = VT.getVectorElementType();
11170 SDValue Zero, AllOnes;
11171 // Use f64 if i64 isn't legal.
11172 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11173 EltVT = MVT::f64;
11174 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11175 }
11176
11177 MVT LogicVT = VT;
11178 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
11179 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11180 AllOnes = DAG.getConstantFP(
11181 APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT);
11182 LogicVT =
11183 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
11184 } else {
11185 Zero = DAG.getConstant(0, DL, EltVT);
11186 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11187 }
11188
11189 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11190 SDValue V;
11191 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11192 if (Zeroable[i])
11193 continue;
11194 if (Mask[i] % Size != i)
11195 return SDValue(); // Not a blend.
11196 if (!V)
11197 V = Mask[i] < Size ? V1 : V2;
11198 else if (V != (Mask[i] < Size ? V1 : V2))
11199 return SDValue(); // Can only let one input through the mask.
11200
11201 VMaskOps[i] = AllOnes;
11202 }
11203 if (!V)
11204 return SDValue(); // No non-zeroable elements!
11205
11206 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11207 VMask = DAG.getBitcast(LogicVT, VMask);
11208 V = DAG.getBitcast(LogicVT, V);
11209 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11210 return DAG.getBitcast(VT, And);
11211}
11212
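The idea behind lowerShuffleAsBitMask is that when every lane either keeps its own element from a single input or is known to be zero, the whole shuffle collapses to a per-element AND with an all-ones/all-zeros mask. A minimal standalone sketch of that equivalence on plain scalars (illustration only, not LLVM code):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t V[4] = {11, 22, 33, 44};
  // Lanes 1 and 3 are zeroable; lanes 0 and 2 keep V's elements in place.
  uint32_t M[4] = {0xFFFFFFFFu, 0, 0xFFFFFFFFu, 0};
  uint32_t R[4];
  for (int i = 0; i < 4; ++i)
    R[i] = V[i] & M[i]; // AND with all-ones keeps, AND with zero clears
  assert(R[0] == 11 && R[1] == 0 && R[2] == 33 && R[3] == 0);
}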
11213/// Try to emit a blend instruction for a shuffle using bit math.
11214///
11215/// This is used as a fallback approach when first class blend instructions are
11216/// unavailable. Currently it is only suitable for integer vectors, but could
11217/// be generalized for floating point vectors if desirable.
11218static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11219 SDValue V2, ArrayRef<int> Mask,
11220 SelectionDAG &DAG) {
11221 assert(VT.isInteger() && "Only supports integer vector types!");
11222 MVT EltVT = VT.getVectorElementType();
11223 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11224 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11225 SmallVector<SDValue, 16> MaskOps;
11226 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11227 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11228 return SDValue(); // Shuffled input!
11229 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11230 }
11231
11232 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11233 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
11234 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
11235 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
11236}
11237
11238static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11239 SDValue PreservedSrc,
11240 const X86Subtarget &Subtarget,
11241 SelectionDAG &DAG);
11242
11243static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
11244 MutableArrayRef<int> Mask,
11245 const APInt &Zeroable, bool &ForceV1Zero,
11246 bool &ForceV2Zero, uint64_t &BlendMask) {
11247 bool V1IsZeroOrUndef =
11248 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11249 bool V2IsZeroOrUndef =
11250 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11251
11252 BlendMask = 0;
11253 ForceV1Zero = false, ForceV2Zero = false;
11254 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11255
11256 // Attempt to generate the binary blend mask. If an input is zero then
11257 // we can use any lane.
11258 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11259 int M = Mask[i];
11260 if (M == SM_SentinelUndef)
11261 continue;
11262 if (M == i)
11263 continue;
11264 if (M == i + Size) {
11265 BlendMask |= 1ull << i;
11266 continue;
11267 }
11268 if (Zeroable[i]) {
11269 if (V1IsZeroOrUndef) {
11270 ForceV1Zero = true;
11271 Mask[i] = i;
11272 continue;
11273 }
11274 if (V2IsZeroOrUndef) {
11275 ForceV2Zero = true;
11276 BlendMask |= 1ull << i;
11277 Mask[i] = i + Size;
11278 continue;
11279 }
11280 }
11281 return false;
11282 }
11283 return true;
11284}
11285
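The blend-bit construction above is easy to verify on a small mask. The sketch below is a standalone illustration (not part of X86ISelLowering.cpp) of the core rule in matchShuffleAsBlend, ignoring the zeroable special cases: bit i of BlendMask is set exactly when element i is taken from the second input.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  std::vector<int> Mask = {0, 5, 2, 7}; // hypothetical v4i32 blend of V1/V2
  int Size = (int)Mask.size();
  uint64_t BlendMask = 0;
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0 || Mask[i] == i)
      continue;                  // undef, or taken from V1 in place
    assert(Mask[i] == i + Size); // otherwise it must be element i of V2
    BlendMask |= 1ull << i;
  }
  assert(BlendMask == 0b1010); // elements 1 and 3 come from V2
}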
11286static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
11287 int Scale) {
11288 uint64_t ScaledMask = 0;
11289 for (int i = 0; i != Size; ++i)
11290 if (BlendMask & (1ull << i))
11291 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
11292 return ScaledMask;
11293}
11294
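Scaling a blend mask to a finer element granularity replicates each selected bit Scale times. A standalone worked example (illustration only, not LLVM code) of the computation in scaleVectorShuffleBlendMask:

#include <cassert>
#include <cstdint>

static uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t Scaled = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      Scaled |= ((1ull << Scale) - 1) << (i * Scale); // Scale bits per element
  return Scaled;
}

int main() {
  // A v4i32 blend taking elements 0 and 2 from V2 (mask 0b0101), re-expressed
  // at v8i16 granularity: each selected dword becomes two selected words.
  assert(scaleBlendMask(0b0101, 4, 2) == 0b00110011);
}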
11295/// Try to emit a blend instruction for a shuffle.
11296///
11297/// This doesn't do any checks for the availability of instructions for blending
11298/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11299/// be matched in the backend with the type given. What it does check for is
11300/// that the shuffle mask is a blend, or convertible into a blend with zero.
11301static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
11302 SDValue V2, ArrayRef<int> Original,
11303 const APInt &Zeroable,
11304 const X86Subtarget &Subtarget,
11305 SelectionDAG &DAG) {
11306 uint64_t BlendMask = 0;
11307 bool ForceV1Zero = false, ForceV2Zero = false;
11308 SmallVector<int, 64> Mask(Original.begin(), Original.end());
11309 if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11310 BlendMask))
11311 return SDValue();
11312
11313 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11314 if (ForceV1Zero)
11315 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11316 if (ForceV2Zero)
11317 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11318
11319 switch (VT.SimpleTy) {
11320 case MVT::v4i64:
11321 case MVT::v8i32:
11322 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11323 LLVM_FALLTHROUGH;
11324 case MVT::v4f64:
11325 case MVT::v8f32:
11326 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11327 LLVM_FALLTHROUGH;
11328 case MVT::v2f64:
11329 case MVT::v2i64:
11330 case MVT::v4f32:
11331 case MVT::v4i32:
11332 case MVT::v8i16:
11333 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11334 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11335 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11336 case MVT::v16i16: {
11337 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11338 SmallVector<int, 8> RepeatedMask;
11339 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11340 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11341 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11342 BlendMask = 0;
11343 for (int i = 0; i < 8; ++i)
11344 if (RepeatedMask[i] >= 8)
11345 BlendMask |= 1ull << i;
11346 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11347 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11348 }
11349 // Use PBLENDW for lower/upper lanes and then blend lanes.
11350 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11351 // merge to VSELECT where useful.
11352 uint64_t LoMask = BlendMask & 0xFF;
11353 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11354 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11355 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11356 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11357 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11358 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11359 return DAG.getVectorShuffle(
11360 MVT::v16i16, DL, Lo, Hi,
11361 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11362 }
11363 LLVM_FALLTHROUGH;
11364 }
11365 case MVT::v32i8:
11366 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11367 LLVM_FALLTHROUGH;
11368 case MVT::v16i8: {
11369 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11370
11371 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11372 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11373 Subtarget, DAG))
11374 return Masked;
11375
11376 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11377 MVT IntegerType =
11378 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11379 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11380 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11381 }
11382
11383 // Scale the blend by the number of bytes per element.
11384 int Scale = VT.getScalarSizeInBits() / 8;
11385
11386 // This form of blend is always done on bytes. Compute the byte vector
11387 // type.
11388 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11389
11390 // x86 allows load folding with blendvb from the 2nd source operand. But
11391 // we are still using LLVM select here (see comment below), so that's V1.
11392 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11393 // allow that load-folding possibility.
11394 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11395 ShuffleVectorSDNode::commuteMask(Mask);
11396 std::swap(V1, V2);
11397 }
11398
11399 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11400 // mix of LLVM's code generator and the x86 backend. We tell the code
11401 // generator that boolean values in the elements of an x86 vector register
11402 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11403 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11404 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11405 // of the element (the remaining are ignored) and 0 in that high bit would
11406 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11407 // the LLVM model for boolean values in vector elements gets the relevant
11408 // bit set, it is set backwards and over constrained relative to x86's
11409 // actual model.
11410 SmallVector<SDValue, 32> VSELECTMask;
11411 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11412 for (int j = 0; j < Scale; ++j)
11413 VSELECTMask.push_back(
11414 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
11415 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
11416 MVT::i8));
11417
11418 V1 = DAG.getBitcast(BlendVT, V1);
11419 V2 = DAG.getBitcast(BlendVT, V2);
11420 return DAG.getBitcast(
11421 VT,
11422 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11423 V1, V2));
11424 }
11425 case MVT::v16f32:
11426 case MVT::v8f64:
11427 case MVT::v8i64:
11428 case MVT::v16i32:
11429 case MVT::v32i16:
11430 case MVT::v64i8: {
11431 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11432 bool OptForSize = DAG.shouldOptForSize();
11433 if (!OptForSize) {
11434 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11435 Subtarget, DAG))
11436 return Masked;
11437 }
11438
11439 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11440 // masked move.
11441 MVT IntegerType =
11442 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11443 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11444 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11445 }
11446 default:
11447 llvm_unreachable("Not a supported integer vector type!");
11448 }
11449}
11450
11451/// Try to lower as a blend of elements from two inputs followed by
11452/// a single-input permutation.
11453///
11454/// This matches the pattern where we can blend elements from two inputs and
11455/// then reduce the shuffle to a single-input permutation.
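///
/// Illustrative sketch (hypothetical v4i32 mask): {5, 0, 7, 2} can be lowered
/// as the two-input blend {0, 5, 2, 7} followed by the single-input permute
/// {1, 0, 3, 2}.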
11456static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11457 SDValue V1, SDValue V2,
11458 ArrayRef<int> Mask,
11459 SelectionDAG &DAG,
11460 bool ImmBlends = false) {
11461 // We build up the blend mask while checking whether a blend is a viable way
11462 // to reduce the shuffle.
11463 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11464 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11465
11466 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11467 if (Mask[i] < 0)
11468 continue;
11469
11470 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11471
11472 if (BlendMask[Mask[i] % Size] < 0)
11473 BlendMask[Mask[i] % Size] = Mask[i];
11474 else if (BlendMask[Mask[i] % Size] != Mask[i])
11475 return SDValue(); // Can't blend in the needed input!
11476
11477 PermuteMask[i] = Mask[i] % Size;
11478 }
11479
11480 // If only immediate blends, then bail if the blend mask can't be widened to
11481 // i16.
11482 unsigned EltSize = VT.getScalarSizeInBits();
11483 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11484 return SDValue();
11485
11486 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11487 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11488}
11489
11490/// Try to lower as an unpack of elements from two inputs followed by
11491/// a single-input permutation.
11492///
11493/// This matches the pattern where we can unpack elements from two inputs and
11494/// then reduce the shuffle to a single-input (wider) permutation.
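///
/// Illustrative sketch (hypothetical v4i32 mask): {1, 5, 0, 4} matches
/// UNPCKL(V1, V2) == {0, 4, 1, 5} followed by the single-input permute
/// {2, 3, 0, 1}.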
11495static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11496 SDValue V1, SDValue V2,
11497 ArrayRef<int> Mask,
11498 SelectionDAG &DAG) {
11499 int NumElts = Mask.size();
11500 int NumLanes = VT.getSizeInBits() / 128;
11501 int NumLaneElts = NumElts / NumLanes;
11502 int NumHalfLaneElts = NumLaneElts / 2;
11503
11504 bool MatchLo = true, MatchHi = true;
11505 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11506
11507 // Determine UNPCKL/UNPCKH type and operand order.
11508 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11509 for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
11510 int M = Mask[Lane + Elt];
11511 if (M < 0)
11512 continue;
11513
11514 SDValue &Op = Ops[Elt & 1];
11515 if (M < NumElts && (Op.isUndef() || Op == V1))
11516 Op = V1;
11517 else if (NumElts <= M && (Op.isUndef() || Op == V2))
11518 Op = V2;
11519 else
11520 return SDValue();
11521
11522 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11523 MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
11524 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
11525 MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
11526 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
11527 if (!MatchLo && !MatchHi)
11528 return SDValue();
11529 }
11530 }
11531 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11532
11533 // Now check that each pair of elts comes from the same unpack pair
11534 // and set the permute mask based on each pair.
11535 // TODO - Investigate cases where we permute individual elements.
11536 SmallVector<int, 32> PermuteMask(NumElts, -1);
11537 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11538 for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
11539 int M0 = Mask[Lane + Elt + 0];
11540 int M1 = Mask[Lane + Elt + 1];
11541 if (0 <= M0 && 0 <= M1 &&
11542 (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
11543 return SDValue();
11544 if (0 <= M0)
11545 PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
11546 if (0 <= M1)
11547 PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
11548 }
11549 }
11550
11551 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11552 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11553 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11554}
11555
11556/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11557/// permuting the elements of the result in place.
11558static SDValue lowerShuffleAsByteRotateAndPermute(
11559 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11560 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11561 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11562 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11563 (VT.is512BitVector() && !Subtarget.hasBWI()))
11564 return SDValue();
11565
11566 // We don't currently support lane crossing permutes.
11567 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11568 return SDValue();
11569
11570 int Scale = VT.getScalarSizeInBits() / 8;
11571 int NumLanes = VT.getSizeInBits() / 128;
11572 int NumElts = VT.getVectorNumElements();
11573 int NumEltsPerLane = NumElts / NumLanes;
11574
11575 // Determine range of mask elts.
11576 bool Blend1 = true;
11577 bool Blend2 = true;
11578 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11579 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11580 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11581 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11582 int M = Mask[Lane + Elt];
11583 if (M < 0)
11584 continue;
11585 if (M < NumElts) {
11586 Blend1 &= (M == (Lane + Elt));
11587 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11588 M = M % NumEltsPerLane;
11589 Range1.first = std::min(Range1.first, M);
11590 Range1.second = std::max(Range1.second, M);
11591 } else {
11592 M -= NumElts;
11593 Blend2 &= (M == (Lane + Elt));
11594 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11595 M = M % NumEltsPerLane;
11596 Range2.first = std::min(Range2.first, M);
11597 Range2.second = std::max(Range2.second, M);
11598 }
11599 }
11600 }
11601
11602 // Bail if we don't need both elements.
11603 // TODO - it might be worth doing this for unary shuffles if the permute
11604 // can be widened.
11605 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11606 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11607 return SDValue();
11608
11609 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11610 return SDValue();
11611
11612 // Rotate the 2 ops so we can access both ranges, then permute the result.
11613 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11614 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11615 SDValue Rotate = DAG.getBitcast(
11616 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11617 DAG.getBitcast(ByteVT, Lo),
11618 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11619 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11620 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11621 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11622 int M = Mask[Lane + Elt];
11623 if (M < 0)
11624 continue;
11625 if (M < NumElts)
11626 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11627 else
11628 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11629 }
11630 }
11631 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11632 };
11633
11634 // Check if the ranges are small enough to rotate from either direction.
11635 if (Range2.second < Range1.first)
11636 return RotateAndPermute(V1, V2, Range1.first, 0);
11637 if (Range1.second < Range2.first)
11638 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11639 return SDValue();
11640}
11641
11642/// Generic routine to decompose a shuffle and blend into independent
11643/// blends and permutes.
11644///
11645/// This matches the extremely common pattern for handling combined
11646/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11647/// operations. It will try to pick the best arrangement of shuffles and
11648/// blends.
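///
/// Illustrative sketch (hypothetical v4i32 mask): {2, 5, 0, 7} decomposes into
/// V1Mask {2, -1, 0, -1}, V2Mask {-1, 1, -1, 3} and BlendMask {0, 5, 2, 7}.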
11649static SDValue lowerShuffleAsDecomposedShuffleBlend(
11650 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11651 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11652 // Shuffle the input elements into the desired positions in V1 and V2 and
11653 // blend them together.
11654 SmallVector<int, 32> V1Mask(Mask.size(), -1);
11655 SmallVector<int, 32> V2Mask(Mask.size(), -1);
11656 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11657 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11658 if (Mask[i] >= 0 && Mask[i] < Size) {
11659 V1Mask[i] = Mask[i];
11660 BlendMask[i] = i;
11661 } else if (Mask[i] >= Size) {
11662 V2Mask[i] = Mask[i] - Size;
11663 BlendMask[i] = i + Size;
11664 }
11665
11666 // Try to lower with the simpler initial blend/unpack/rotate strategies, unless
11667 // one of the input shuffles would be a no-op. We prefer to shuffle the inputs,
11668 // as the shuffle may be able to fold with a load or provide some other benefit.
11669 // However, when we'd have to do twice as many shuffles to achieve this, doing a
11670 // 2-input pre-shuffle first is a better strategy.
11671 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11672 // Only prefer immediate blends to unpack/rotate.
11673 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11674 DAG, true))
11675 return BlendPerm;
11676 if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
11677 DAG))
11678 return UnpackPerm;
11679 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11680 DL, VT, V1, V2, Mask, Subtarget, DAG))
11681 return RotatePerm;
11682 // Unpack/rotate failed - try again with variable blends.
11683 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11684 DAG))
11685 return BlendPerm;
11686 }
11687
11688 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11689 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11690 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11691}
11692
11693/// Try to lower a vector shuffle as a bit rotation.
11694///
11695/// Look for a repeated rotation pattern in each sub group.
11696 /// Returns an ISD::ROTL element rotation amount, or -1 if no rotation matches.
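///
/// Illustrative sketch (hypothetical i8 elements, NumSubElts == 4): the
/// repeating mask {3, 0, 1, 2, 7, 4, 5, 6, ...} matches an element rotation
/// amount of 1, i.e. an ISD::ROTL of each i32 group by 8 bits.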
11697static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
11698 int NumElts = Mask.size();
11699 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
11700
11701 int RotateAmt = -1;
11702 for (int i = 0; i != NumElts; i += NumSubElts) {
11703 for (int j = 0; j != NumSubElts; ++j) {
11704 int M = Mask[i + j];
11705 if (M < 0)
11706 continue;
11707 if (!isInRange(M, i, i + NumSubElts))
11708 return -1;
11709 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
11710 if (0 <= RotateAmt && Offset != RotateAmt)
11711 return -1;
11712 RotateAmt = Offset;
11713 }
11714 }
11715 return RotateAmt;
11716}
11717
11718static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11719 const X86Subtarget &Subtarget,
11720 ArrayRef<int> Mask) {
11721 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11722 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11723
11724 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11725 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11726 int MaxSubElts = 64 / EltSizeInBits;
11727 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
11728 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
11729 if (RotateAmt < 0)
11730 continue;
11731
11732 int NumElts = Mask.size();
11733 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11734 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11735 return RotateAmt * EltSizeInBits;
11736 }
11737
11738 return -1;
11739}
11740
11741/// Lower shuffle using X86ISD::VROTLI rotations.
11742static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11743 ArrayRef<int> Mask,
11744 const X86Subtarget &Subtarget,
11745 SelectionDAG &DAG) {
11746 // Only XOP + AVX512 targets have bit rotation instructions.
11747 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11748 bool IsLegal =
11749 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11750 if (!IsLegal && Subtarget.hasSSE3())
11751 return SDValue();
11752
11753 MVT RotateVT;
11754 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11755 Subtarget, Mask);
11756 if (RotateAmt < 0)
11757 return SDValue();
11758
11759 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11760 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11761 // widen to vXi16 or more then the existing lowering will be better.
11762 if (!IsLegal) {
11763 if ((RotateAmt % 16) == 0)
11764 return SDValue();
11765 // TODO: Use getTargetVShiftByConstNode.
11766 unsigned ShlAmt = RotateAmt;
11767 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11768 V1 = DAG.getBitcast(RotateVT, V1);
11769 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11770 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11771 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11772 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11773 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11774 return DAG.getBitcast(VT, Rot);
11775 }
11776
11777 SDValue Rot =
11778 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11779 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11780 return DAG.getBitcast(VT, Rot);
11781}
11782
11783/// Try to lower a vector shuffle as a byte rotation.
11784///
11785 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11786static int matchShuffleAsByteRotate(SDValue &V1, SDValue &V2,
11787 ArrayRef<int> Mask) {
11788 int NumElts = Mask.size();
11789
11790 // We need to detect various ways of spelling a rotation:
11791 // [11, 12, 13, 14, 15, 0, 1, 2]
11792 // [-1, 12, 13, 14, -1, -1, 1, -1]
11793 // [-1, -1, -1, -1, -1, -1, 1, 2]
11794 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11795 // [-1, 4, 5, 6, -1, -1, 9, -1]
11796 // [-1, 4, 5, 6, -1, -1, -1, -1]
11797 int Rotation = 0;
11798 SDValue Lo, Hi;
11799 for (int i = 0; i < NumElts; ++i) {
11800 int M = Mask[i];
11801 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11802 "Unexpected mask index.");
11803 if (M < 0)
11804 continue;
11805
11806 // Determine where a rotated vector would have started.
11807 int StartIdx = i - (M % NumElts);
11808 if (StartIdx == 0)
11809 // The identity rotation isn't interesting, stop.
11810 return -1;
11811
11812 // If we found the tail of a vector the rotation must be the missing
11813 // front. If we found the head of a vector, it must be how much of the
11814 // head.
11815 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11816
11817 if (Rotation == 0)
11818 Rotation = CandidateRotation;
11819 else if (Rotation != CandidateRotation)
11820 // The rotations don't match, so we can't match this mask.
11821 return -1;
11822
11823 // Compute which value this mask is pointing at.
11824 SDValue MaskV = M < NumElts ? V1 : V2;
11825
11826 // Compute which of the two target values this index should be assigned
11827 // to. This reflects whether the high elements are remaining or the low
11828 // elements are remaining.
11829 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11830
11831 // Either set up this value if we've not encountered it before, or check
11832 // that it remains consistent.
11833 if (!TargetV)
11834 TargetV = MaskV;
11835 else if (TargetV != MaskV)
11836 // This may be a rotation, but it pulls from the inputs in some
11837 // unsupported interleaving.
11838 return -1;
11839 }
11840
11841 // Check that we successfully analyzed the mask, and normalize the results.
11842 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11843 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11844 if (!Lo)
11845 Lo = Hi;
11846 else if (!Hi)
11847 Hi = Lo;
11848
11849 V1 = Lo;
11850 V2 = Hi;
11851
11852 return Rotation;
11853}
11854
11855/// Try to lower a vector shuffle as a byte rotation.
11856///
11857/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11858/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11859/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11860/// try to generically lower a vector shuffle through such an pattern. It
11861/// does not check for the profitability of lowering either as PALIGNR or
11862/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11863/// This matches shuffle vectors that look like:
11864///
11865/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11866///
11867/// Essentially it concatenates V1 and V2, shifts right by some number of
11868/// elements, and takes the low elements as the result. Note that while this is
11869/// specified as a *right shift* because x86 is little-endian, it is a *left
11870/// rotate* of the vector lanes.
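///
/// Illustrative sketch: the v8i16 example above matches an element rotation
/// of 3, which is scaled by the 2 bytes per element to a byte rotation of 6.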
11871static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11872 ArrayRef<int> Mask) {
11873 // Don't accept any shuffles with zero elements.
11874 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
11875 return -1;
11876
11877 // PALIGNR works on 128-bit lanes.
11878 SmallVector<int, 16> RepeatedMask;
11879 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11880 return -1;
11881
11882 int Rotation = matchShuffleAsByteRotate(V1, V2, RepeatedMask);
11883 if (Rotation <= 0)
11884 return -1;
11885
11886 // PALIGNR rotates bytes, so we need to scale the
11887 // rotation based on how many bytes are in the vector lane.
11888 int NumElts = RepeatedMask.size();
11889 int Scale = 16 / NumElts;
11890 return Rotation * Scale;
11891}
11892
11893static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11894 SDValue V2, ArrayRef<int> Mask,
11895 const X86Subtarget &Subtarget,
11896 SelectionDAG &DAG) {
11897 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11898
11899 SDValue Lo = V1, Hi = V2;
11900 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11901 if (ByteRotation <= 0)
11902 return SDValue();
11903
11904 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11905 // PSLLDQ/PSRLDQ.
11906 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11907 Lo = DAG.getBitcast(ByteVT, Lo);
11908 Hi = DAG.getBitcast(ByteVT, Hi);
11909
11910 // SSSE3 targets can use the palignr instruction.
11911 if (Subtarget.hasSSSE3()) {
11912 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11913 "512-bit PALIGNR requires BWI instructions");
11914 return DAG.getBitcast(
11915 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11916 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11917 }
11918
11919 assert(VT.is128BitVector() &&
11920 "Rotate-based lowering only supports 128-bit lowering!");
11921 assert(Mask.size() <= 16 &&
11922 "Can shuffle at most 16 bytes in a 128-bit vector!");
11923 assert(ByteVT == MVT::v16i8 &&
11924 "SSE2 rotate lowering only needed for v16i8!");
11925
11926 // Default SSE2 implementation
11927 int LoByteShift = 16 - ByteRotation;
11928 int HiByteShift = ByteRotation;
11929
11930 SDValue LoShift =
11931 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11932 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11933 SDValue HiShift =
11934 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11935 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11936 return DAG.getBitcast(VT,
11937 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11938}
11939
11940/// Try to lower a vector shuffle as a dword/qword rotation.
11941///
11942 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
11943 /// rotation of the concatenation of two vectors; this routine will
11944 /// try to generically lower a vector shuffle through such a pattern.
11945///
11946/// Essentially it concatenates V1 and V2, shifts right by some number of
11947/// elements, and takes the low elements as the result. Note that while this is
11948/// specified as a *right shift* because x86 is little-endian, it is a *left
11949/// rotate* of the vector lanes.
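///
/// Illustrative sketch (hypothetical v8i32 mask): {3, 4, 5, 6, 7, 8, 9, 10}
/// matches an element rotation of 3 and lowers to a VALIGND with immediate 3.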
11950static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
11951 SDValue V2, ArrayRef<int> Mask,
11952 const X86Subtarget &Subtarget,
11953 SelectionDAG &DAG) {
11954 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
11955 "Only 32-bit and 64-bit elements are supported!");
11956
11957 // 128/256-bit vectors are only supported with VLX.
11958 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
11959 && "VLX required for 128/256-bit vectors");
11960
11961 SDValue Lo = V1, Hi = V2;
11962 int Rotation = matchShuffleAsByteRotate(Lo, Hi, Mask);
11963 if (Rotation <= 0)
11964 return SDValue();
11965
11966 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
11967 DAG.getTargetConstant(Rotation, DL, MVT::i8));
11968}
11969
11970/// Try to lower a vector shuffle as a byte shift sequence.
11971static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
11972 SDValue V2, ArrayRef<int> Mask,
11973 const APInt &Zeroable,
11974 const X86Subtarget &Subtarget,
11975 SelectionDAG &DAG) {
11976 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11977 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
11978
11979 // We need a shuffle that has zeros at one/both ends and a sequential
11980 // shuffle from one source within.
11981 unsigned ZeroLo = Zeroable.countTrailingOnes();
11982 unsigned ZeroHi = Zeroable.countLeadingOnes();
11983 if (!ZeroLo && !ZeroHi)
11984 return SDValue();
11985
11986 unsigned NumElts = Mask.size();
11987 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11988 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
11989 return SDValue();
11990
11991 unsigned Scale = VT.getScalarSizeInBits() / 8;
11992 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
11993 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
11994 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
11995 return SDValue();
11996
11997 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11998 Res = DAG.getBitcast(MVT::v16i8, Res);
11999
12000 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12001 // inner sequential set of elements, possibly offset:
12002 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12003 // 01234567 --> 4567zzzz --> zzzzz456
12004 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12005 if (ZeroLo == 0) {
12006 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12007 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12008 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12009 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12010 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12011 } else if (ZeroHi == 0) {
12012 unsigned Shift = Mask[ZeroLo] % NumElts;
12013 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12014 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12015 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12016 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12017 } else if (!Subtarget.hasSSSE3()) {
12018 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12019 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12020 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12021 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12022 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12023 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12024 Shift += Mask[ZeroLo] % NumElts;
12025 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12026 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12027 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12028 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12029 } else
12030 return SDValue();
12031
12032 return DAG.getBitcast(VT, Res);
12033}
12034
12035/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12036///
12037/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12038/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12039/// matches elements from one of the input vectors shuffled to the left or
12040/// right with zeroable elements 'shifted in'. It handles both the strictly
12041/// bit-wise element shifts and the byte shift across an entire 128-bit double
12042/// quad word lane.
12043///
12044 /// PSLL : (little-endian) left bit shift.
12045/// [ zz, 0, zz, 2 ]
12046/// [ -1, 4, zz, -1 ]
12047/// PSRL : (little-endian) right bit shift.
12048/// [ 1, zz, 3, zz]
12049/// [ -1, -1, 7, zz]
12050/// PSLLDQ : (little-endian) left byte shift
12051/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12052/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12053/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12054/// PSRLDQ : (little-endian) right byte shift
12055/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12056/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12057/// [ 1, 2, -1, -1, -1, -1, zz, zz]
12058static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12059 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12060 int MaskOffset, const APInt &Zeroable,
12061 const X86Subtarget &Subtarget) {
12062 int Size = Mask.size();
12063 unsigned SizeInBits = Size * ScalarSizeInBits;
12064
12065 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12066 for (int i = 0; i < Size; i += Scale)
12067 for (int j = 0; j < Shift; ++j)
12068 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12069 return false;
12070
12071 return true;
12072 };
12073
12074 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12075 for (int i = 0; i != Size; i += Scale) {
12076 unsigned Pos = Left ? i + Shift : i;
12077 unsigned Low = Left ? i : i + Shift;
12078 unsigned Len = Scale - Shift;
12079 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12080 return -1;
12081 }
12082
12083 int ShiftEltBits = ScalarSizeInBits * Scale;
12084 bool ByteShift = ShiftEltBits > 64;
12085 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12086 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12087 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12088
12089 // Normalize the scale for byte shifts to still produce an i64 element
12090 // type.
12091 Scale = ByteShift ? Scale / 2 : Scale;
12092
12093 // We need to round trip through the appropriate type for the shift.
12094 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12095 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12096 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12097 return (int)ShiftAmt;
12098 };
12099
12100 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12101 // keep doubling the size of the integer elements up to that. We can
12102 // then shift the elements of the integer vector by whole multiples of
12103 // their width within the elements of the larger integer vector. Test each
12104 // multiple to see if we can find a match with the moved element indices
12105 // and that the shifted in elements are all zeroable.
12106 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12107 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12108 for (int Shift = 1; Shift != Scale; ++Shift)
12109 for (bool Left : {true, false})
12110 if (CheckZeros(Shift, Scale, Left)) {
12111 int ShiftAmt = MatchShift(Shift, Scale, Left);
12112 if (0 < ShiftAmt)
12113 return ShiftAmt;
12114 }
12115
12116 // no match
12117 return -1;
12118}
12119
12120static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12121 SDValue V2, ArrayRef<int> Mask,
12122 const APInt &Zeroable,
12123 const X86Subtarget &Subtarget,
12124 SelectionDAG &DAG) {
12125 int Size = Mask.size();
12126 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12127
12128 MVT ShiftVT;
12129 SDValue V = V1;
12130 unsigned Opcode;
12131
12132 // Try to match shuffle against V1 shift.
12133 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12134 Mask, 0, Zeroable, Subtarget);
12135
12136 // If V1 failed, try to match shuffle against V2 shift.
12137 if (ShiftAmt < 0) {
12138 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12139 Mask, Size, Zeroable, Subtarget);
12140 V = V2;
12141 }
12142
12143 if (ShiftAmt < 0)
12144 return SDValue();
12145
12146 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12147 "Illegal integer vector type");
12148 V = DAG.getBitcast(ShiftVT, V);
12149 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12150 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12151 return DAG.getBitcast(VT, V);
12152}
12153
12154// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12155// Remainder of lower half result is zero and upper half is all undef.
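// Illustrative sketch (hypothetical v16i8 mask): {2, 3, 4, 5, zz, zz, zz, zz,
// <upper half undef>} matches Len == 4 and Idx == 2, giving BitLen == 32 and
// BitIdx == 16.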
12156static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12157 ArrayRef<int> Mask, uint64_t &BitLen,
12158 uint64_t &BitIdx, const APInt &Zeroable) {
12159 int Size = Mask.size();
12160 int HalfSize = Size / 2;
12161 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12162 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
12163
12164 // Upper half must be undefined.
12165 if (!isUndefUpperHalf(Mask))
12166 return false;
12167
12168 // Determine the extraction length from the part of the
12169 // lower half that isn't zeroable.
12170 int Len = HalfSize;
12171 for (; Len > 0; --Len)
12172 if (!Zeroable[Len - 1])
12173 break;
12174 assert(Len > 0 && "Zeroable shuffle mask");
12175
12176 // Attempt to match first Len sequential elements from the lower half.
12177 SDValue Src;
12178 int Idx = -1;
12179 for (int i = 0; i != Len; ++i) {
12180 int M = Mask[i];
12181 if (M == SM_SentinelUndef)
12182 continue;
12183 SDValue &V = (M < Size ? V1 : V2);
12184 M = M % Size;
12185
12186 // The extracted elements must start at a valid index and all mask
12187 // elements must be in the lower half.
12188 if (i > M || M >= HalfSize)
12189 return false;
12190
12191 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12192 Src = V;
12193 Idx = M - i;
12194 continue;
12195 }
12196 return false;
12197 }
12198
12199 if (!Src || Idx < 0)
12200 return false;
12201
12202 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12203 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12204 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12205 V1 = Src;
12206 return true;
12207}
12208
12209// INSERTQ: Extract lowest Len elements from lower half of second source and
12210// insert over first source, starting at Idx.
12211// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12212static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12213 ArrayRef<int> Mask, uint64_t &BitLen,
12214 uint64_t &BitIdx) {
12215 int Size = Mask.size();
12216 int HalfSize = Size / 2;
12217 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12218
12219 // Upper half must be undefined.
12220 if (!isUndefUpperHalf(Mask))
12221 return false;
12222
12223 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12224 SDValue Base;
12225
12226 // Attempt to match first source from mask before insertion point.
12227 if (isUndefInRange(Mask, 0, Idx)) {
12228 /* EMPTY */
12229 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12230 Base = V1;
12231 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12232 Base = V2;
12233 } else {
12234 continue;
12235 }
12236
12237 // Extend the extraction length looking to match both the insertion of
12238 // the second source and the remaining elements of the first.
12239 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12240 SDValue Insert;
12241 int Len = Hi - Idx;
12242
12243 // Match insertion.
12244 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12245 Insert = V1;
12246 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12247 Insert = V2;
12248 } else {
12249 continue;
12250 }
12251
12252 // Match the remaining elements of the lower half.
12253 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12254 /* EMPTY */
12255 } else if ((!Base || (Base == V1)) &&
12256 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12257 Base = V1;
12258 } else if ((!Base || (Base == V2)) &&
12259 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12260 Size + Hi)) {
12261 Base = V2;
12262 } else {
12263 continue;
12264 }
12265
12266 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12267 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12268 V1 = Base;
12269 V2 = Insert;
12270 return true;
12271 }
12272 }
12273
12274 return false;
12275}
12276
12277/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12278static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12279 SDValue V2, ArrayRef<int> Mask,
12280 const APInt &Zeroable, SelectionDAG &DAG) {
12281 uint64_t BitLen, BitIdx;
12282 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12283 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12284 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12285 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12286
12287 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12288 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12289 V2 ? V2 : DAG.getUNDEF(VT),
12290 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12291 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12292
12293 return SDValue();
12294}
12295
12296/// Lower a vector shuffle as a zero or any extension.
12297///
12298/// Given a specific number of elements, element bit width, and extension
12299/// stride, produce either a zero or any extension based on the available
12300 /// features of the subtarget. The extended elements are consecutive and
12301 /// can start from an offset element index in the input; to avoid excess
12302 /// shuffling, the offset must either be in the bottom lane or at the
12303 /// start of a higher lane. All extended elements must be from
12304/// the same lane.
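///
/// Illustrative sketch (hypothetical v16i8 input, Scale == 4, Offset == 0,
/// zero extension): the mask {0, zz, zz, zz, 1, zz, zz, zz, 2, zz, zz, zz,
/// 3, zz, zz, zz} extends the low four i8 elements to i32, which on SSE4.1
/// typically becomes a PMOVZXBD.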
12305static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
12306 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
12307 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12308 assert(Scale > 1 && "Need a scale to extend.");
12309 int EltBits = VT.getScalarSizeInBits();
12310 int NumElements = VT.getVectorNumElements();
12311 int NumEltsPerLane = 128 / EltBits;
12312 int OffsetLane = Offset / NumEltsPerLane;
12313 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12314 "Only 8, 16, and 32 bit elements can be extended.");
12315 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12316 assert(0 <= Offset && "Extension offset must be positive.");
12317 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12318 "Extension offset must be in the first lane or start an upper lane.");
12319
12320 // Check that an index is in same lane as the base offset.
12321 auto SafeOffset = [&](int Idx) {
12322 return OffsetLane == (Idx / NumEltsPerLane);
12323 };
12324
12325 // Shift along an input so that the offset base moves to the first element.
12326 auto ShuffleOffset = [&](SDValue V) {
12327 if (!Offset)
12328 return V;
12329
12330 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12331 for (int i = 0; i * Scale < NumElements; ++i) {
12332 int SrcIdx = i + Offset;
12333 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12334 }
12335 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12336 };
12337
12338 // Found a valid a/zext mask! Try various lowering strategies based on the
12339 // input type and available ISA extensions.
12340 if (Subtarget.hasSSE41()) {
12341 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12342 // PUNPCK will catch this in a later shuffle match.
12343 if (Offset && Scale == 2 && VT.is128BitVector())
12344 return SDValue();
12345 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12346 NumElements / Scale);
12347 InputV = ShuffleOffset(InputV);
12348 InputV = getExtendInVec(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND, DL,
12349 ExtVT, InputV, DAG);
12350 return DAG.getBitcast(VT, InputV);
12351 }
12352
12353 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12354
12355 // For any extends we can cheat for larger element sizes and use shuffle
12356 // instructions that can fold with a load and/or copy.
12357 if (AnyExt && EltBits == 32) {
12358 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12359 -1};
12360 return DAG.getBitcast(
12361 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12362 DAG.getBitcast(MVT::v4i32, InputV),
12363 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12364 }
12365 if (AnyExt && EltBits == 16 && Scale > 2) {
12366 int PSHUFDMask[4] = {Offset / 2, -1,
12367 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12368 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12369 DAG.getBitcast(MVT::v4i32, InputV),
12370 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12371 int PSHUFWMask[4] = {1, -1, -1, -1};
12372 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12373 return DAG.getBitcast(
12374 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12375 DAG.getBitcast(MVT::v8i16, InputV),
12376 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12377 }
12378
12379 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12380 // to 64-bits.
12381 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12382 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12383 assert(VT.is128BitVector() && "Unexpected vector width!");
12384
12385 int LoIdx = Offset * EltBits;
12386 SDValue Lo = DAG.getBitcast(
12387 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12388 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12389 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12390
12391 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12392 return DAG.getBitcast(VT, Lo);
12393
12394 int HiIdx = (Offset + 1) * EltBits;
12395 SDValue Hi = DAG.getBitcast(
12396 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12397 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12398 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12399 return DAG.getBitcast(VT,
12400 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12401 }
12402
12403 // If this would require more than 2 unpack instructions to expand, use
12404 // pshufb when available. We can only use more than 2 unpack instructions
12405 // when zero extending i8 elements which also makes it easier to use pshufb.
12406 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12407 assert(NumElements == 16 && "Unexpected byte vector width!");
12408 SDValue PSHUFBMask[16];
12409 for (int i = 0; i < 16; ++i) {
12410 int Idx = Offset + (i / Scale);
12411 if ((i % Scale == 0 && SafeOffset(Idx))) {
12412 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12413 continue;
12414 }
12415 PSHUFBMask[i] =
12416 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12417 }
12418 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12419 return DAG.getBitcast(
12420 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12421 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12422 }
12423
12424 // If we are extending from an offset, ensure we start on a boundary that
12425 // we can unpack from.
12426 int AlignToUnpack = Offset % (NumElements / Scale);
12427 if (AlignToUnpack) {
12428 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12429 for (int i = AlignToUnpack; i < NumElements; ++i)
12430 ShMask[i - AlignToUnpack] = i;
12431 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12432 Offset -= AlignToUnpack;
12433 }
12434
12435 // Otherwise emit a sequence of unpacks.
12436 do {
12437 unsigned UnpackLoHi = X86ISD::UNPCKL;
12438 if (Offset >= (NumElements / 2)) {
12439 UnpackLoHi = X86ISD::UNPCKH;
12440 Offset -= (NumElements / 2);
12441 }
12442
12443 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12444 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12445 : getZeroVector(InputVT, Subtarget, DAG, DL);
12446 InputV = DAG.getBitcast(InputVT, InputV);
12447 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12448 Scale /= 2;
12449 EltBits *= 2;
12450 NumElements /= 2;
12451 } while (Scale > 1);
12452 return DAG.getBitcast(VT, InputV);
12453}
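
A minimal standalone sketch of the PSHUFB control built in the Scale > 4 branch above (not part of X86ISelLowering.cpp; buildZExtPshufbMask is an invented name and the bounds check is simplified to Idx < 16). A control byte of 0x80 asks PSHUFB to write zero, so only the low byte of each widened element keeps data:

#include <array>
#include <cassert>
#include <cstdint>

static std::array<uint8_t, 16> buildZExtPshufbMask(int Scale, int Offset) {
  std::array<uint8_t, 16> M{};
  for (int i = 0; i < 16; ++i) {
    int Idx = Offset + (i / Scale);
    // Keep one source byte per widened element; zero the rest (the real lowering
    // uses undef rather than zero for any-extends).
    M[i] = (i % Scale == 0 && Idx < 16) ? static_cast<uint8_t>(Idx) : 0x80;
  }
  return M;
}

int main() {
  // Zero-extending v16i8 -> v4i32 (Scale = 4, Offset = 0): bytes 0,4,8,12 pick
  // source bytes 0..3 and every other byte is zeroed.
  auto M = buildZExtPshufbMask(4, 0);
  assert(M[0] == 0 && M[4] == 1 && M[8] == 2 && M[12] == 3);
  assert(M[1] == 0x80 && M[15] == 0x80);
  return 0;
}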
12454
12455/// Try to lower a vector shuffle as a zero extension on any microarch.
12456///
12457/// This routine will try to do everything in its power to cleverly lower
12458/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12459 /// check for the profitability of this lowering; it tries to aggressively
12460/// match this pattern. It will use all of the micro-architectural details it
12461/// can to emit an efficient lowering. It handles both blends with all-zero
12462/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12463/// masking out later).
12464///
12465/// The reason we have dedicated lowering for zext-style shuffles is that they
12466/// are both incredibly common and often quite performance sensitive.
12467static SDValue lowerShuffleAsZeroOrAnyExtend(
12468 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12469 const APInt &Zeroable, const X86Subtarget &Subtarget,
12470 SelectionDAG &DAG) {
12471 int Bits = VT.getSizeInBits();
12472 int NumLanes = Bits / 128;
12473 int NumElements = VT.getVectorNumElements();
12474 int NumEltsPerLane = NumElements / NumLanes;
12475 assert(VT.getScalarSizeInBits() <= 32 &&
12476 "Exceeds 32-bit integer zero extension limit");
12477 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12478
12479 // Define a helper function to check a particular ext-scale and lower to it if
12480 // valid.
12481 auto Lower = [&](int Scale) -> SDValue {
12482 SDValue InputV;
12483 bool AnyExt = true;
12484 int Offset = 0;
12485 int Matches = 0;
12486 for (int i = 0; i < NumElements; ++i) {
12487 int M = Mask[i];
12488 if (M < 0)
12489 continue; // Valid anywhere but doesn't tell us anything.
12490 if (i % Scale != 0) {
12491 // Each of the extended elements needs to be zeroable.
12492 if (!Zeroable[i])
12493 return SDValue();
12494
12495 // We no longer are in the anyext case.
12496 AnyExt = false;
12497 continue;
12498 }
12499
12500 // Each of the base elements needs to be consecutive indices into the
12501 // same input vector.
12502 SDValue V = M < NumElements ? V1 : V2;
12503 M = M % NumElements;
12504 if (!InputV) {
12505 InputV = V;
12506 Offset = M - (i / Scale);
12507 } else if (InputV != V)
12508 return SDValue(); // Flip-flopping inputs.
12509
12510 // Offset must start in the lowest 128-bit lane or at the start of an
12511 // upper lane.
12512 // FIXME: Is it ever worth allowing a negative base offset?
12513 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12514 (Offset % NumEltsPerLane) == 0))
12515 return SDValue();
12516
12517 // If we are offsetting, all referenced entries must come from the same
12518 // lane.
12519 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12520 return SDValue();
12521
12522 if ((M % NumElements) != (Offset + (i / Scale)))
12523 return SDValue(); // Non-consecutive strided elements.
12524 Matches++;
12525 }
12526
12527 // If we fail to find an input, we have a zero-shuffle which should always
12528 // have already been handled.
12529 // FIXME: Maybe handle this here in case during blending we end up with one?
12530 if (!InputV)
12531 return SDValue();
12532
12533 // If we are offsetting, don't extend if we only match a single input, we
12534 // can always do better by using a basic PSHUF or PUNPCK.
12535 if (Offset != 0 && Matches < 2)
12536 return SDValue();
12537
12538 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
12539 InputV, Mask, Subtarget, DAG);
12540 };
12541
12542 // The widest scale possible for extending is to a 64-bit integer.
12543 assert(Bits % 64 == 0 &&
12544 "The number of bits in a vector must be divisible by 64 on x86!");
12545 int NumExtElements = Bits / 64;
12546
12547 // Each iteration, try extending the elements half as much, but into twice as
12548 // many elements.
12549 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12550 assert(NumElements % NumExtElements == 0 &&
12551 "The input vector size must be divisible by the extended size.");
12552 if (SDValue V = Lower(NumElements / NumExtElements))
12553 return V;
12554 }
12555
12556 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12557 if (Bits != 128)
12558 return SDValue();
12559
12560 // Returns one of the source operands if the shuffle can be reduced to a
12561 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12562 auto CanZExtLowHalf = [&]() {
12563 for (int i = NumElements / 2; i != NumElements; ++i)
12564 if (!Zeroable[i])
12565 return SDValue();
12566 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12567 return V1;
12568 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12569 return V2;
12570 return SDValue();
12571 };
12572
12573 if (SDValue V = CanZExtLowHalf()) {
12574 V = DAG.getBitcast(MVT::v2i64, V);
12575 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12576 return DAG.getBitcast(VT, V);
12577 }
12578
12579 // No viable ext lowering found.
12580 return SDValue();
12581}
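
A trimmed standalone model of the per-scale check performed by the Lower lambda above, assuming a single input and a zero offset (matchesZExtAtScale is an invented name; -1 marks undef lanes and Zeroable marks lanes known to be zero):

#include <cassert>
#include <vector>

static bool matchesZExtAtScale(const std::vector<int> &Mask,
                               const std::vector<bool> &Zeroable, int Scale) {
  for (int i = 0, e = (int)Mask.size(); i < e; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                        // Undef is acceptable anywhere.
    if (i % Scale != 0) {
      if (!Zeroable[i])
        return false;                  // Stretched lanes must be zeroable.
      continue;
    }
    if (M != i / Scale)
      return false;                    // Base lanes must be consecutive from element 0.
  }
  return true;
}

int main() {
  // v8i16 mask <0,z,1,z,2,z,3,z> (z = zeroable) is a v8i16 -> v4i32 zero
  // extension at Scale = 2, but not at Scale = 4.
  std::vector<int> Mask = {0, 8, 1, 8, 2, 8, 3, 8};
  std::vector<bool> Zeroable = {false, true, false, true, false, true, false, true};
  assert(matchesZExtAtScale(Mask, Zeroable, 2));
  assert(!matchesZExtAtScale(Mask, Zeroable, 4));
  return 0;
}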
12582
12583/// Try to get a scalar value for a specific element of a vector.
12584///
12585/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12586static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12587 SelectionDAG &DAG) {
12588 MVT VT = V.getSimpleValueType();
12589 MVT EltVT = VT.getVectorElementType();
12590 V = peekThroughBitcasts(V);
12591
12592 // If the bitcasts shift the element size, we can't extract an equivalent
12593 // element from it.
12594 MVT NewVT = V.getSimpleValueType();
12595 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12596 return SDValue();
12597
12598 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12599 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12600 // Ensure the scalar operand is the same size as the destination.
12601 // FIXME: Add support for scalar truncation where possible.
12602 SDValue S = V.getOperand(Idx);
12603 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12604 return DAG.getBitcast(EltVT, S);
12605 }
12606
12607 return SDValue();
12608}
12609
12610/// Helper to test for a load that can be folded with x86 shuffles.
12611///
12612/// This is particularly important because the set of instructions varies
12613/// significantly based on whether the operand is a load or not.
12614static bool isShuffleFoldableLoad(SDValue V) {
12615 V = peekThroughBitcasts(V);
12616 return ISD::isNON_EXTLoad(V.getNode());
12617}
12618
12619/// Try to lower insertion of a single element into a zero vector.
12620///
12621 /// This is a common pattern for which we have especially efficient lowerings
12622 /// across all subtarget feature sets.
12623static SDValue lowerShuffleAsElementInsertion(
12624 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12625 const APInt &Zeroable, const X86Subtarget &Subtarget,
12626 SelectionDAG &DAG) {
12627 MVT ExtVT = VT;
12628 MVT EltVT = VT.getVectorElementType();
12629
12630 int V2Index =
12631 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12632 Mask.begin();
12633 bool IsV1Zeroable = true;
12634 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12635 if (i != V2Index && !Zeroable[i]) {
12636 IsV1Zeroable = false;
12637 break;
12638 }
12639
12640 // Check for a single input from a SCALAR_TO_VECTOR node.
12641 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12642 // all the smarts here sunk into that routine. However, the current
12643 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12644 // vector shuffle lowering is dead.
12645 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12646 DAG);
12647 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12648 // We need to zext the scalar if it is smaller than an i32.
12649 V2S = DAG.getBitcast(EltVT, V2S);
12650 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
12651 // Using zext to expand a narrow element won't work for non-zero
12652 // insertions.
12653 if (!IsV1Zeroable)
12654 return SDValue();
12655
12656 // Zero-extend directly to i32.
12657 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12658 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12659 }
12660 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12661 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12662 EltVT == MVT::i16) {
12663 // Either not inserting from the low element of the input or the input
12664 // element size is too small to use VZEXT_MOVL to clear the high bits.
12665 return SDValue();
12666 }
12667
12668 if (!IsV1Zeroable) {
12669 // If V1 can't be treated as a zero vector we have fewer options to lower
12670 // this. We can't support integer vectors or non-zero targets cheaply, and
12671 // the V1 elements can't be permuted in any way.
12672 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12673 if (!VT.isFloatingPoint() || V2Index != 0)
12674 return SDValue();
12675 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
12676 V1Mask[V2Index] = -1;
12677 if (!isNoopShuffleMask(V1Mask))
12678 return SDValue();
12679 if (!VT.is128BitVector())
12680 return SDValue();
12681
12682 // Otherwise, use MOVSD or MOVSS.
12683 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
12684 "Only two types of floating point element types to handle!");
12685 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
12686 ExtVT, V1, V2);
12687 }
12688
12689 // This lowering only works for the low element with floating point vectors.
12690 if (VT.isFloatingPoint() && V2Index != 0)
12691 return SDValue();
12692
12693 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12694 if (ExtVT != VT)
12695 V2 = DAG.getBitcast(VT, V2);
12696
12697 if (V2Index != 0) {
12698 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12699 // the desired position. Otherwise it is more efficient to do a vector
12700 // shift left. We know that we can do a vector shift left because all
12701 // the inputs are zero.
12702 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
12703 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12704 V2Shuffle[V2Index] = 0;
12705 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12706 } else {
12707 V2 = DAG.getBitcast(MVT::v16i8, V2);
12708 V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12709 DAG.getTargetConstant(
12710 V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
12711 V2 = DAG.getBitcast(VT, V2);
12712 }
12713 }
12714 return V2;
12715}
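
A small standalone sketch of the byte count fed to the VSHLDQ (PSLLDQ) path above when the inserted element must land in a lane other than 0 and no cheap shuffle applies (insertShiftBytes is an invented name):

#include <cassert>

static unsigned insertShiftBytes(unsigned V2Index, unsigned EltSizeInBits) {
  return V2Index * EltSizeInBits / 8;   // Lane index times element size in bytes.
}

int main() {
  assert(insertShiftBytes(2, 32) == 8);  // v4i32 lane 2 -> shift left by 8 bytes.
  assert(insertShiftBytes(5, 16) == 10); // v8i16 lane 5 -> shift left by 10 bytes.
  return 0;
}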
12716
12717 /// Try to lower a broadcast of a single (truncated) integer element coming
12718 /// from a scalar_to_vector/build_vector node \p V0 with larger elements.
12719///
12720/// This assumes we have AVX2.
12721static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12722 int BroadcastIdx,
12723 const X86Subtarget &Subtarget,
12724 SelectionDAG &DAG) {
12725 assert(Subtarget.hasAVX2() &&
12726 "We can only lower integer broadcasts with AVX2!");
12727
12728 EVT EltVT = VT.getVectorElementType();
12729 EVT V0VT = V0.getValueType();
12730
12731 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12732 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12733
12734 EVT V0EltVT = V0VT.getVectorElementType();
12735 if (!V0EltVT.isInteger())
12736 return SDValue();
12737
12738 const unsigned EltSize = EltVT.getSizeInBits();
12739 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12740
12741 // This is only a truncation if the original element type is larger.
12742 if (V0EltSize <= EltSize)
12743 return SDValue();
12744
12745 assert(((V0EltSize % EltSize) == 0) &&
12746 "Scalar type sizes must all be powers of 2 on x86!");
12747
12748 const unsigned V0Opc = V0.getOpcode();
12749 const unsigned Scale = V0EltSize / EltSize;
12750 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12751
12752 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12753 V0Opc != ISD::BUILD_VECTOR)
12754 return SDValue();
12755
12756 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12757
12758 // If we're extracting non-least-significant bits, shift so we can truncate.
12759 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12760 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12761 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12762 if (const int OffsetIdx = BroadcastIdx % Scale)
12763 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12764 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12765
12766 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12767 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12768}
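
A standalone sketch of the index and shift arithmetic used above when the broadcast element is a truncated slice of a wider scalar; the names are invented and little-endian lane numbering is assumed, as in the lowering itself:

#include <cassert>

struct TruncBroadcastPlan {
  unsigned WideElt;   // Which wide element holds the data.
  unsigned ShiftBits; // How far to shift right before truncating.
};

static TruncBroadcastPlan planTruncBroadcast(unsigned BroadcastIdx,
                                             unsigned V0EltSize, unsigned EltSize) {
  unsigned Scale = V0EltSize / EltSize;          // Narrow elements per wide element.
  return {BroadcastIdx / Scale, (BroadcastIdx % Scale) * EltSize};
}

int main() {
  // Broadcasting i16 element 5 out of a build_vector of i64s: the data lives in
  // wide element 1, bits [16, 32), so shift right by 16 and then truncate.
  TruncBroadcastPlan P = planTruncBroadcast(5, 64, 16);
  assert(P.WideElt == 1 && P.ShiftBits == 16);
  return 0;
}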
12769
12770/// Test whether this can be lowered with a single SHUFPS instruction.
12771///
12772/// This is used to disable more specialized lowerings when the shufps lowering
12773/// will happen to be efficient.
12774static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12775 // This routine only handles 128-bit shufps.
12776 assert(Mask.size() == 4 && "Unsupported mask size!");
12777 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12778 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12779 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12780 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12781
12782 // To lower with a single SHUFPS we need to have the low half and high half
12783 // each requiring a single input.
12784 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12785 return false;
12786 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12787 return false;
12788
12789 return true;
12790}
12791
12792/// If we are extracting two 128-bit halves of a vector and shuffling the
12793/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12794/// multi-shuffle lowering.
12795static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12796 SDValue N1, ArrayRef<int> Mask,
12797 SelectionDAG &DAG) {
12798 EVT VT = N0.getValueType();
12799 assert((VT.is128BitVector() &&
12800 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12801 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12802
12803 // Check that both sources are extracts of the same source vector.
12804 if (!N0.hasOneUse() || !N1.hasOneUse() ||
12805 N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12806 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12807 N0.getOperand(0) != N1.getOperand(0))
12808 return SDValue();
12809
12810 SDValue WideVec = N0.getOperand(0);
12811 EVT WideVT = WideVec.getValueType();
12812 if (!WideVT.is256BitVector() || !isa<ConstantSDNode>(N0.getOperand(1)) ||
12813 !isa<ConstantSDNode>(N1.getOperand(1)))
12814 return SDValue();
12815
12816 // Match extracts of each half of the wide source vector. Commute the shuffle
12817 // if the extract of the low half is N1.
12818 unsigned NumElts = VT.getVectorNumElements();
12819 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
12820 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12821 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12822 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12823 ShuffleVectorSDNode::commuteMask(NewMask);
12824 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12825 return SDValue();
12826
12827 // Final bailout: if the mask is simple, we are better off using an extract
12828 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12829 // because that avoids a constant load from memory.
12830 if (NumElts == 4 &&
12831 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
12832 return SDValue();
12833
12834 // Extend the shuffle mask with undef elements.
12835 NewMask.append(NumElts, -1);
12836
12837 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12838 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
12839 NewMask);
12840 // This is free: ymm -> xmm.
12841 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
12842 DAG.getIntPtrConstant(0, DL));
12843}
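
A standalone sketch of the mask rewrite this transform performs (widenHalfExtractMask is an invented name): values 4..7 in the narrow two-operand mask already name elements 4..7 of the wide source, so the wide single-operand mask is just the narrow mask padded with undef lanes:

#include <cassert>
#include <vector>

static std::vector<int> widenHalfExtractMask(std::vector<int> Mask, unsigned NumElts) {
  Mask.insert(Mask.end(), NumElts, -1);  // The upper result lanes are don't-care.
  return Mask;
}

int main() {
  // shuf (extract X, 0), (extract X, 4), <0,7,2,5>
  //   --> extract (shuf X, undef, <0,7,2,5,-1,-1,-1,-1>), 0
  // {0,7,2,5} is neither a single-SHUFPS nor an unpack mask, so the routine
  // above would not take its "simple mask" bailout for it.
  std::vector<int> Wide = widenHalfExtractMask({0, 7, 2, 5}, 4);
  assert(Wide == (std::vector<int>{0, 7, 2, 5, -1, -1, -1, -1}));
  return 0;
}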
12844
12845/// Try to lower broadcast of a single element.
12846///
12847/// For convenience, this code also bundles all of the subtarget feature set
12848/// filtering. While a little annoying to re-dispatch on type here, there isn't
12849/// a convenient way to factor it out.
12850static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
12851 SDValue V2, ArrayRef<int> Mask,
12852 const X86Subtarget &Subtarget,
12853 SelectionDAG &DAG) {
12854 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
12855 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
12856 (Subtarget.hasAVX2() && VT.isInteger())))
12857 return SDValue();
12858
12859 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
12860 // we can only broadcast from a register with AVX2.
12861 unsigned NumEltBits = VT.getScalarSizeInBits();
12862 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
12863 ? X86ISD::MOVDDUP
12864 : X86ISD::VBROADCAST;
12865 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
12866
12867 // Check that the mask is a broadcast.
12868 int BroadcastIdx = getSplatIndex(Mask);
12869 if (BroadcastIdx < 0)
12870 return SDValue();
12871 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
12872 "a sorted mask where the broadcast "
12873 "comes from V1.");
12874
12875 // Go up the chain of (vector) values to find a scalar load that we can
12876 // combine with the broadcast.
12877 // TODO: Combine this logic with findEltLoadSrc() used by
12878 // EltsFromConsecutiveLoads().
12879 int BitOffset = BroadcastIdx * NumEltBits;
12880 SDValue V = V1;
12881 for (;;) {
12882 switch (V.getOpcode()) {
12883 case ISD::BITCAST: {
12884 V = V.getOperand(0);
12885 continue;
12886 }
12887 case ISD::CONCAT_VECTORS: {
12888 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
12889 int OpIdx = BitOffset / OpBitWidth;
12890 V = V.getOperand(OpIdx);
12891 BitOffset %= OpBitWidth;
12892 continue;
12893 }
12894 case ISD::EXTRACT_SUBVECTOR: {
12895 auto *ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(1));
12896 if (!ConstantIdx)
12897 break;
12898
12899 // The extraction index adds to the existing offset.
12900 unsigned EltBitWidth = V.getScalarValueSizeInBits();
12901 unsigned Idx = ConstantIdx->getZExtValue();
12902 unsigned BeginOffset = Idx * EltBitWidth;
12903 BitOffset += BeginOffset;
12904 V = V.getOperand(0);
12905 continue;
12906 }
12907 case ISD::INSERT_SUBVECTOR: {
12908 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
12909 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
12910 if (!ConstantIdx)
12911 break;
12912
12913 int EltBitWidth = VOuter.getScalarValueSizeInBits();
12914 int Idx = (int)ConstantIdx->getZExtValue();
12915 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
12916 int BeginOffset = Idx * EltBitWidth;
12917 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
12918 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
12919 BitOffset -= BeginOffset;
12920 V = VInner;
12921 } else {
12922 V = VOuter;
12923 }
12924 continue;
12925 }
12926 }
12927 break;
12928 }
12929 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12930 BroadcastIdx = BitOffset / NumEltBits;
12931
12932 // Do we need to bitcast the source to retrieve the original broadcast index?
12933 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
12934
12935 // Check if this is a broadcast of a scalar. We special case lowering
12936 // for scalars so that we can more effectively fold with loads.
12937 // If the original value has a larger element type than the shuffle, the
12938 // broadcast element is in essence truncated. Make that explicit to ease
12939 // folding.
12940 if (BitCastSrc && VT.isInteger())
12941 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
12942 DL, VT, V, BroadcastIdx, Subtarget, DAG))
12943 return TruncBroadcast;
12944
12945 MVT BroadcastVT = VT;
12946
12947 // Also check the simpler case, where we can directly reuse the scalar.
12948 if (!BitCastSrc &&
12949 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
12950 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
12951 V = V.getOperand(BroadcastIdx);
12952
12953 // If we can't broadcast from a register, check that the input is a load.
12954 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
12955 return SDValue();
12956 } else if (ISD::isNormalLoad(V.getNode()) &&
12957 cast<LoadSDNode>(V)->isSimple()) {
12958 // We do not check for one-use of the vector load because a broadcast load
12959 // is expected to be a win for code size, register pressure, and possibly
12960 // uops even if the original vector load is not eliminated.
12961
12962 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
12963 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
12964 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
12965 Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
12966 ? X86ISD::MOVDDUP
12967 : Opcode;
12968 }
12969
12970 // Reduce the vector load and shuffle to a broadcasted scalar load.
12971 LoadSDNode *Ld = cast<LoadSDNode>(V);
12972 SDValue BaseAddr = Ld->getOperand(1);
12973 EVT SVT = BroadcastVT.getScalarType();
12974 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
12975 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
12976 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
12977 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
12978 DAG.getMachineFunction().getMachineMemOperand(
12979 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12980 DAG.makeEquivalentMemoryOrdering(Ld, V);
12981 } else if (!BroadcastFromReg) {
12982 // We can't broadcast from a vector register.
12983 return SDValue();
12984 } else if (BitOffset != 0) {
12985 // We can only broadcast from the zero-element of a vector register,
12986 // but it can be advantageous to broadcast from the zero-element of a
12987 // subvector.
12988 if (!VT.is256BitVector() && !VT.is512BitVector())
12989 return SDValue();
12990
12991 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
12992 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12993 return SDValue();
12994
12995 // Only broadcast the zero-element of a 128-bit subvector.
12996 if ((BitOffset % 128) != 0)
12997 return SDValue();
12998
12999 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13000 "Unexpected bit-offset");
13001 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13002 "Unexpected vector size");
13003 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13004 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13005 }
13006
13007 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
13008 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
13009 DAG.getBitcast(MVT::f64, V));
13010
13011 // Bitcast back to the same scalar type as BroadcastVT.
13012 if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) {
13013 assert(NumEltBits == BroadcastVT.getScalarSizeInBits() &&
13014 "Unexpected vector element size");
13015 MVT ExtVT;
13016 if (V.getValueType().isVector()) {
13017 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13018 ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
13019 } else {
13020 ExtVT = BroadcastVT.getScalarType();
13021 }
13022 V = DAG.getBitcast(ExtVT, V);
13023 }
13024
13025 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
13026 if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) {
13027 V = DAG.getBitcast(MVT::f64, V);
13028 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
13029 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
13030 }
13031
13032 // We only support broadcasting from 128-bit vectors to minimize the
13033 // number of patterns we need to deal with in isel. So extract down to
13034 // 128-bits, removing as many bitcasts as possible.
13035 if (V.getValueSizeInBits() > 128) {
13036 MVT ExtVT = V.getSimpleValueType().getScalarType();
13037 ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits());
13038 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
13039 V = DAG.getBitcast(ExtVT, V);
13040 }
13041
13042 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13043}
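
A standalone sketch of the bit-offset bookkeeping the broadcast lowering performs while walking through concat_vectors nodes to find the real source of the splatted element (descendIntoConcat is an invented name):

#include <cassert>

// Descend into the operand of a concat whose parts are OpBitWidth bits wide.
static void descendIntoConcat(int &BitOffset, int OpBitWidth, int &OpIdx) {
  OpIdx = BitOffset / OpBitWidth;   // Which concat operand holds the element.
  BitOffset %= OpBitWidth;          // Offset of the element within that operand.
}

int main() {
  // Broadcasting element 9 of a v16i16 built as concat(v8i16 A, v8i16 B):
  // bit offset 9*16 = 144 lands in operand 1 (B) at element 1.
  int BitOffset = 9 * 16, OpIdx = 0;
  descendIntoConcat(BitOffset, /*OpBitWidth=*/128, OpIdx);
  assert(OpIdx == 1 && BitOffset == 16 && BitOffset / 16 == 1);
  return 0;
}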
13044
13045// Check for whether we can use INSERTPS to perform the shuffle. We only use
13046// INSERTPS when the V1 elements are already in the correct locations
13047// because otherwise we can just always use two SHUFPS instructions which
13048// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13049// perform INSERTPS if a single V1 element is out of place and all V2
13050// elements are zeroable.
13051static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13052 unsigned &InsertPSMask,
13053 const APInt &Zeroable,
13054 ArrayRef<int> Mask, SelectionDAG &DAG) {
13055 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13056 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13057 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13058
13059 // Attempt to match INSERTPS with one element from VA or VB being
13060 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13061 // are updated.
13062 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13063 ArrayRef<int> CandidateMask) {
13064 unsigned ZMask = 0;
13065 int VADstIndex = -1;
13066 int VBDstIndex = -1;
13067 bool VAUsedInPlace = false;
13068
13069 for (int i = 0; i < 4; ++i) {
13070 // Synthesize a zero mask from the zeroable elements (includes undefs).
13071 if (Zeroable[i]) {
13072 ZMask |= 1 << i;
13073 continue;
13074 }
13075
13076 // Flag if we use any VA inputs in place.
13077 if (i == CandidateMask[i]) {
13078 VAUsedInPlace = true;
13079 continue;
13080 }
13081
13082 // We can only insert a single non-zeroable element.
13083 if (VADstIndex >= 0 || VBDstIndex >= 0)
13084 return false;
13085
13086 if (CandidateMask[i] < 4) {
13087 // VA input out of place for insertion.
13088 VADstIndex = i;
13089 } else {
13090 // VB input for insertion.
13091 VBDstIndex = i;
13092 }
13093 }
13094
13095 // Don't bother if we have no (non-zeroable) element for insertion.
13096 if (VADstIndex < 0 && VBDstIndex < 0)
13097 return false;
13098
13099 // Determine element insertion src/dst indices. The src index is from the
13100 // start of the inserted vector, not the start of the concatenated vector.
13101 unsigned VBSrcIndex = 0;
13102 if (VADstIndex >= 0) {
13103 // If we have a VA input out of place, we use VA as the V2 element
13104 // insertion and don't use the original V2 at all.
13105 VBSrcIndex = CandidateMask[VADstIndex];
13106 VBDstIndex = VADstIndex;
13107 VB = VA;
13108 } else {
13109 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13110 }
13111
13112 // If no V1 inputs are used in place, then the result is created only from
13113 // the zero mask and the V2 insertion - so remove V1 dependency.
13114 if (!VAUsedInPlace)
13115 VA = DAG.getUNDEF(MVT::v4f32);
13116
13117 // Update V1, V2 and InsertPSMask accordingly.
13118 V1 = VA;
13119 V2 = VB;
13120
13121 // Insert the V2 element into the desired position.
13122 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13123 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13124 return true;
13125 };
13126
13127 if (matchAsInsertPS(V1, V2, Mask))
13128 return true;
13129
13130 // Commute and try again.
13131 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
13132 ShuffleVectorSDNode::commuteMask(CommutedMask);
13133 if (matchAsInsertPS(V2, V1, CommutedMask))
13134 return true;
13135
13136 return false;
13137}
13138
13139static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13140 ArrayRef<int> Mask, const APInt &Zeroable,
13141 SelectionDAG &DAG) {
13142 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13143 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13144
13145 // Attempt to match the insertps pattern.
13146 unsigned InsertPSMask;
13147 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13148 return SDValue();
13149
13150 // Insert the V2 element into the desired position.
13151 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13152 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13153}
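
A standalone sketch of the 8-bit INSERTPS immediate assembled by matchShuffleAsInsertPS above: bits [7:6] select the source lane of V2, bits [5:4] the destination lane, and bits [3:0] the zero mask (encodeInsertPSImm is an invented helper name):

#include <cassert>
#include <cstdint>

static uint8_t encodeInsertPSImm(unsigned SrcIdx, unsigned DstIdx, unsigned ZMask) {
  assert(SrcIdx < 4 && DstIdx < 4 && ZMask < 16 && "Field out of range");
  return static_cast<uint8_t>(SrcIdx << 6 | DstIdx << 4 | ZMask);
}

int main() {
  // Insert lane 2 of V2 into lane 1 of the result and zero lane 3:
  // 0b10'01'1000 == 0x98.
  assert(encodeInsertPSImm(2, 1, 0x8) == 0x98);
  return 0;
}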
13154
13155/// Try to lower a shuffle as a permute of the inputs followed by an
13156/// UNPCK instruction.
13157///
13158 /// This specifically targets cases where we end up alternating between
13159/// the two inputs, and so can permute them into something that feeds a single
13160/// UNPCK instruction. Note that this routine only targets integer vectors
13161/// because for floating point vectors we have a generalized SHUFPS lowering
13162/// strategy that handles everything that doesn't *exactly* match an unpack,
13163/// making this clever lowering unnecessary.
13164static SDValue lowerShuffleAsPermuteAndUnpack(
13165 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13166 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13167 assert(!VT.isFloatingPoint() &&
13168 "This routine only supports integer vectors.");
13169 assert(VT.is128BitVector() &&
13170 "This routine only works on 128-bit vectors.");
13171 assert(!V2.isUndef() &&
13172 "This routine should only be used when blending two inputs.");
13173 assert(Mask.size() >= 2 && "Single element masks are invalid.");
13174
13175 int Size = Mask.size();
13176
13177 int NumLoInputs =
13178 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13179 int NumHiInputs =
13180 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13181
13182 bool UnpackLo = NumLoInputs >= NumHiInputs;
13183
13184 auto TryUnpack = [&](int ScalarSize, int Scale) {
13185 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13186 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13187
13188 for (int i = 0; i < Size; ++i) {
13189 if (Mask[i] < 0)
13190 continue;
13191
13192 // Each element of the unpack contains Scale elements from this mask.
13193 int UnpackIdx = i / Scale;
13194
13195 // We only handle the case where V1 feeds the first slots of the unpack.
13196 // We rely on canonicalization to ensure this is the case.
13197 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
13198 return SDValue();
13199
13200 // Setup the mask for this input. The indexing is tricky as we have to
13201 // handle the unpack stride.
13202 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
13203 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
13204 Mask[i] % Size;
13205 }
13206
13207 // If we will have to shuffle both inputs to use the unpack, check whether
13208 // we can just unpack first and shuffle the result. If so, skip this unpack.
13209 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
13210 !isNoopShuffleMask(V2Mask))
13211 return SDValue();
13212
13213 // Shuffle the inputs into place.
13214 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13215 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13216
13217 // Cast the inputs to the type we will use to unpack them.
13218 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
13219 V1 = DAG.getBitcast(UnpackVT, V1);
13220 V2 = DAG.getBitcast(UnpackVT, V2);
13221
13222 // Unpack the inputs and cast the result back to the desired type.
13223 return DAG.getBitcast(
13224 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
13225 UnpackVT, V1, V2));
13226 };
13227
13228 // We try each unpack from the largest to the smallest to try and find one
13229 // that fits this mask.
13230 int OrigScalarSize = VT.getScalarSizeInBits();
13231 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
13232 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
13233 return Unpack;
13234
13235 // If we're shuffling with a zero vector then we're better off not doing
13236 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
13237 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
13238 ISD::isBuildVectorAllZeros(V2.getNode()))
13239 return SDValue();
13240
13241 // If none of the unpack-rooted lowerings worked (or were profitable) try an
13242 // initial unpack.
13243 if (NumLoInputs == 0 || NumHiInputs == 0) {
13244 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
13245 "We have to have *some* inputs!");
13246 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
13247
13248 // FIXME: We could consider the total complexity of the permute of each
13249 // possible unpacking. Or at the least we should consider how many
13250 // half-crossings are created.
13251 // FIXME: We could consider commuting the unpacks.
13252
13253 SmallVector<int, 32> PermMask((unsigned)Size, -1);
13254 for (int i = 0; i < Size; ++i) {
13255 if (Mask[i] < 0)
13256 continue;
13257
13258 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
13259
13260 PermMask[i] =
13261 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
13262 }
13263 return DAG.getVectorShuffle(
13264 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
13265 DL, VT, V1, V2),
13266 DAG.getUNDEF(VT), PermMask);
13267 }
13268
13269 return SDValue();
13270}
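
A standalone sketch of the post-unpack permute mask built at the end of the routine above for the case where every used mask element comes from the low halves (permuteAfterUnpackLo is an invented name): after UNPCKL interleaves V1 and V2, element j of V1 sits at 2*j and element j of V2 at 2*j+1, which is exactly what the remapping encodes:

#include <cassert>
#include <vector>

static std::vector<int> permuteAfterUnpackLo(const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  std::vector<int> PermMask(Mask.size(), -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    // Element Mask[i] % Size of either input lands at this slot of the unpack.
    PermMask[i] = 2 * (Mask[i] % Size) + (Mask[i] < Size ? 0 : 1);
  }
  return PermMask;
}

int main() {
  // v4i32 mask <1,5,0,4>: all inputs come from the low halves, so unpacklo the
  // two operands first and then permute the interleaved result with <2,3,0,1>.
  std::vector<int> Perm = permuteAfterUnpackLo({1, 5, 0, 4});
  assert(Perm == (std::vector<int>{2, 3, 0, 1}));
  return 0;
}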
13271
13272/// Handle lowering of 2-lane 64-bit floating point shuffles.
13273///
13274/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13275/// support for floating point shuffles but not integer shuffles. These
13276/// instructions will incur a domain crossing penalty on some chips though so
13277/// it is better to avoid lowering through this for integer vectors where
13278/// possible.
13279static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13280 const APInt &Zeroable, SDValue V1, SDValue V2,
13281 const X86Subtarget &Subtarget,
13282 SelectionDAG &DAG) {
13283 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13284 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13285 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13286
13287 if (V2.isUndef()) {
13288 // Check for being able to broadcast a single element.
13289 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13290 Mask, Subtarget, DAG))
13291 return Broadcast;
13292
13293 // Straight shuffle of a single input vector. Simulate this by using the
13294 // single input as both of the "inputs" to this instruction.
13295 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13296
13297 if (Subtarget.hasAVX()) {
13298 // If we have AVX, we can use VPERMILPS which will allow folding a load
13299 // into the shuffle.
13300 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13301 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13302 }
13303
13304 return DAG.getNode(
13305 X86ISD::SHUFP, DL, MVT::v2f64,
13306 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13307 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13308 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13309 }
13310 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13311 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13312 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13313 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13314
13315 if (Subtarget.hasAVX2())
13316 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13317 return Extract;
13318
13319 // When loading a scalar and then shuffling it into a vector we can often do
13320 // the insertion cheaply.
13321 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13322 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13323 return Insertion;
13324 // Try inverting the insertion since for v2 masks it is easy to do and we
13325 // can't reliably sort the mask one way or the other.
13326 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13327 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13328 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13329 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13330 return Insertion;
13331
13332 // Try to use one of the special instruction patterns to handle two common
13333 // blend patterns if a zero-blend above didn't work.
13334 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
13335 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
13336 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13337 // We can either use a special instruction to load over the low double or
13338 // to move just the low double.
13339 return DAG.getNode(
13340 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13341 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13342
13343 if (Subtarget.hasSSE41())
13344 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13345 Zeroable, Subtarget, DAG))
13346 return Blend;
13347
13348 // Use dedicated unpack instructions for masks that match their pattern.
13349 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
13350 return V;
13351
13352 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13353 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13354 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13355}
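// [Editorial sketch -- not part of the annotated source above.] A minimal
// model of the two-input SHUFPD immediate computed at line 13352: bit 0 picks
// which element of V1 feeds result lane 0, bit 1 picks which element of V2
// feeds result lane 1 (Mask[1] is offset by 2 because V2's lanes are numbered
// 2 and 3 in the shuffle mask). The helper name is hypothetical.
static unsigned shufpdImmForV2F64(int Mask0, int Mask1) {
  // Assumes Mask0 is in [0, 1] (from V1) and Mask1 is in [2, 3] (from V2),
  // as the asserts above guarantee.
  return (unsigned)(Mask0 == 1) | ((unsigned)((Mask1 - 2) == 1) << 1);
}
// Example: Mask = {1, 2} gives imm = 1, i.e. V1[1] into lane 0 and V2[0] into lane 1.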
13356
13357/// Handle lowering of 2-lane 64-bit integer shuffles.
13358///
13359/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13360/// the integer unit to minimize domain crossing penalties. However, for blends
13361/// it falls back to the floating point shuffle operation with appropriate bit
13362/// casting.
13363static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13364 const APInt &Zeroable, SDValue V1, SDValue V2,
13365 const X86Subtarget &Subtarget,
13366 SelectionDAG &DAG) {
13367  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13368  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13369  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13370
13371 if (V2.isUndef()) {
13372 // Check for being able to broadcast a single element.
13373 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13374 Mask, Subtarget, DAG))
13375 return Broadcast;
13376
13377 // Straight shuffle of a single input vector. For everything from SSE2
13378 // onward this has a single fast instruction with no scary immediates.
13379 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13380 V1 = DAG.getBitcast(MVT::v4i32, V1);
13381 int WidenedMask[4] = {
13382 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
13383 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
13384 return DAG.getBitcast(
13385 MVT::v2i64,
13386 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13387 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13388 }
13389  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13390  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13391  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13392  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13393
13394 if (Subtarget.hasAVX2())
13395 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13396 return Extract;
13397
13398 // Try to use shift instructions.
13399 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
13400 Zeroable, Subtarget, DAG))
13401 return Shift;
13402
13403 // When loading a scalar and then shuffling it into a vector we can often do
13404 // the insertion cheaply.
13405 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13406 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13407 return Insertion;
13408 // Try inverting the insertion since for v2 masks it is easy to do and we
13409 // can't reliably sort the mask one way or the other.
13410 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13411 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13412 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13413 return Insertion;
13414
13415 // We have different paths for blend lowering, but they all must use the
13416 // *exact* same predicate.
13417 bool IsBlendSupported = Subtarget.hasSSE41();
13418 if (IsBlendSupported)
13419 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13420 Zeroable, Subtarget, DAG))
13421 return Blend;
13422
13423 // Use dedicated unpack instructions for masks that match their pattern.
13424 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
13425 return V;
13426
13427 // Try to use byte rotation instructions.
13428   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13429 if (Subtarget.hasSSSE3()) {
13430 if (Subtarget.hasVLX())
13431 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13432 Subtarget, DAG))
13433 return Rotate;
13434
13435 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13436 Subtarget, DAG))
13437 return Rotate;
13438 }
13439
13440 // If we have direct support for blends, we should lower by decomposing into
13441 // a permute. That will be faster than the domain cross.
13442 if (IsBlendSupported)
13443 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask,
13444 Subtarget, DAG);
13445
13446 // We implement this with SHUFPD which is pretty lame because it will likely
13447 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13448 // However, all the alternatives are still more cycles and newer chips don't
13449 // have this problem. It would be really nice if x86 had better shuffles here.
13450 V1 = DAG.getBitcast(MVT::v2f64, V1);
13451 V2 = DAG.getBitcast(MVT::v2f64, V2);
13452 return DAG.getBitcast(MVT::v2i64,
13453 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13454}
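// [Editorial sketch -- not part of the annotated source above.] How the unary
// v2i64 mask is widened to a v4i32 PSHUFD mask in the V2.isUndef() path of
// lowerV2I64Shuffle: each 64-bit lane index M expands to the 32-bit index pair
// {2*M, 2*M+1}, with undef (negative) entries clamped to lane 0. The helper
// name below is hypothetical.
static void widenV2I64MaskToV4I32(const int Mask[2], int Widened[4]) {
  int M0 = Mask[0] < 0 ? 0 : Mask[0];
  int M1 = Mask[1] < 0 ? 0 : Mask[1];
  Widened[0] = M0 * 2;     // low dword of the selected 64-bit lane
  Widened[1] = M0 * 2 + 1; // high dword of the selected 64-bit lane
  Widened[2] = M1 * 2;
  Widened[3] = M1 * 2 + 1;
}
// Example: Mask = {1, 0} widens to {2, 3, 0, 1}, i.e. PSHUFD with immediate 0x4E.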
13455
13456/// Lower a vector shuffle using the SHUFPS instruction.
13457///
13458/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13459/// It makes no assumptions about whether this is the *best* lowering, it simply
13460/// uses it.
13461static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13462 ArrayRef<int> Mask, SDValue V1,
13463 SDValue V2, SelectionDAG &DAG) {
13464 SDValue LowV = V1, HighV = V2;
13465 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
13466 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13467
13468 if (NumV2Elements == 1) {
13469 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13470
13471 // Compute the index adjacent to V2Index and in the same half by toggling
13472 // the low bit.
13473 int V2AdjIndex = V2Index ^ 1;
13474
13475 if (Mask[V2AdjIndex] < 0) {
13476 // Handles all the cases where we have a single V2 element and an undef.
13477 // This will only ever happen in the high lanes because we commute the
13478 // vector otherwise.
13479 if (V2Index < 2)
13480 std::swap(LowV, HighV);
13481 NewMask[V2Index] -= 4;
13482 } else {
13483 // Handle the case where the V2 element ends up adjacent to a V1 element.
13484 // To make this work, blend them together as the first step.
13485 int V1Index = V2AdjIndex;
13486 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13487 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13488 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13489
13490 // Now proceed to reconstruct the final blend as we have the necessary
13491 // high or low half formed.
13492 if (V2Index < 2) {
13493 LowV = V2;
13494 HighV = V1;
13495 } else {
13496 HighV = V2;
13497 }
13498 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13499 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13500 }
13501 } else if (NumV2Elements == 2) {
13502 if (Mask[0] < 4 && Mask[1] < 4) {
13503 // Handle the easy case where we have V1 in the low lanes and V2 in the
13504 // high lanes.
13505 NewMask[2] -= 4;
13506 NewMask[3] -= 4;
13507 } else if (Mask[2] < 4 && Mask[3] < 4) {
13508 // We also handle the reversed case because this utility may get called
13509 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13510 // arrange things in the right direction.
13511 NewMask[0] -= 4;
13512 NewMask[1] -= 4;
13513 HighV = V1;
13514 LowV = V2;
13515 } else {
13516 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13517 // trying to place elements directly, just blend them and set up the final
13518 // shuffle to place them.
13519
13520 // The first two blend mask elements are for V1, the second two are for
13521 // V2.
13522 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13523 Mask[2] < 4 ? Mask[2] : Mask[3],
13524 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13525 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13526 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13527 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13528
13529 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13530 // a blend.
13531 LowV = HighV = V1;
13532 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13533 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13534 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13535 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13536 }
13537 }
13538 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13539 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13540}
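// [Editorial sketch -- not part of the annotated source above.] The SHUFPS /
// PSHUFD immediates built via getV4X86ShuffleImm8ForMask pack four 2-bit lane
// selectors, lowest result lane in the lowest bits. A simplified model is
// shown below; the real helper also canonicalizes undef entries, so treating
// them as lane 0 here is only an assumption for illustration.
static unsigned v4ShuffleImm8Model(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= (unsigned)(Mask[i] < 0 ? 0 : (Mask[i] & 3)) << (2 * i);
  return Imm;
}
// Example: Mask = {3, 2, 1, 0} packs to 0x1B (0b00011011), a full lane reversal.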
13541
13542/// Lower 4-lane 32-bit floating point shuffles.
13543///
13544/// Uses instructions exclusively from the floating point unit to minimize
13545/// domain crossing penalties, as these are sufficient to implement all v4f32
13546/// shuffles.
13547static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13548 const APInt &Zeroable, SDValue V1, SDValue V2,
13549 const X86Subtarget &Subtarget,
13550 SelectionDAG &DAG) {
13551  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13552  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13553  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13554
13555 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13556
13557 if (NumV2Elements == 0) {
13558 // Check for being able to broadcast a single element.
13559 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13560 Mask, Subtarget, DAG))
13561 return Broadcast;
13562
13563 // Use even/odd duplicate instructions for masks that match their pattern.
13564 if (Subtarget.hasSSE3()) {
13565 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
13566 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13567 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
13568 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13569 }
13570
13571 if (Subtarget.hasAVX()) {
13572 // If we have AVX, we can use VPERMILPS which will allow folding a load
13573 // into the shuffle.
13574 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13575 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13576 }
13577
13578 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13579 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13580 if (!Subtarget.hasSSE2()) {
13581 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
13582 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13583 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
13584 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13585 }
13586
13587 // Otherwise, use a straight shuffle of a single input vector. We pass the
13588 // input vector to both operands to simulate this with a SHUFPS.
13589 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13590 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13591 }
13592
13593 if (Subtarget.hasAVX2())
13594 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13595 return Extract;
13596
13597 // There are special ways we can lower some single-element blends. However, we
13598 // have custom ways we can lower more complex single-element blends below that
13599 // we defer to if both this and BLENDPS fail to match, so restrict this to
13600 // when the V2 input is targeting element 0 of the mask -- that is the fast
13601 // case here.
13602 if (NumV2Elements == 1 && Mask[0] >= 4)
13603 if (SDValue V = lowerShuffleAsElementInsertion(
13604 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13605 return V;
13606
13607 if (Subtarget.hasSSE41()) {
13608 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13609 Zeroable, Subtarget, DAG))
13610 return Blend;
13611
13612 // Use INSERTPS if we can complete the shuffle efficiently.
13613 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13614 return V;
13615
13616 if (!isSingleSHUFPSMask(Mask))
13617 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13618 V2, Mask, DAG))
13619 return BlendPerm;
13620 }
13621
13622 // Use low/high mov instructions. These are only valid in SSE1 because
13623 // otherwise they are widened to v2f64 and never get here.
13624 if (!Subtarget.hasSSE2()) {
13625 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
13626 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13627 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
13628 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13629 }
13630
13631 // Use dedicated unpack instructions for masks that match their pattern.
13632 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
13633 return V;
13634
13635 // Otherwise fall back to a SHUFPS lowering strategy.
13636 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13637}
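// [Editorial sketch -- not part of the annotated source above.] Scalar models
// of the even/odd duplicate patterns matched in lowerV4F32Shuffle: MOVSLDUP
// realizes the mask {0, 0, 2, 2} and MOVSHDUP realizes {1, 1, 3, 3} on a
// single v4f32 input.
static void movsldupModel(const float In[4], float Out[4]) {
  Out[0] = In[0]; Out[1] = In[0]; // duplicate the even (low) element of each pair
  Out[2] = In[2]; Out[3] = In[2];
}
static void movshdupModel(const float In[4], float Out[4]) {
  Out[0] = In[1]; Out[1] = In[1]; // duplicate the odd (high) element of each pair
  Out[2] = In[3]; Out[3] = In[3];
}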
13638
13639/// Lower 4-lane i32 vector shuffles.
13640///
13641/// We try to handle these with integer-domain shuffles where we can, but for
13642/// blends we use the floating point domain blend instructions.
13643static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13644 const APInt &Zeroable, SDValue V1, SDValue V2,
13645 const X86Subtarget &Subtarget,
13646 SelectionDAG &DAG) {
13647  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13648  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13649  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13650
13651 // Whenever we can lower this as a zext, that instruction is strictly faster
13652 // than any alternative. It also allows us to fold memory operands into the
13653 // shuffle in many cases.
13654 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13655 Zeroable, Subtarget, DAG))
13656 return ZExt;
13657
13658 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13659
13660 if (NumV2Elements == 0) {
13661 // Try to use broadcast unless the mask only has one non-undef element.
13662 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13663 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13664 Mask, Subtarget, DAG))
13665 return Broadcast;
13666 }
13667
13668 // Straight shuffle of a single input vector. For everything from SSE2
13669 // onward this has a single fast instruction with no scary immediates.
13670 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13671 // but we aren't actually going to use the UNPCK instruction because doing
13672 // so prevents folding a load into this instruction or making a copy.
13673 const int UnpackLoMask[] = {0, 0, 1, 1};
13674 const int UnpackHiMask[] = {2, 2, 3, 3};
13675 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
13676 Mask = UnpackLoMask;
13677 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
13678 Mask = UnpackHiMask;
13679
13680 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13681 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13682 }
13683
13684 if (Subtarget.hasAVX2())
13685 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13686 return Extract;
13687
13688 // Try to use shift instructions.
13689 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
13690 Zeroable, Subtarget, DAG))
13691 return Shift;
13692
13693 // There are special ways we can lower some single-element blends.
13694 if (NumV2Elements == 1)
13695 if (SDValue V = lowerShuffleAsElementInsertion(
13696 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13697 return V;
13698
13699 // We have different paths for blend lowering, but they all must use the
13700 // *exact* same predicate.
13701 bool IsBlendSupported = Subtarget.hasSSE41();
13702 if (IsBlendSupported)
13703 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13704 Zeroable, Subtarget, DAG))
13705 return Blend;
13706
13707 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13708 Zeroable, Subtarget, DAG))
13709 return Masked;
13710
13711 // Use dedicated unpack instructions for masks that match their pattern.
13712 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
13713 return V;
13714
13715 // Try to use byte rotation instructions.
13716   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13717 if (Subtarget.hasSSSE3()) {
13718 if (Subtarget.hasVLX())
13719 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13720 Subtarget, DAG))
13721 return Rotate;
13722
13723 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13724 Subtarget, DAG))
13725 return Rotate;
13726 }
13727
13728 // Assume that a single SHUFPS is faster than an alternative sequence of
13729 // multiple instructions (even if the CPU has a domain penalty).
13730 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13731 if (!isSingleSHUFPSMask(Mask)) {
13732 // If we have direct support for blends, we should lower by decomposing into
13733 // a permute. That will be faster than the domain cross.
13734 if (IsBlendSupported)
13735 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask,
13736 Subtarget, DAG);
13737
13738 // Try to lower by permuting the inputs into an unpack instruction.
13739 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13740 Mask, Subtarget, DAG))
13741 return Unpack;
13742 }
13743
13744 // We implement this with SHUFPS because it can blend from two vectors.
13745 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13746 // up the inputs, bypassing domain shift penalties that we would incur if we
13747 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13748 // relevant.
13749 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13750 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13751 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13752 return DAG.getBitcast(MVT::v4i32, ShufPS);
13753}
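// [Editorial sketch -- not part of the annotated source above.] Why the v4i32
// fallback above bitcasts to v4f32 and uses SHUFPS: unlike PSHUFD, SHUFPS can
// blend from two sources, taking its two low result lanes from the first
// operand and its two high result lanes from the second, each chosen by a
// 2-bit selector. A scalar model of that semantics (domain penalties are not
// modeled here):
static void shufpsModel(const float A[4], const float B[4], unsigned Imm,
                        float Out[4]) {
  Out[0] = A[(Imm >> 0) & 3]; // low half of the result selects from the first operand
  Out[1] = A[(Imm >> 2) & 3];
  Out[2] = B[(Imm >> 4) & 3]; // high half of the result selects from the second operand
  Out[3] = B[(Imm >> 6) & 3];
}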
13754
13755/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13756/// shuffle lowering, and the most complex part.
13757///
13758/// The lowering strategy is to try to form pairs of input lanes which are
13759/// targeted at the same half of the final vector, and then use a dword shuffle
13760/// to place them onto the right half, and finally unpack the paired lanes into
13761/// their final position.
13762///
13763/// The exact breakdown of how to form these dword pairs and align them on the
13764/// correct sides is really tricky. See the comments within the function for
13765/// more of the details.
13766///
13767/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13768/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13769/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13770/// vector, form the analogous 128-bit 8-element Mask.
13771static SDValue lowerV8I16GeneralSingleInputShuffle(
13772 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13773 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13774  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13775 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13776
13777  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13778 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13779 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13780
13781 // Attempt to directly match PSHUFLW or PSHUFHW.
13782 if (isUndefOrInRange(LoMask, 0, 4) &&
13783 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13784 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13785 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13786 }
13787 if (isUndefOrInRange(HiMask, 4, 8) &&
13788 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13789 for (int i = 0; i != 4; ++i)
13790 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13791 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13792 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13793 }
13794
13795 SmallVector<int, 4> LoInputs;
13796 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13797 array_pod_sort(LoInputs.begin(), LoInputs.end());
13798 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
13799 SmallVector<int, 4> HiInputs;
13800 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13801 array_pod_sort(HiInputs.begin(), HiInputs.end());
13802 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
13803 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13804 int NumHToL = LoInputs.size() - NumLToL;
13805 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13806 int NumHToH = HiInputs.size() - NumLToH;
13807 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13808 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13809 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13810 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13811
13812 // If we are shuffling values from one half - check how many different DWORD
13813 // pairs we need to create. If only 1 or 2 then we can perform this as a
13814 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13815 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13816 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13817 V = DAG.getNode(ShufWOp, DL, VT, V,
13818 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13819 V = DAG.getBitcast(PSHUFDVT, V);
13820 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13821 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13822 return DAG.getBitcast(VT, V);
13823 };
13824
13825 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13826 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13827 SmallVector<std::pair<int, int>, 4> DWordPairs;
13828 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13829
13830 // Collect the different DWORD pairs.
13831 for (int DWord = 0; DWord != 4; ++DWord) {
13832 int M0 = Mask[2 * DWord + 0];
13833 int M1 = Mask[2 * DWord + 1];
13834 M0 = (M0 >= 0 ? M0 % 4 : M0);
13835 M1 = (M1 >= 0 ? M1 % 4 : M1);
13836 if (M0 < 0 && M1 < 0)
13837 continue;
13838
13839 bool Match = false;
13840 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13841 auto &DWordPair = DWordPairs[j];
13842 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13843 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13844 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13845 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13846 PSHUFDMask[DWord] = DOffset + j;
13847 Match = true;
13848 break;
13849 }
13850 }
13851 if (!Match) {
13852 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13853 DWordPairs.push_back(std::make_pair(M0, M1));
13854 }
13855 }
13856
13857 if (DWordPairs.size() <= 2) {
13858 DWordPairs.resize(2, std::make_pair(-1, -1));
13859 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13860 DWordPairs[1].first, DWordPairs[1].second};
13861 if ((NumHToL + NumHToH) == 0)
13862 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13863 if ((NumLToL + NumLToH) == 0)
13864 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13865 }
13866 }
13867
13868 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13869 // such inputs we can swap two of the dwords across the half mark and end up
13870 // with <=2 inputs to each half in each half. Once there, we can fall through
13871 // to the generic code below. For example:
13872 //
13873 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13874 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13875 //
13876 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13877 // and an existing 2-into-2 on the other half. In this case we may have to
13878 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13879 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13880 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13881 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13882 // half than the one we target for fixing) will be fixed when we re-enter this
13883 // path. We will also combine any resulting sequence of PSHUFD instructions
13884 // into a single instruction. Here is an example of the tricky case:
13885 //
13886 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13887 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13888 //
13889 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13890 //
13891 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13892 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13893 //
13894 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13895 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13896 //
13897 // The result is fine to be handled by the generic logic.
13898 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13899 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13900 int AOffset, int BOffset) {
13901    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
13902           "Must call this with A having 3 or 1 inputs from the A half.");
13903    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
13904           "Must call this with B having 1 or 3 inputs from the B half.");
13905    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
13906           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
13907
13908 bool ThreeAInputs = AToAInputs.size() == 3;
13909
13910     // Compute the index of the dword with only one word among the three inputs in
13911 // a half by taking the sum of the half with three inputs and subtracting
13912 // the sum of the actual three inputs. The difference is the remaining
13913 // slot.
13914 int ADWord = 0, BDWord = 0;
13915 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
13916 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
13917 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
13918 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
13919 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
13920 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
13921 int TripleNonInputIdx =
13922 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13923 TripleDWord = TripleNonInputIdx / 2;
13924
13925 // We use xor with one to compute the adjacent DWord to whichever one the
13926 // OneInput is in.
13927 OneInputDWord = (OneInput / 2) ^ 1;
13928
13929 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13930 // and BToA inputs. If there is also such a problem with the BToB and AToB
13931 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13932 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13933 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13934 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
13935 // Compute how many inputs will be flipped by swapping these DWords. We
13936 // need
13937 // to balance this to ensure we don't form a 3-1 shuffle in the other
13938 // half.
13939 int NumFlippedAToBInputs =
13940 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
13941 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
13942 int NumFlippedBToBInputs =
13943 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
13944 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
13945 if ((NumFlippedAToBInputs == 1 &&
13946 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
13947 (NumFlippedBToBInputs == 1 &&
13948 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
13949 // We choose whether to fix the A half or B half based on whether that
13950 // half has zero flipped inputs. At zero, we may not be able to fix it
13951 // with that half. We also bias towards fixing the B half because that
13952 // will more commonly be the high half, and we have to bias one way.
13953 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
13954 ArrayRef<int> Inputs) {
13955 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
13956 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
13957 // Determine whether the free index is in the flipped dword or the
13958 // unflipped dword based on where the pinned index is. We use this bit
13959 // in an xor to conditionally select the adjacent dword.
13960 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
13961 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13962 if (IsFixIdxInput == IsFixFreeIdxInput)
13963 FixFreeIdx += 1;
13964 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13965          assert(IsFixIdxInput != IsFixFreeIdxInput &&
13966                 "We need to be changing the number of flipped inputs!");
13967 int PSHUFHalfMask[] = {0, 1, 2, 3};
13968 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
13969 V = DAG.getNode(
13970 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
13971 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
13972 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13973
13974 for (int &M : Mask)
13975 if (M >= 0 && M == FixIdx)
13976 M = FixFreeIdx;
13977 else if (M >= 0 && M == FixFreeIdx)
13978 M = FixIdx;
13979 };
13980 if (NumFlippedBToBInputs != 0) {
13981 int BPinnedIdx =
13982 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
13983 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
13984 } else {
13985          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
13986 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
13987 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
13988 }
13989 }
13990 }
13991
13992 int PSHUFDMask[] = {0, 1, 2, 3};
13993 PSHUFDMask[ADWord] = BDWord;
13994 PSHUFDMask[BDWord] = ADWord;
13995 V = DAG.getBitcast(
13996 VT,
13997 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13998 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13999
14000 // Adjust the mask to match the new locations of A and B.
14001 for (int &M : Mask)
14002 if (M >= 0 && M/2 == ADWord)
14003 M = 2 * BDWord + M % 2;
14004 else if (M >= 0 && M/2 == BDWord)
14005 M = 2 * ADWord + M % 2;
14006
14007 // Recurse back into this routine to re-compute state now that this isn't
14008 // a 3 and 1 problem.
14009 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14010 };
14011 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14012 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14013 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14014 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14015
14016 // At this point there are at most two inputs to the low and high halves from
14017 // each half. That means the inputs can always be grouped into dwords and
14018 // those dwords can then be moved to the correct half with a dword shuffle.
14019 // We use at most one low and one high word shuffle to collect these paired
14020 // inputs into dwords, and finally a dword shuffle to place them.
14021 int PSHUFLMask[4] = {-1, -1, -1, -1};
14022 int PSHUFHMask[4] = {-1, -1, -1, -1};
14023 int PSHUFDMask[4] = {-1, -1, -1, -1};
14024
14025 // First fix the masks for all the inputs that are staying in their
14026 // original halves. This will then dictate the targets of the cross-half
14027 // shuffles.
14028 auto fixInPlaceInputs =
14029 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14030 MutableArrayRef<int> SourceHalfMask,
14031 MutableArrayRef<int> HalfMask, int HalfOffset) {
14032 if (InPlaceInputs.empty())
14033 return;
14034 if (InPlaceInputs.size() == 1) {
14035 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14036 InPlaceInputs[0] - HalfOffset;
14037 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14038 return;
14039 }
14040 if (IncomingInputs.empty()) {
14041 // Just fix all of the in place inputs.
14042 for (int Input : InPlaceInputs) {
14043 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14044 PSHUFDMask[Input / 2] = Input / 2;
14045 }
14046 return;
14047 }
14048
14049        assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14050 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14051 InPlaceInputs[0] - HalfOffset;
14052 // Put the second input next to the first so that they are packed into
14053 // a dword. We find the adjacent index by toggling the low bit.
14054 int AdjIndex = InPlaceInputs[0] ^ 1;
14055 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14056 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
14057 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14058 };
14059 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14060 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14061
14062 // Now gather the cross-half inputs and place them into a free dword of
14063 // their target half.
14064 // FIXME: This operation could almost certainly be simplified dramatically to
14065 // look more like the 3-1 fixing operation.
14066 auto moveInputsToRightHalf = [&PSHUFDMask](
14067 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14068 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14069 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14070 int DestOffset) {
14071 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14072 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14073 };
14074 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14075 int Word) {
14076 int LowWord = Word & ~1;
14077 int HighWord = Word | 1;
14078 return isWordClobbered(SourceHalfMask, LowWord) ||
14079 isWordClobbered(SourceHalfMask, HighWord);
14080 };
14081
14082 if (IncomingInputs.empty())
14083 return;
14084
14085 if (ExistingInputs.empty()) {
14086 // Map any dwords with inputs from them into the right half.
14087 for (int Input : IncomingInputs) {
14088 // If the source half mask maps over the inputs, turn those into
14089 // swaps and use the swapped lane.
14090 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14091 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14092 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14093 Input - SourceOffset;
14094 // We have to swap the uses in our half mask in one sweep.
14095 for (int &M : HalfMask)
14096 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14097 M = Input;
14098 else if (M == Input)
14099 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14100 } else {
14101            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14102                       Input - SourceOffset &&
14103                   "Previous placement doesn't match!");
14104 }
14105 // Note that this correctly re-maps both when we do a swap and when
14106 // we observe the other side of the swap above. We rely on that to
14107 // avoid swapping the members of the input list directly.
14108 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14109 }
14110
14111 // Map the input's dword into the correct half.
14112 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14113 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14114 else
14115          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14116                     Input / 2 &&
14117                 "Previous placement doesn't match!");
14118 }
14119
14120 // And just directly shift any other-half mask elements to be same-half
14121 // as we will have mirrored the dword containing the element into the
14122 // same position within that half.
14123 for (int &M : HalfMask)
14124 if (M >= SourceOffset && M < SourceOffset + 4) {
14125 M = M - SourceOffset + DestOffset;
14126          assert(M >= 0 && "This should never wrap below zero!");
14127 }
14128 return;
14129 }
14130
14131 // Ensure we have the input in a viable dword of its current half. This
14132 // is particularly tricky because the original position may be clobbered
14133 // by inputs being moved and *staying* in that half.
14134 if (IncomingInputs.size() == 1) {
14135 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14136 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14137 SourceOffset;
14138 SourceHalfMask[InputFixed - SourceOffset] =
14139 IncomingInputs[0] - SourceOffset;
14140 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
14141 InputFixed);
14142 IncomingInputs[0] = InputFixed;
14143 }
14144 } else if (IncomingInputs.size() == 2) {
14145 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14146 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14147 // We have two non-adjacent or clobbered inputs we need to extract from
14148 // the source half. To do this, we need to map them into some adjacent
14149 // dword slot in the source mask.
14150 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14151 IncomingInputs[1] - SourceOffset};
14152
14153 // If there is a free slot in the source half mask adjacent to one of
14154 // the inputs, place the other input in it. We use (Index XOR 1) to
14155 // compute an adjacent index.
14156 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14157 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14158 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14159 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14160 InputsFixed[1] = InputsFixed[0] ^ 1;
14161 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14162 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14163 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14164 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14165 InputsFixed[0] = InputsFixed[1] ^ 1;
14166 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14167 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14168 // The two inputs are in the same DWord but it is clobbered and the
14169 // adjacent DWord isn't used at all. Move both inputs to the free
14170 // slot.
14171 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14172 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14173 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14174 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14175 } else {
14176 // The only way we hit this point is if there is no clobbering
14177 // (because there are no off-half inputs to this half) and there is no
14178 // free slot adjacent to one of the inputs. In this case, we have to
14179 // swap an input with a non-input.
14180 for (int i = 0; i < 4; ++i)
14181            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14182                   "We can't handle any clobbers here!");
14183          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14184                 "Cannot have adjacent inputs here!");
14185
14186 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14187 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14188
14189 // We also have to update the final source mask in this case because
14190 // it may need to undo the above swap.
14191 for (int &M : FinalSourceHalfMask)
14192 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14193 M = InputsFixed[1] + SourceOffset;
14194 else if (M == InputsFixed[1] + SourceOffset)
14195 M = (InputsFixed[0] ^ 1) + SourceOffset;
14196
14197 InputsFixed[1] = InputsFixed[0] ^ 1;
14198 }
14199
14200 // Point everything at the fixed inputs.
14201 for (int &M : HalfMask)
14202 if (M == IncomingInputs[0])
14203 M = InputsFixed[0] + SourceOffset;
14204 else if (M == IncomingInputs[1])
14205 M = InputsFixed[1] + SourceOffset;
14206
14207 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14208 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14209 }
14210 } else {
14211        llvm_unreachable("Unhandled input size!");
14212 }
14213
14214 // Now hoist the DWord down to the right half.
14215 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14216    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14217 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14218 for (int &M : HalfMask)
14219 for (int Input : IncomingInputs)
14220 if (M == Input)
14221 M = FreeDWord * 2 + Input % 2;
14222 };
14223 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14224 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14225 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14226 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14227
14228 // Now enact all the shuffles we've computed to move the inputs into their
14229 // target half.
14230 if (!isNoopShuffleMask(PSHUFLMask))
14231 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14232 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14233 if (!isNoopShuffleMask(PSHUFHMask))
14234 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14235 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14236 if (!isNoopShuffleMask(PSHUFDMask))
14237 V = DAG.getBitcast(
14238 VT,
14239 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14240 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14241
14242 // At this point, each half should contain all its inputs, and we can then
14243 // just shuffle them into their final position.
14244  assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
14245         "Failed to lift all the high half inputs to the low mask!");
14246  assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
14247         "Failed to lift all the low half inputs to the high mask!");
14248
14249 // Do a half shuffle for the low mask.
14250 if (!isNoopShuffleMask(LoMask))
14251 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14252 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14253
14254 // Do a half shuffle with the high mask after shifting its values down.
14255 for (int &M : HiMask)
14256 if (M >= 0)
14257 M -= 4;
14258 if (!isNoopShuffleMask(HiMask))
14259 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14260 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14261
14262 return V;
14263}
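
For reference, the getV4X86ShuffleImm8ForMask calls above encode a 4-element shuffle mask into the 8-bit immediate used by PSHUFD/PSHUFLW/PSHUFHW, two bits per destination element. A minimal standalone sketch of that encoding (the helper name and the undef-as-index-0 policy are illustrative assumptions, not the LLVM API):

#include <array>
#include <cassert>
#include <cstdint>

// Pack one 2-bit source index per destination element into the PSHUF* imm8.
// Undef lanes (-1) are assumed to be free and encoded as index 0.
static uint8_t encodeV4ShuffleImm(const std::array<int, 4> &Mask) {
  uint8_t Imm = 0;
  for (int i = 0; i != 4; ++i) {
    assert(Mask[i] >= -1 && Mask[i] < 4 && "Out of range shuffle index");
    Imm |= static_cast<uint8_t>(Mask[i] < 0 ? 0 : Mask[i]) << (i * 2);
  }
  return Imm;
}

int main() {
  // <2, 3, 0, 1> swaps the two dword pairs: 0b01001110 == 0x4E.
  assert(encodeV4ShuffleImm({2, 3, 0, 1}) == 0x4E);
  return 0;
}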
14264
14265/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14266/// blend if only one input is used.
14267static SDValue lowerShuffleAsBlendOfPSHUFBs(
14268 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14269 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14270 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
14271 "Lane crossing shuffle masks not supported");
14272
14273 int NumBytes = VT.getSizeInBits() / 8;
14274 int Size = Mask.size();
14275 int Scale = NumBytes / Size;
14276
14277 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14278 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14279 V1InUse = false;
14280 V2InUse = false;
14281
14282 for (int i = 0; i < NumBytes; ++i) {
14283 int M = Mask[i / Scale];
14284 if (M < 0)
14285 continue;
14286
14287 const int ZeroMask = 0x80;
14288 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14289 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14290 if (Zeroable[i / Scale])
14291 V1Idx = V2Idx = ZeroMask;
14292
14293 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14294 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14295 V1InUse |= (ZeroMask != V1Idx);
14296 V2InUse |= (ZeroMask != V2Idx);
14297 }
14298
14299 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14300 if (V1InUse)
14301 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14302 DAG.getBuildVector(ShufVT, DL, V1Mask));
14303 if (V2InUse)
14304 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14305 DAG.getBuildVector(ShufVT, DL, V2Mask));
14306
14307 // If we need shuffled inputs from both, blend the two.
14308 SDValue V;
14309 if (V1InUse && V2InUse)
14310 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14311 else
14312 V = V1InUse ? V1 : V2;
14313
14314 // Cast the result back to the correct type.
14315 return DAG.getBitcast(VT, V);
14316}
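
A minimal standalone sketch of the control-vector construction above, stripped of the SelectionDAG plumbing and the Zeroable handling; the struct and helper names are illustrative, not LLVM APIs:

#include <cstdio>
#include <vector>

struct PSHUFBControls {
  std::vector<int> V1Ctl, V2Ctl;
  bool V1InUse = false, V2InUse = false;
};

// Mask indices < Size select from V1, indices >= Size select from V2, and a
// control byte of 0x80 zeroes that lane (mirroring the ZeroMask logic above).
static PSHUFBControls buildControls(const std::vector<int> &Mask, int NumBytes) {
  const int ZeroMask = 0x80;   // PSHUFB control with bit 7 set zeroes the lane
  const int Undef = -1;        // stands in for an undef control byte
  int Size = static_cast<int>(Mask.size());
  int Scale = NumBytes / Size; // bytes per mask element
  PSHUFBControls C;
  C.V1Ctl.assign(NumBytes, Undef);
  C.V2Ctl.assign(NumBytes, Undef);
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Scale];
    if (M < 0)
      continue;                // undef mask element: control bytes stay undef
    int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
    int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
    C.V1Ctl[i] = V1Idx;
    C.V2Ctl[i] = V2Idx;
    C.V1InUse |= V1Idx != ZeroMask;
    C.V2InUse |= V2Idx != ZeroMask;
  }
  return C;
}

int main() {
  // v8i16 mask taking elements 0..3 from V1 and 8..11 (i.e. V2[0..3]) from V2.
  PSHUFBControls C = buildControls({0, 1, 2, 3, 8, 9, 10, 11}, 16);
  printf("V1 in use: %d, V2 in use: %d\n", C.V1InUse, C.V2InUse);
  return 0;
}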
14317
14318/// Generic lowering of 8-lane i16 shuffles.
14319///
14320/// This handles both single-input shuffles and combined shuffle/blends with
14321/// two inputs. The single input shuffles are immediately delegated to
14322/// a dedicated lowering routine.
14323///
14324/// The blends are lowered in one of three fundamental ways. If there are few
14325/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14326/// of the input is significantly cheaper when lowered as an interleaving of
14327/// the two inputs, try to interleave them. Otherwise, blend the low and high
14328/// halves of the inputs separately (making them have relatively few inputs)
14329/// and then concatenate them.
14330static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14331 const APInt &Zeroable, SDValue V1, SDValue V2,
14332 const X86Subtarget &Subtarget,
14333 SelectionDAG &DAG) {
14334 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14335 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14336 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14337
14338 // Whenever we can lower this as a zext, that instruction is strictly faster
14339 // than any alternative.
14340 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14341 Zeroable, Subtarget, DAG))
14342 return ZExt;
14343
14344 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14345
14346 if (NumV2Inputs == 0) {
14347 // Try to use shift instructions.
14348 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
14349 Zeroable, Subtarget, DAG))
14350 return Shift;
14351
14352 // Check for being able to broadcast a single element.
14353 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14354 Mask, Subtarget, DAG))
14355 return Broadcast;
14356
14357 // Try to use bit rotation instructions.
14358 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14359 Subtarget, DAG))
14360 return Rotate;
14361
14362 // Use dedicated unpack instructions for masks that match their pattern.
14363 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
14364 return V;
14365
14366 // Use dedicated pack instructions for masks that match their pattern.
14367 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
14368 Subtarget))
14369 return V;
14370
14371 // Try to use byte rotation instructions.
14372 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14373 Subtarget, DAG))
14374 return Rotate;
14375
14376 // Make a copy of the mask so it can be modified.
14377 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
14378 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14379 Subtarget, DAG);
14380 }
14381
14382 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14383 "All single-input shuffles should be canonicalized to be V1-input "
14384 "shuffles.");
14385
14386 // Try to use shift instructions.
14387 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
14388 Zeroable, Subtarget, DAG))
14389 return Shift;
14390
14391 // See if we can use SSE4A Extraction / Insertion.
14392 if (Subtarget.hasSSE4A())
14393 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14394 Zeroable, DAG))
14395 return V;
14396
14397 // There are special ways we can lower some single-element blends.
14398 if (NumV2Inputs == 1)
14399 if (SDValue V = lowerShuffleAsElementInsertion(
14400 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14401 return V;
14402
14403 // We have different paths for blend lowering, but they all must use the
14404 // *exact* same predicate.
14405 bool IsBlendSupported = Subtarget.hasSSE41();
14406 if (IsBlendSupported)
14407 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14408 Zeroable, Subtarget, DAG))
14409 return Blend;
14410
14411 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14412 Zeroable, Subtarget, DAG))
14413 return Masked;
14414
14415 // Use dedicated unpack instructions for masks that match their pattern.
14416 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
14417 return V;
14418
14419 // Use dedicated pack instructions for masks that match their pattern.
14420 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
14421 Subtarget))
14422 return V;
14423
14424 // Try to use byte rotation instructions.
14425 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14426 Subtarget, DAG))
14427 return Rotate;
14428
14429 if (SDValue BitBlend =
14430 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14431 return BitBlend;
14432
14433 // Try to use byte shift instructions to mask.
14434 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14435 Zeroable, Subtarget, DAG))
14436 return V;
14437
14438 // Try to lower by permuting the inputs into an unpack instruction.
14439 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14440 Mask, Subtarget, DAG))
14441 return Unpack;
14442
14443 // If we can't directly blend but can use PSHUFB, that will be better as it
14444 // can both shuffle and set up the inefficient blend.
14445 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14446 bool V1InUse, V2InUse;
14447 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14448 Zeroable, DAG, V1InUse, V2InUse);
14449 }
14450
14451 // We can always bit-blend if we have to so the fallback strategy is to
14452 // decompose into single-input permutes and blends.
14453 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
14454 Mask, Subtarget, DAG);
14455}
14456
14457/// Check whether a compaction lowering can be done by dropping even
14458/// elements and compute how many times even elements must be dropped.
14459///
14460/// This handles shuffles which take every Nth element where N is a power of
14461/// two. Example shuffle masks:
14462///
14463/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
14464/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
14465/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
14466/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
14467/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
14468/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
14469///
14470/// Any of these lanes can of course be undef.
14471///
14472/// This routine only supports N <= 3.
14473/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
14474/// for larger N.
14475///
14476/// \returns N above, or the number of times even elements must be dropped if
14477/// there is such a number. Otherwise returns zero.
14478static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
14479 bool IsSingleInput) {
14480 // The modulus for the shuffle vector entries is based on whether this is
14481 // a single input or not.
14482 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
14483 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
14484 "We should only be called with masks with a power-of-2 size!");
14485
14486 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
14487
14488 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
14489 // and 2^3 simultaneously. This is because we may have ambiguity with
14490 // partially undef inputs.
14491 bool ViableForN[3] = {true, true, true};
14492
14493 for (int i = 0, e = Mask.size(); i < e; ++i) {
14494 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
14495 // want.
14496 if (Mask[i] < 0)
14497 continue;
14498
14499 bool IsAnyViable = false;
14500 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
14501 if (ViableForN[j]) {
14502 uint64_t N = j + 1;
14503
14504 // The shuffle mask must be equal to (i * 2^N) % M.
14505 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
14506 IsAnyViable = true;
14507 else
14508 ViableForN[j] = false;
14509 }
14510 // Early exit if we exhaust the possible powers of two.
14511 if (!IsAnyViable)
14512 break;
14513 }
14514
14515 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
14516 if (ViableForN[j])
14517 return j + 1;
14518
14519 // Return 0 as there is no viable power of two.
14520 return 0;
14521}
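
A standalone sketch of the same viability check, without the early exit, run against one of the example masks from the comment block above; the helper name is illustrative:

#include <cassert>
#include <cstdint>
#include <vector>

// Returns N in 1..3 if the mask takes every 2^N-th element (undef lanes are
// treated as wildcards), otherwise 0.
static int dropEvenCount(const std::vector<int> &Mask, bool IsSingleInput) {
  uint64_t ModMask = Mask.size() * (IsSingleInput ? 1 : 2) - 1;
  bool ViableForN[3] = {true, true, true};
  int e = static_cast<int>(Mask.size());
  for (int i = 0; i < e; ++i) {
    if (Mask[i] < 0)
      continue;
    for (unsigned j = 0; j < 3; ++j)
      if (ViableForN[j] &&
          (uint64_t)Mask[i] != (((uint64_t)i << (j + 1)) & ModMask))
        ViableForN[j] = false;
  }
  for (unsigned j = 0; j < 3; ++j)
    if (ViableForN[j])
      return j + 1;
  return 0;
}

int main() {
  // Second N = 1 example from the comment block above (two-input form).
  std::vector<int> M = {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
  assert(dropEvenCount(M, /*IsSingleInput=*/false) == 1);
  return 0;
}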
14522
14523static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14524 ArrayRef<int> Mask, SDValue V1,
14525 SDValue V2, SelectionDAG &DAG) {
14526 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
14527 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
14528
14529 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
14530 if (V2.isUndef())
14531 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
14532
14533 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
14534}
14535
14536/// Generic lowering of v16i8 shuffles.
14537///
14538/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14539/// detect any complexity reducing interleaving. If that doesn't help, it uses
14540/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14541/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14542/// back together.
14543static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14544 const APInt &Zeroable, SDValue V1, SDValue V2,
14545 const X86Subtarget &Subtarget,
14546 SelectionDAG &DAG) {
14547 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14548 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14549 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14550
14551 // Try to use shift instructions.
14552 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
14553 Zeroable, Subtarget, DAG))
14554 return Shift;
14555
14556 // Try to use byte rotation instructions.
14557 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14558 Subtarget, DAG))
14559 return Rotate;
14560
14561 // Use dedicated pack instructions for masks that match their pattern.
14562 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
14563 Subtarget))
14564 return V;
14565
14566 // Try to use a zext lowering.
14567 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14568 Zeroable, Subtarget, DAG))
14569 return ZExt;
14570
14571 // See if we can use SSE4A Extraction / Insertion.
14572 if (Subtarget.hasSSE4A())
14573 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14574 Zeroable, DAG))
14575 return V;
14576
14577 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14578
14579 // For single-input shuffles, there are some nicer lowering tricks we can use.
14580 if (NumV2Elements == 0) {
14581 // Check for being able to broadcast a single element.
14582 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14583 Mask, Subtarget, DAG))
14584 return Broadcast;
14585
14586 // Try to use bit rotation instructions.
14587 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14588 Subtarget, DAG))
14589 return Rotate;
14590
14591 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14592 return V;
14593
14594 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14595 // Notably, this handles splat and partial-splat shuffles more efficiently.
14596 // However, it only makes sense if the pre-duplication shuffle simplifies
14597 // things significantly. Currently, this means we need to be able to
14598 // express the pre-duplication shuffle as an i16 shuffle.
14599 //
14600 // FIXME: We should check for other patterns which can be widened into an
14601 // i16 shuffle as well.
14602 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14603 for (int i = 0; i < 16; i += 2)
14604 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14605 return false;
14606
14607 return true;
14608 };
14609 auto tryToWidenViaDuplication = [&]() -> SDValue {
14610 if (!canWidenViaDuplication(Mask))
14611 return SDValue();
14612 SmallVector<int, 4> LoInputs;
14613 copy_if(Mask, std::back_inserter(LoInputs),
14614 [](int M) { return M >= 0 && M < 8; });
14615 array_pod_sort(LoInputs.begin(), LoInputs.end());
14616 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
14617 LoInputs.end());
14618 SmallVector<int, 4> HiInputs;
14619 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14620 array_pod_sort(HiInputs.begin(), HiInputs.end());
14621 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
14622 HiInputs.end());
14623
14624 bool TargetLo = LoInputs.size() >= HiInputs.size();
14625 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14626 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14627
14628 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14629 SmallDenseMap<int, int, 8> LaneMap;
14630 for (int I : InPlaceInputs) {
14631 PreDupI16Shuffle[I/2] = I/2;
14632 LaneMap[I] = I;
14633 }
14634 int j = TargetLo ? 0 : 4, je = j + 4;
14635 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14636 // Check if j is already a shuffle of this input. This happens when
14637 // there are two adjacent bytes after we move the low one.
14638 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14639 // If we haven't yet mapped the input, search for a slot into which
14640 // we can map it.
14641 while (j < je && PreDupI16Shuffle[j] >= 0)
14642 ++j;
14643
14644 if (j == je)
14645 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14646 return SDValue();
14647
14648 // Map this input with the i16 shuffle.
14649 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14650 }
14651
14652 // Update the lane map based on the mapping we ended up with.
14653 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14654 }
14655 V1 = DAG.getBitcast(
14656 MVT::v16i8,
14657 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14658 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14659
14660 // Unpack the bytes to form the i16s that will be shuffled into place.
14661 bool EvenInUse = false, OddInUse = false;
14662 for (int i = 0; i < 16; i += 2) {
14663 EvenInUse |= (Mask[i + 0] >= 0);
14664 OddInUse |= (Mask[i + 1] >= 0);
14665 if (EvenInUse && OddInUse)
14666 break;
14667 }
14668 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14669 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14670 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14671
14672 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14673 for (int i = 0; i < 16; ++i)
14674 if (Mask[i] >= 0) {
14675 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14676 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14677 if (PostDupI16Shuffle[i / 2] < 0)
14678 PostDupI16Shuffle[i / 2] = MappedMask;
14679 else
14680 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14681 "Conflicting entries in the original shuffle!");
14682 }
14683 return DAG.getBitcast(
14684 MVT::v16i8,
14685 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14686 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14687 };
14688 if (SDValue V = tryToWidenViaDuplication())
14689 return V;
14690 }
14691
14692 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14693 Zeroable, Subtarget, DAG))
14694 return Masked;
14695
14696 // Use dedicated unpack instructions for masks that match their pattern.
14697 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14698 return V;
14699
14700 // Try to use byte shift instructions to mask.
14701 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14702 Zeroable, Subtarget, DAG))
14703 return V;
14704
14705 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14706 // with PSHUFB. It is important to do this before we attempt to generate any
14707 // blends but after all of the single-input lowerings. If the single input
14708 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14709 // want to preserve that and we can DAG combine any longer sequences into
14710 // a PSHUFB in the end. But once we start blending from multiple inputs,
14711 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14712 // and there are *very* few patterns that would actually be faster than the
14713 // PSHUFB approach because of its ability to zero lanes.
14714 //
14715 // FIXME: The only exceptions to the above are blends which are exact
14716 // interleavings with direct instructions supporting them. We currently don't
14717 // handle those well here.
14718 if (Subtarget.hasSSSE3()) {
14719 bool V1InUse = false;
14720 bool V2InUse = false;
14721
14722 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14723 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14724
14725 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14726 // do so. This avoids using them to handle blends-with-zero which is
14727 // important as a single pshufb is significantly faster for that.
14728 if (V1InUse && V2InUse) {
14729 if (Subtarget.hasSSE41())
14730 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14731 Zeroable, Subtarget, DAG))
14732 return Blend;
14733
14734 // We can use an unpack to do the blending rather than an or in some
14735 // cases. Even though the or may be (very minorly) more efficient, we
14736 // prefer this lowering because there are common cases where part of
14737 // the complexity of the shuffles goes away when we do the final blend as
14738 // an unpack.
14739 // FIXME: It might be worth trying to detect if the unpack-feeding
14740 // shuffles will both be pshufb, in which case we shouldn't bother with
14741 // this.
14742 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14743 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14744 return Unpack;
14745
14746 // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
14747 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
14748 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
14749
14750 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14751 // PALIGNR will be cheaper than the second PSHUFB+OR.
14752 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14753 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14754 return V;
14755 }
14756
14757 return PSHUFB;
14758 }
14759
14760 // There are special ways we can lower some single-element blends.
14761 if (NumV2Elements == 1)
14762 if (SDValue V = lowerShuffleAsElementInsertion(
14763 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14764 return V;
14765
14766 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14767 return Blend;
14768
14769 // Check whether a compaction lowering can be done. This handles shuffles
14770 // which take every Nth element for some even N. See the helper function for
14771 // details.
14772 //
14773 // We special case these as they can be particularly efficiently handled with
14774 // the PACKUSWB instruction on x86 and they show up in common patterns of
14775 // rearranging bytes to truncate wide elements.
14776 bool IsSingleInput = V2.isUndef();
14777 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
14778 // NumEvenDrops is the power of two stride of the elements. Another way of
14779 // thinking about it is that we need to drop the even elements this many
14780 // times to get the original input.
14781
14782 // First we need to zero all the dropped bytes.
14783 assert(NumEvenDrops <= 3 &&
14784 "No support for dropping even elements more than 3 times.");
14785 SmallVector<SDValue, 16> ByteClearOps(16, DAG.getConstant(0, DL, MVT::i8));
14786 for (unsigned i = 0; i != 16; i += 1 << NumEvenDrops)
14787 ByteClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i8);
14788 SDValue ByteClearMask = DAG.getBuildVector(MVT::v16i8, DL, ByteClearOps);
14789 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
14790 if (!IsSingleInput)
14791 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
14792
14793 // Now pack things back together.
14794 V1 = DAG.getBitcast(MVT::v8i16, V1);
14795 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
14796 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
14797 for (int i = 1; i < NumEvenDrops; ++i) {
14798 Result = DAG.getBitcast(MVT::v8i16, Result);
14799 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14800 }
14801
14802 return Result;
14803 }
14804
14805 // Handle multi-input cases by blending single-input shuffles.
14806 if (NumV2Elements > 0)
14807 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask,
14808 Subtarget, DAG);
14809
14810 // The fallback path for single-input shuffles widens this into two v8i16
14811 // vectors with unpacks, shuffles those, and then pulls them back together
14812 // with a pack.
14813 SDValue V = V1;
14814
14815 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14816 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14817 for (int i = 0; i < 16; ++i)
14818 if (Mask[i] >= 0)
14819 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
14820
14821 SDValue VLoHalf, VHiHalf;
14822 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
14823 // them out and avoid using UNPCK{L,H} to extract the elements of V as
14824 // i16s.
14825 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
14826 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
14827 // Use a mask to drop the high bytes.
14828 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
14829 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
14830 DAG.getConstant(0x00FF, DL, MVT::v8i16));
14831
14832 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
14833 VHiHalf = DAG.getUNDEF(MVT::v8i16);
14834
14835 // Squash the masks to point directly into VLoHalf.
14836 for (int &M : LoBlendMask)
14837 if (M >= 0)
14838 M /= 2;
14839 for (int &M : HiBlendMask)
14840 if (M >= 0)
14841 M /= 2;
14842 } else {
14843 // Otherwise just unpack the low half of V into VLoHalf and the high half into
14844 // VHiHalf so that we can blend them as i16s.
14845 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
14846
14847 VLoHalf = DAG.getBitcast(
14848 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
14849 VHiHalf = DAG.getBitcast(
14850 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
14851 }
14852
14853 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
14854 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
14855
14856 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
14857}
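
A scalar model of the compaction path above for NumEvenDrops == 1: the AND with the byte-clear mask is modelled by keeping only the even byte of each i16 word (little-endian lane layout assumed), and PACKUS is modelled as unsigned-saturating truncation; names are illustrative, not LLVM APIs:

#include <array>
#include <cassert>
#include <cstdint>

// Pack two vectors of 16-bit words back to bytes with unsigned saturation,
// as PACKUSWB does for a single 128-bit register pair.
static std::array<uint8_t, 16> packus(const std::array<uint16_t, 8> &A,
                                      const std::array<uint16_t, 8> &B) {
  auto Sat = [](uint16_t W) -> uint8_t { return W > 0xFF ? 0xFF : uint8_t(W); };
  std::array<uint8_t, 16> R{};
  for (int i = 0; i < 8; ++i) {
    R[i] = Sat(A[i]);
    R[i + 8] = Sat(B[i]);
  }
  return R;
}

int main() {
  std::array<uint8_t, 16> V1{}, V2{};
  for (int i = 0; i < 16; ++i) {
    V1[i] = uint8_t(i);        // 0, 1, 2, ...
    V2[i] = uint8_t(0x10 + i); // 0x10, 0x11, ...
  }
  std::array<uint16_t, 8> W1{}, W2{};
  for (int i = 0; i < 8; ++i) {
    // AND with the byte-clear mask (0xFF on even bytes only), viewed as i16:
    // the low byte of each word survives, the high byte is cleared.
    W1[i] = V1[2 * i];
    W2[i] = V2[2 * i];
  }
  std::array<uint8_t, 16> R = packus(W1, W2);
  // The result is the even bytes of V1 followed by the even bytes of V2.
  assert(R[0] == 0 && R[1] == 2 && R[8] == 0x10 && R[9] == 0x12);
  return 0;
}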
14858
14859/// Dispatching routine to lower various 128-bit x86 vector shuffles.
14860///
14861/// This routine breaks down the specific type of 128-bit shuffle and
14862/// dispatches to the lowering routines accordingly.
14863static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14864 MVT VT, SDValue V1, SDValue V2,
14865 const APInt &Zeroable,
14866 const X86Subtarget &Subtarget,
14867 SelectionDAG &DAG) {
14868 switch (VT.SimpleTy) {
14869 case MVT::v2i64:
14870 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14871 case MVT::v2f64:
14872 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14873 case MVT::v4i32:
14874 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14875 case MVT::v4f32:
14876 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14877 case MVT::v8i16:
14878 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14879 case MVT::v16i8:
14880 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14881
14882 default:
14883 llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14883)
;
14884 }
14885}
14886
14887/// Generic routine to split vector shuffle into half-sized shuffles.
14888///
14889/// This routine just extracts two subvectors, shuffles them independently, and
14890/// then concatenates them back together. This should work effectively with all
14891/// AVX vector shuffle types.
14892static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
14893 SDValue V2, ArrayRef<int> Mask,
14894 SelectionDAG &DAG) {
14895 assert(VT.getSizeInBits() >= 256 &&
14896 "Only for 256-bit or wider vector shuffles!");
14897 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
14898 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
14899
14900 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
14901 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
14902
14903 int NumElements = VT.getVectorNumElements();
14904 int SplitNumElements = NumElements / 2;
14905 MVT ScalarVT = VT.getVectorElementType();
14906 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
14907
14908 // Rather than splitting build-vectors, just build two narrower build
14909 // vectors. This helps shuffling with splats and zeros.
14910 auto SplitVector = [&](SDValue V) {
14911 V = peekThroughBitcasts(V);
14912
14913 MVT OrigVT = V.getSimpleValueType();
14914 int OrigNumElements = OrigVT.getVectorNumElements();
14915 int OrigSplitNumElements = OrigNumElements / 2;
14916 MVT OrigScalarVT = OrigVT.getVectorElementType();
14917 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
14918
14919 SDValue LoV, HiV;
14920
14921 auto *BV = dyn_cast<BuildVectorSDNode>(V);
14922 if (!BV) {
14923 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
14924 DAG.getIntPtrConstant(0, DL));
14925 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
14926 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
14927 } else {
14928
14929 SmallVector<SDValue, 16> LoOps, HiOps;
14930 for (int i = 0; i < OrigSplitNumElements; ++i) {
14931 LoOps.push_back(BV->getOperand(i));
14932 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
14933 }
14934 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
14935 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
14936 }
14937 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
14938 DAG.getBitcast(SplitVT, HiV));
14939 };
14940
14941 SDValue LoV1, HiV1, LoV2, HiV2;
14942 std::tie(LoV1, HiV1) = SplitVector(V1);
14943 std::tie(LoV2, HiV2) = SplitVector(V2);
14944
14945 // Now create two 4-way blends of these half-width vectors.
14946 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
14947 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
14948 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14949 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14950 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14951 for (int i = 0; i < SplitNumElements; ++i) {
14952 int M = HalfMask[i];
14953 if (M >= NumElements) {
14954 if (M >= NumElements + SplitNumElements)
14955 UseHiV2 = true;
14956 else
14957 UseLoV2 = true;
14958 V2BlendMask[i] = M - NumElements;
14959 BlendMask[i] = SplitNumElements + i;
14960 } else if (M >= 0) {
14961 if (M >= SplitNumElements)
14962 UseHiV1 = true;
14963 else
14964 UseLoV1 = true;
14965 V1BlendMask[i] = M;
14966 BlendMask[i] = i;
14967 }
14968 }
14969
14970 // Because the lowering happens after all combining takes place, we need to
14971 // manually combine these blend masks as much as possible so that we create
14972 // a minimal number of high-level vector shuffle nodes.
14973
14974 // First try just blending the halves of V1 or V2.
14975 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
14976 return DAG.getUNDEF(SplitVT);
14977 if (!UseLoV2 && !UseHiV2)
14978 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14979 if (!UseLoV1 && !UseHiV1)
14980 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14981
14982 SDValue V1Blend, V2Blend;
14983 if (UseLoV1 && UseHiV1) {
14984 V1Blend =
14985 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14986 } else {
14987 // We only use half of V1 so map the usage down into the final blend mask.
14988 V1Blend = UseLoV1 ? LoV1 : HiV1;
14989 for (int i = 0; i < SplitNumElements; ++i)
14990 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
14991 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14992 }
14993 if (UseLoV2 && UseHiV2) {
14994 V2Blend =
14995 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14996 } else {
14997 // We only use half of V2 so map the usage down into the final blend mask.
14998 V2Blend = UseLoV2 ? LoV2 : HiV2;
14999 for (int i = 0; i < SplitNumElements; ++i)
15000 if (BlendMask[i] >= SplitNumElements)
15001 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15002 }
15003 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15004 };
15005 SDValue Lo = HalfBlend(LoMask);
15006 SDValue Hi = HalfBlend(HiMask);
15007 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15008}
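
A standalone sketch of the HalfBlend bookkeeping above, showing how one half of a wide mask is split into per-input shuffle masks plus a final blend mask; the struct and helper names are illustrative:

#include <cstdio>
#include <vector>

struct HalfBlendMasks {
  std::vector<int> V1, V2, Blend;
};

// NumElements is the full-width element count; all results are expressed in
// half-width index space, mirroring V1BlendMask/V2BlendMask/BlendMask above.
static HalfBlendMasks halfBlendMasks(const std::vector<int> &HalfMask,
                                     int NumElements) {
  int SplitNumElements = NumElements / 2;
  HalfBlendMasks R;
  R.V1.assign(SplitNumElements, -1);
  R.V2.assign(SplitNumElements, -1);
  R.Blend.assign(SplitNumElements, -1);
  for (int i = 0; i < SplitNumElements; ++i) {
    int M = HalfMask[i];
    if (M >= NumElements) {            // element comes from V2
      R.V2[i] = M - NumElements;
      R.Blend[i] = SplitNumElements + i;
    } else if (M >= 0) {               // element comes from V1
      R.V1[i] = M;
      R.Blend[i] = i;
    }
  }
  return R;
}

int main() {
  // Low half of a v8 mask mixing both inputs: <0, 9, 2, 11>.
  HalfBlendMasks R = halfBlendMasks({0, 9, 2, 11}, 8);
  // R.V1 = {0, -1, 2, -1}, R.V2 = {-1, 1, -1, 3}, R.Blend = {0, 5, 2, 7}.
  printf("Blend[1] = %d\n", R.Blend[1]);
  return 0;
}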
15009
15010/// Either split a vector in halves or decompose the shuffles and the
15011/// blend.
15012///
15013/// This is provided as a good fallback for many lowerings of non-single-input
15014/// shuffles with more than one 128-bit lane. In those cases, we want to select
15015/// between splitting the shuffle into 128-bit components and stitching those
15016/// back together vs. extracting the single-input shuffles and blending those
15017/// results.
15018static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15019 SDValue V2, ArrayRef<int> Mask,
15020 const X86Subtarget &Subtarget,
15021 SelectionDAG &DAG) {
15022 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15023 "shuffles as it could then recurse on itself.");
15024 int Size = Mask.size();
15025
15026 // If this can be modeled as a broadcast of two elements followed by a blend,
15027 // prefer that lowering. This is especially important because broadcasts can
15028 // often fold with memory operands.
15029 auto DoBothBroadcast = [&] {
15030 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15031 for (int M : Mask)
15032 if (M >= Size) {
15033 if (V2BroadcastIdx < 0)
15034 V2BroadcastIdx = M - Size;
15035 else if (M - Size != V2BroadcastIdx)
15036 return false;
15037 } else if (M >= 0) {
15038 if (V1BroadcastIdx < 0)
15039 V1BroadcastIdx = M;
15040 else if (M != V1BroadcastIdx)
15041 return false;
15042 }
15043 return true;
15044 };
15045 if (DoBothBroadcast())
15046 return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
15047 Subtarget, DAG);
15048
15049 // If the inputs all stem from a single 128-bit lane of each input, then we
15050 // split them rather than blending because the split will decompose to
15051 // unusually few instructions.
15052 int LaneCount = VT.getSizeInBits() / 128;
15053 int LaneSize = Size / LaneCount;
15054 SmallBitVector LaneInputs[2];
15055 LaneInputs[0].resize(LaneCount, false);
15056 LaneInputs[1].resize(LaneCount, false);
15057 for (int i = 0; i < Size; ++i)
15058 if (Mask[i] >= 0)
15059 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15060 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15061 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
15062
15063 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
15064 // that the decomposed single-input shuffles don't end up here.
15065 return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget,
15066 DAG);
15067}
15068
15069// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15070// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15071static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15072 SDValue V1, SDValue V2,
15073 ArrayRef<int> Mask,
15074 SelectionDAG &DAG) {
15075 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15076
15077 int LHSMask[4] = {-1, -1, -1, -1};
15078 int RHSMask[4] = {-1, -1, -1, -1};
15079 unsigned SHUFPMask = 0;
15080
15081 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15082 // perform the shuffle once the lanes have been shuffled in place.
15083 for (int i = 0; i != 4; ++i) {
15084 int M = Mask[i];
15085 if (M < 0)
15086 continue;
15087 int LaneBase = i & ~1;
15088 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15089 LaneMask[LaneBase + (M & 1)] = M;
15090 SHUFPMask |= (M & 1) << i;
15091 }
15092
15093 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15094 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15095 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15096 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
15097}
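
A standalone sketch of the SHUFPD immediate computation above; the helper name is illustrative:

#include <cassert>
#include <vector>

// Each result element i takes bit i of the immediate from (Mask[i] & 1), i.e.
// the odd/even position of the source element within its 128-bit lane.
static unsigned shufpdImm(const std::vector<int> &Mask) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0)
      Imm |= unsigned(Mask[i] & 1) << i;
  return Imm;
}

int main() {
  // Mask <1, 4, 3, 6>: selected bits are 1, 0, 1, 0 -> immediate 0b0101 == 5.
  assert(shufpdImm({1, 4, 3, 6}) == 5);
  return 0;
}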
15098
15099/// Lower a vector shuffle crossing multiple 128-bit lanes as
15100/// a lane permutation followed by a per-lane permutation.
15101///
15102/// This is mainly for cases where we can have non-repeating permutes
15103/// in each lane.
15104///
15105/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15106/// we should investigate merging them.
15107static SDValue lowerShuffleAsLanePermuteAndPermute(
15108 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15109 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15110 int NumElts = VT.getVectorNumElements();
15111 int NumLanes = VT.getSizeInBits() / 128;
15112 int NumEltsPerLane = NumElts / NumLanes;
15113
15114 SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
15115 SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);
15116
15117 for (int i = 0; i != NumElts; ++i) {
15118 int M = Mask[i];
15119 if (M < 0)
15120 continue;
15121
15122 // Ensure that each lane comes from a single source lane.
15123 int SrcLane = M / NumEltsPerLane;
15124 int DstLane = i / NumEltsPerLane;
15125 if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane))
15126 return SDValue();
15127 SrcLaneMask[DstLane] = SrcLane;
15128
15129 PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
15130 }
15131
15132 // Make sure we set all elements of the lane mask, to avoid undef propagation.
15133 SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
15134 for (int DstLane = 0; DstLane != NumLanes; ++DstLane) {
15135 int SrcLane = SrcLaneMask[DstLane];
15136 if (0 <= SrcLane)
15137 for (int j = 0; j != NumEltsPerLane; ++j) {
15138 LaneMask[(DstLane * NumEltsPerLane) + j] =
15139 (SrcLane * NumEltsPerLane) + j;
15140 }
15141 }
15142
15143 // If we're only shuffling a single lowest lane and the rest are identity
15144 // then don't bother.
15145 // TODO - isShuffleMaskInputInPlace could be extended to something like this.
15146 int NumIdentityLanes = 0;
15147 bool OnlyShuffleLowestLane = true;
15148 for (int i = 0; i != NumLanes; ++i) {
15149 if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane,
15150 i * NumEltsPerLane))
15151 NumIdentityLanes++;
15152 else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes)
15153 OnlyShuffleLowestLane = false;
15154 }
15155 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15156 return SDValue();
15157
15158 SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask);
15159 return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
15160}
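
A standalone sketch of the mask decomposition above for the single-source case; the helper name is illustrative:

#include <cstdio>
#include <vector>

// Split a shuffle mask into (a) which source lane feeds each destination lane
// and (b) an in-lane permute, failing if any destination lane would need
// elements from two different source lanes.
static bool splitLanePermute(const std::vector<int> &Mask, int NumEltsPerLane,
                             std::vector<int> &SrcLane,
                             std::vector<int> &PermMask) {
  int NumElts = static_cast<int>(Mask.size());
  int NumLanes = NumElts / NumEltsPerLane;
  SrcLane.assign(NumLanes, -1);
  PermMask.assign(NumElts, -1);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int Src = M / NumEltsPerLane, Dst = i / NumEltsPerLane;
    if (SrcLane[Dst] >= 0 && SrcLane[Dst] != Src)
      return false; // this lane needs two source lanes, not handled here
    SrcLane[Dst] = Src;
    PermMask[i] = Dst * NumEltsPerLane + M % NumEltsPerLane;
  }
  return true;
}

int main() {
  // v8f32-style mask <4, 5, 7, 6, 0, 2, 1, 3>: the low lane reads lane 1, the
  // high lane reads lane 0, then each lane is permuted in place.
  std::vector<int> SrcLane, PermMask;
  bool OK = splitLanePermute({4, 5, 7, 6, 0, 2, 1, 3}, 4, SrcLane, PermMask);
  printf("viable: %d, high lane source: %d\n", OK, SrcLane[1]);
  return 0;
}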
15161
15162/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15163/// source with a lane permutation.
15164///
15165/// This lowering strategy results in four instructions in the worst case for a
15166/// single-input cross lane shuffle which is lower than any other fully general
15167/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15168/// shuffle pattern should be handled prior to trying this lowering.
15169static SDValue lowerShuffleAsLanePermuteAndShuffle(
15170 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15171 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15172 // FIXME: This should probably be generalized for 512-bit vectors as well.
15173 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15174 int Size = Mask.size();
15175 int LaneSize = Size / 2;
15176
15177 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15178 // Only do this if the elements aren't all from the lower lane,
15179 // otherwise we're (probably) better off doing a split.
15180 if (VT == MVT::v4f64 &&
15181 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15182 if (SDValue V =
15183 lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
15184 return V;
15185
15186 // If there are only inputs from one 128-bit lane, splitting will in fact be
15187 // less expensive. The flags track whether the given lane contains an element
15188 // that crosses to another lane.
15189 if (!Subtarget.hasAVX2()) {
15190 bool LaneCrossing[2] = {false, false};
15191 for (int i = 0; i < Size; ++i)
15192 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15193 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15194 if (!LaneCrossing[0] || !LaneCrossing[1])
15195 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
15196 } else {
15197 bool LaneUsed[2] = {false, false};
15198 for (int i = 0; i < Size; ++i)
15199 if (Mask[i] >= 0)
15200 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15201 if (!LaneUsed[0] || !LaneUsed[1])
15202 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
15203 }
15204
15205 // TODO - we could support shuffling V2 in the Flipped input.
15206 assert(V2.isUndef() &&
15207 "This last part of this routine only works on single input shuffles");
15208
15209 SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
15210 for (int i = 0; i < Size; ++i) {
15211 int &M = InLaneMask[i];
15212 if (M < 0)
15213 continue;
15214 if (((M % Size) / LaneSize) != (i / LaneSize))
15215 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15216 }
15217 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15218 "In-lane shuffle mask expected");
15219
15220 // Flip the lanes, and shuffle the results which should now be in-lane.
15221 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15222 SDValue Flipped = DAG.getBitcast(PVT, V1);
15223 Flipped =
15224 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15225 Flipped = DAG.getBitcast(VT, Flipped);
15226 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15227}
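
A standalone sketch of the final remapping step above, where cross-lane elements are redirected to the lane-flipped copy passed as the second shuffle operand; the helper name is illustrative:

#include <cstdio>
#include <vector>

// Elements whose source lane differs from their destination lane are remapped
// to indices >= Size, i.e. into the flipped operand, so the combined shuffle
// becomes purely in-lane.
static std::vector<int> remapToFlipped(const std::vector<int> &Mask,
                                       int LaneSize) {
  int Size = static_cast<int>(Mask.size());
  std::vector<int> InLane(Mask);
  for (int i = 0; i < Size; ++i) {
    int &M = InLane[i];
    if (M < 0)
      continue;
    if ((M % Size) / LaneSize != i / LaneSize)
      M = (M % LaneSize) + (i / LaneSize) * LaneSize + Size;
  }
  return InLane;
}

int main() {
  // v8f32 mask <4, 1, 6, 3, 0, 5, 2, 7> with LaneSize 4: the cross-lane
  // elements 4, 6, 0 and 2 are redirected into the flipped operand.
  std::vector<int> M = remapToFlipped({4, 1, 6, 3, 0, 5, 2, 7}, 4);
  printf("%d %d %d %d\n", M[0], M[2], M[4], M[6]); // prints 8 10 12 14
  return 0;
}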
15228
15229/// Handle lowering 2-lane 128-bit shuffles.
15230static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15231 SDValue V2, ArrayRef<int> Mask,
15232 const APInt &Zeroable,
15233 const X86Subtarget &Subtarget,
15234 SelectionDAG &DAG) {
15235 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15236 if (Subtarget.hasAVX2() && V2.isUndef())
15237 return SDValue();
15238
15239 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15240
15241 SmallVector<int, 4> WidenedMask;
15242 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15243 return SDValue();
15244
15245 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15246 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15247
15248 // Try to use an insert into a zero vector.
15249 if (WidenedMask[0] == 0 && IsHighZero) {
15250 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15251 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15252 DAG.getIntPtrConstant(0, DL));
15253 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15254 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15255 DAG.getIntPtrConstant(0, DL));
15256 }
15257
15258 // TODO: If minimizing size and one of the inputs is a zero vector and
15259 // the zero vector has only one use, we could use a VPERM2X128 to save the
15260 // instruction bytes needed to explicitly generate the zero vector.
15261
15262 // Blends are faster and handle all the non-lane-crossing cases.
15263 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15264 Subtarget, DAG))
15265 return Blend;
15266
15267 // If either input operand is a zero vector, use VPERM2X128 because its mask
15268 // allows us to replace the zero input with an implicit zero.
15269 if (!IsLowZero && !IsHighZero) {
15270 // Check for patterns which can be matched with a single insert of a 128-bit
15271 // subvector.
15272 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
15273 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
15274
15275 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15276 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15277 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15278 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15279 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
15280 OnlyUsesV1 ? V1 : V2,
15281 DAG.getIntPtrConstant(0, DL));
15282 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15283 DAG.getIntPtrConstant(2, DL));
15284 }
15285 }
15286
15287 // Try to use SHUF128 if possible.
15288 if (Subtarget.hasVLX()) {
15289 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15290 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15291 ((WidenedMask[1] % 2) << 1);
15292 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15293 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15294 }
15295 }
15296 }
15297
15298 // Otherwise form a 128-bit permutation. After accounting for undefs,
15299 // convert the 64-bit shuffle mask selection values into 128-bit
15300 // selection bits by dividing the indexes by 2 and shifting into positions
15301 // defined by a vperm2*128 instruction's immediate control byte.
15302
15303 // The immediate permute control byte looks like this:
15304 // [1:0] - select 128 bits from sources for low half of destination
15305 // [2] - ignore
15306 // [3] - zero low half of destination
15307 // [5:4] - select 128 bits from sources for high half of destination
15308 // [6] - ignore
15309 // [7] - zero high half of destination
15310
15311 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15312 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15313
15314 unsigned PermMask = 0;
15315 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15316 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15317
15318 // Check the immediate mask and replace unused sources with undef.
15319 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15320 V1 = DAG.getUNDEF(VT);
15321 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15322 V2 = DAG.getUNDEF(VT);
15323
15324 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15325 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15326}
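As a side note, the PermMask construction above is small enough to check by hand. The standalone sketch below (not part of the LLVM sources; the helper name buildVPerm2X128Imm and the bare int arguments are invented for illustration) mirrors it directly: bits [1:0] and [5:4] each pick one of the four 128-bit source halves (0/1 = V1 low/high, 2/3 = V2 low/high), while bits 3 and 7 zero the corresponding destination half.

#include <cassert>
#include <cstdio>

// Mirror of the immediate construction above: each nibble selects one of the
// four 128-bit source halves, and bits 3/7 force that destination half to zero.
static unsigned buildVPerm2X128Imm(int WidenedLo, int WidenedHi,
                                   bool IsLowZero, bool IsHighZero) {
  assert((WidenedLo >= 0 || IsLowZero) && (WidenedHi >= 0 || IsHighZero));
  unsigned PermMask = 0;
  PermMask |= IsLowZero ? 0x08 : (unsigned)(WidenedLo << 0);
  PermMask |= IsHighZero ? 0x80 : (unsigned)(WidenedHi << 4);
  return PermMask;
}

int main() {
  // Swap the two halves of V1: widened mask {1, 0} -> imm 0x01.
  std::printf("0x%02x\n", buildVPerm2X128Imm(1, 0, false, false));
  // Low half of V1, low half of V2: widened mask {0, 2} -> imm 0x20.
  std::printf("0x%02x\n", buildVPerm2X128Imm(0, 2, false, false));
  // Zero the low half, take the high half of V2: imm 0x38 (0x08 | 3 << 4).
  std::printf("0x%02x\n", buildVPerm2X128Imm(-1, 3, true, false));
  return 0;
}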
15327
15328/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15329/// shuffling each lane.
15330///
15331/// This attempts to create a repeated lane shuffle where each lane uses one
15332/// or two of the lanes of the inputs. The lanes of the input vectors are
15333/// shuffled in one or two independent shuffles to get the lanes into the
15334/// position needed by the final shuffle.
15335static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15336 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15337 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15338 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15339
15340 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15341 return SDValue();
15342
15343 int NumElts = Mask.size();
15344 int NumLanes = VT.getSizeInBits() / 128;
15345 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15346 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15347 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15348
15349 // First pass will try to fill in the RepeatMask from lanes that need two
15350 // sources.
15351 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15352 int Srcs[2] = {-1, -1};
15353 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15354 for (int i = 0; i != NumLaneElts; ++i) {
15355 int M = Mask[(Lane * NumLaneElts) + i];
15356 if (M < 0)
15357 continue;
15358 // Determine which of the possible input lanes (NumLanes from each source)
15359 // this element comes from. Assign that as one of the sources for this
15360 // lane. We can assign up to 2 sources for this lane. If we run out of
15361 // sources we can't do anything.
15362 int LaneSrc = M / NumLaneElts;
15363 int Src;
15364 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15365 Src = 0;
15366 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15367 Src = 1;
15368 else
15369 return SDValue();
15370
15371 Srcs[Src] = LaneSrc;
15372 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15373 }
15374
15375 // If this lane has two sources, see if it fits with the repeat mask so far.
15376 if (Srcs[1] < 0)
15377 continue;
15378
15379 LaneSrcs[Lane][0] = Srcs[0];
15380 LaneSrcs[Lane][1] = Srcs[1];
15381
15382 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15383 assert(M1.size() == M2.size() && "Unexpected mask size");
15384 for (int i = 0, e = M1.size(); i != e; ++i)
15385 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15386 return false;
15387 return true;
15388 };
15389
15390 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15391 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15392 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15393 int M = Mask[i];
15394 if (M < 0)
15395 continue;
15396 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15397 "Unexpected mask element");
15398 MergedMask[i] = M;
15399 }
15400 };
15401
15402 if (MatchMasks(InLaneMask, RepeatMask)) {
15403 // Merge this lane mask into the final repeat mask.
15404 MergeMasks(InLaneMask, RepeatMask);
15405 continue;
15406 }
15407
15408 // Didn't find a match. Swap the operands and try again.
15409 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15410 ShuffleVectorSDNode::commuteMask(InLaneMask);
15411
15412 if (MatchMasks(InLaneMask, RepeatMask)) {
15413 // Merge this lane mask into the final repeat mask.
15414 MergeMasks(InLaneMask, RepeatMask);
15415 continue;
15416 }
15417
15418 // Couldn't find a match with the operands in either order.
15419 return SDValue();
15420 }
15421
15422 // Now handle any lanes with only one source.
15423 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15424 // If this lane has already been processed, skip it.
15425 if (LaneSrcs[Lane][0] >= 0)
15426 continue;
15427
15428 for (int i = 0; i != NumLaneElts; ++i) {
15429 int M = Mask[(Lane * NumLaneElts) + i];
15430 if (M < 0)
15431 continue;
15432
15433 // If RepeatMask isn't defined yet we can define it ourselves.
15434 if (RepeatMask[i] < 0)
15435 RepeatMask[i] = M % NumLaneElts;
15436
15437 if (RepeatMask[i] < NumElts) {
15438 if (RepeatMask[i] != M % NumLaneElts)
15439 return SDValue();
15440 LaneSrcs[Lane][0] = M / NumLaneElts;
15441 } else {
15442 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15443 return SDValue();
15444 LaneSrcs[Lane][1] = M / NumLaneElts;
15445 }
15446 }
15447
15448 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15449 return SDValue();
15450 }
15451
15452 SmallVector<int, 16> NewMask(NumElts, -1);
15453 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15454 int Src = LaneSrcs[Lane][0];
15455 for (int i = 0; i != NumLaneElts; ++i) {
15456 int M = -1;
15457 if (Src >= 0)
15458 M = Src * NumLaneElts + i;
15459 NewMask[Lane * NumLaneElts + i] = M;
15460 }
15461 }
15462 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15463 // Ensure we didn't get back the shuffle we started with.
15464 // FIXME: This is a hack to make up for some splat handling code in
15465 // getVectorShuffle.
15466 if (isa<ShuffleVectorSDNode>(NewV1) &&
15467 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15468 return SDValue();
15469
15470 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15471 int Src = LaneSrcs[Lane][1];
15472 for (int i = 0; i != NumLaneElts; ++i) {
15473 int M = -1;
15474 if (Src >= 0)
15475 M = Src * NumLaneElts + i;
15476 NewMask[Lane * NumLaneElts + i] = M;
15477 }
15478 }
15479 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15480 // Ensure we didn't get back the shuffle we started with.
15481 // FIXME: This is a hack to make up for some splat handling code in
15482 // getVectorShuffle.
15483 if (isa<ShuffleVectorSDNode>(NewV2) &&
15484 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15485 return SDValue();
15486
15487 for (int i = 0; i != NumElts; ++i) {
15488 NewMask[i] = RepeatMask[i % NumLaneElts];
15489 if (NewMask[i] < 0)
15490 continue;
15491
15492 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15493 }
15494 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15495}
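To see what this routine produces, here is a worked example with a hand-picked v8f32 mask (the mask and the check are illustrative, not taken from the LLVM tests): the shuffle <4,12,5,13, 0,8,1,9> is decomposed into two lane-moving shuffles of (V1, V2) plus one shuffle that applies the repeated in-lane pattern <0,8,1,9> to both lanes, and the small program verifies that composing the three masks reproduces the original request.

#include <cassert>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Orig  = {4, 12, 5, 13, 0, 8, 1, 9};     // 0..7 = V1, 8..15 = V2
  std::vector<int> M1    = {4, 5, 6, 7, 0, 1, 2, 3};       // NewV1 = shuffle(V1, V2, M1)
  std::vector<int> M2    = {12, 13, 14, 15, 8, 9, 10, 11}; // NewV2 = shuffle(V1, V2, M2)
  std::vector<int> Final = {0, 8, 1, 9, 4, 12, 5, 13};     // shuffle(NewV1, NewV2, Final)

  for (int i = 0; i != 8; ++i) {
    // Final indexes 0..7 read NewV1 and 8..15 read NewV2; chase each element
    // back to the original operands and check the requested shuffle survives.
    int Composed = Final[i] < 8 ? M1[Final[i]] : M2[Final[i] - 8];
    assert(Composed == Orig[i]);
  }
  std::puts("decomposition reproduces the original mask");
  return 0;
}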
15496
15497/// If the input shuffle mask results in a vector that is undefined in all upper
15498/// or lower half elements and that mask accesses only 2 halves of the
15499/// shuffle's operands, return true. A mask of half the width with mask indexes
15500/// adjusted to access the extracted halves of the original shuffle operands is
15501/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15502/// lower half of each input operand is accessed.
15503static bool
15504getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15505 int &HalfIdx1, int &HalfIdx2) {
15506 assert((Mask.size() == HalfMask.size() * 2) &&
15507 "Expected input mask to be twice as long as output");
15508
15509 // Exactly one half of the result must be undef to allow narrowing.
15510 bool UndefLower = isUndefLowerHalf(Mask);
15511 bool UndefUpper = isUndefUpperHalf(Mask);
15512 if (UndefLower == UndefUpper)
15513 return false;
15514
15515 unsigned HalfNumElts = HalfMask.size();
15516 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15517 HalfIdx1 = -1;
15518 HalfIdx2 = -1;
15519 for (unsigned i = 0; i != HalfNumElts; ++i) {
15520 int M = Mask[i + MaskIndexOffset];
15521 if (M < 0) {
15522 HalfMask[i] = M;
15523 continue;
15524 }
15525
15526 // Determine which of the 4 half vectors this element is from.
15527 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15528 int HalfIdx = M / HalfNumElts;
15529
15530 // Determine the element index into its half vector source.
15531 int HalfElt = M % HalfNumElts;
15532
15533 // We can shuffle with up to 2 half vectors, set the new 'half'
15534 // shuffle mask accordingly.
15535 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15536 HalfMask[i] = HalfElt;
15537 HalfIdx1 = HalfIdx;
15538 continue;
15539 }
15540 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15541 HalfMask[i] = HalfElt + HalfNumElts;
15542 HalfIdx2 = HalfIdx;
15543 continue;
15544 }
15545
15546 // Too many half vectors referenced.
15547 return false;
15548 }
15549
15550 return true;
15551}
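A quick illustration of the outputs for a concrete mask may help. The sketch below is a simplified standalone mirror of the loop above (not part of the LLVM sources, and it omits the bail-out for a third referenced half, which the chosen example never hits); the 8-element mask has an undef lower half and reads only the upper half of V2.

#include <array>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Mask = {-1, -1, -1, -1, 12, 13, 14, 15};
  const int HalfNumElts = 4;
  const int Offset = HalfNumElts;  // lower half is undef, so read the upper half
  std::array<int, 4> HalfMask;
  int HalfIdx1 = -1, HalfIdx2 = -1;
  for (int i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i + Offset];
    if (M < 0) { HalfMask[i] = M; continue; }
    int HalfIdx = M / HalfNumElts;  // 0 = lo V1, 1 = hi V1, 2 = lo V2, 3 = hi V2
    int HalfElt = M % HalfNumElts;
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
    } else {                        // simplified: no "too many halves" check
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
    }
  }
  // Prints: HalfIdx1=3 HalfIdx2=-1 HalfMask=0,1,2,3 -- i.e. the result is just
  // the upper half of V2, which the caller turns into an extract + insert.
  std::printf("HalfIdx1=%d HalfIdx2=%d HalfMask=%d,%d,%d,%d\n", HalfIdx1,
              HalfIdx2, HalfMask[0], HalfMask[1], HalfMask[2], HalfMask[3]);
  return 0;
}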
15552
15553/// Given the output values from getHalfShuffleMask(), create a half width
15554/// shuffle of extracted vectors followed by an insert back to full width.
15555static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15556 ArrayRef<int> HalfMask, int HalfIdx1,
15557 int HalfIdx2, bool UndefLower,
15558 SelectionDAG &DAG, bool UseConcat = false) {
15559 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15560 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15561
15562 MVT VT = V1.getSimpleValueType();
15563 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15564 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15565
15566 auto getHalfVector = [&](int HalfIdx) {
15567 if (HalfIdx < 0)
15568 return DAG.getUNDEF(HalfVT);
15569 SDValue V = (HalfIdx < 2 ? V1 : V2);
15570 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15571 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15572 DAG.getIntPtrConstant(HalfIdx, DL));
15573 };
15574
15575 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15576 SDValue Half1 = getHalfVector(HalfIdx1);
15577 SDValue Half2 = getHalfVector(HalfIdx2);
15578 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15579 if (UseConcat) {
15580 SDValue Op0 = V;
15581 SDValue Op1 = DAG.getUNDEF(HalfVT);
15582 if (UndefLower)
15583 std::swap(Op0, Op1);
15584 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15585 }
15586
15587 unsigned Offset = UndefLower ? HalfNumElts : 0;
15588 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15589 DAG.getIntPtrConstant(Offset, DL));
15590}
15591
15592/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15593/// This allows for fast cases such as subvector extraction/insertion
15594/// or shuffling smaller vector types which can lower more efficiently.
15595static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15596 SDValue V2, ArrayRef<int> Mask,
15597 const X86Subtarget &Subtarget,
15598 SelectionDAG &DAG) {
15599 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15600 "Expected 256-bit or 512-bit vector");
15601
15602 bool UndefLower = isUndefLowerHalf(Mask);
15603 if (!UndefLower && !isUndefUpperHalf(Mask))
15604 return SDValue();
15605
15606 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15607 "Completely undef shuffle mask should have been simplified already");
15608
15609 // Upper half is undef and lower half is whole upper subvector.
15610 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15611 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15612 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15613 if (!UndefLower &&
15614 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15615 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15616 DAG.getIntPtrConstant(HalfNumElts, DL));
15617 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15618 DAG.getIntPtrConstant(0, DL));
15619 }
15620
15621 // Lower half is undef and upper half is whole lower subvector.
15622 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15623 if (UndefLower &&
15624 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15625 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15626 DAG.getIntPtrConstant(0, DL));
15627 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15628 DAG.getIntPtrConstant(HalfNumElts, DL));
15629 }
15630
15631 int HalfIdx1, HalfIdx2;
15632 SmallVector<int, 8> HalfMask(HalfNumElts);
15633 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15634 return SDValue();
15635
15636 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15637
15638 // Only shuffle the halves of the inputs when useful.
15639 unsigned NumLowerHalves =
15640 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15641 unsigned NumUpperHalves =
15642 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15643 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15644
15645 // Determine the larger pattern of undef/halves, then decide if it's worth
15646 // splitting the shuffle based on subtarget capabilities and types.
15647 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15648 if (!UndefLower) {
15649 // XXXXuuuu: no insert is needed.
15650 // Always extract lowers when setting lower - these are all free subreg ops.
15651 if (NumUpperHalves == 0)
15652 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15653 UndefLower, DAG);
15654
15655 if (NumUpperHalves == 1) {
15656 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15657 if (Subtarget.hasAVX2()) {
15658 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
15659 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15660 !is128BitUnpackShuffleMask(HalfMask) &&
15661 (!isSingleSHUFPSMask(HalfMask) ||
15662 Subtarget.hasFastVariableShuffle()))
15663 return SDValue();
15664 // If this is a unary shuffle (assume that the 2nd operand is
15665 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15666 // are better off extracting the upper half of 1 operand and using a
15667 // narrow shuffle.
15668 if (EltWidth == 64 && V2.isUndef())
15669 return SDValue();
15670 }
15671 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15672 if (Subtarget.hasAVX512() && VT.is512BitVector())
15673 return SDValue();
15674 // Extract + narrow shuffle is better than the wide alternative.
15675 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15676 UndefLower, DAG);
15677 }
15678
15679 // Don't extract both uppers; instead shuffle and then extract.
15680 assert(NumUpperHalves == 2 && "Half vector count went wrong");
15681 return SDValue();
15682 }
15683
15684 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
15685 if (NumUpperHalves == 0) {
15686 // AVX2 has efficient 64-bit element cross-lane shuffles.
15687 // TODO: Refine to account for unary shuffle, splat, and other masks?
15688 if (Subtarget.hasAVX2() && EltWidth == 64)
15689 return SDValue();
15690 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15691 if (Subtarget.hasAVX512() && VT.is512BitVector())
15692 return SDValue();
15693 // Narrow shuffle + insert is better than the wide alternative.
15694 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15695 UndefLower, DAG);
15696 }
15697
15698 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
15699 return SDValue();
15700}
15701
15702/// Test whether the specified input (0 or 1) is in-place blended by the
15703/// given mask.
15704///
15705/// This returns true if the elements from a particular input are already in the
15706/// slot required by the given mask and require no permutation.
15707static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
15708 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
15709 int Size = Mask.size();
15710 for (int i = 0; i < Size; ++i)
15711 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
15712 return false;
15713
15714 return true;
15715}
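As a small usage illustration (standalone, not from the LLVM sources): for the v4 mask <0, 7, 2, 5>, input 0 is already in place, so the callers below only need to permute input 1 before blending.

#include <cstdio>
#include <vector>

// Standalone mirror of isShuffleMaskInputInPlace above: input 0 covers mask
// values [0, Size), input 1 covers [Size, 2*Size).
static bool inputInPlace(int Input, const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;
  return true;
}

int main() {
  // V1's elements (0 and 2) are already in their slots; V2's element 7 would
  // have to sit at index 3, not 1, so only V2 needs a permute before the blend.
  std::vector<int> Mask = {0, 7, 2, 5};
  std::printf("V1 in place: %d, V2 in place: %d\n",
              (int)inputInPlace(0, Mask), (int)inputInPlace(1, Mask));
  return 0;
}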
15716
15717/// Handle case where shuffle sources are coming from the same 128-bit lane and
15718/// every lane can be represented as the same repeating mask - allowing us to
15719/// shuffle the sources with the repeating shuffle and then permute the result
15720/// to the destination lanes.
15721static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
15722 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15723 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15724 int NumElts = VT.getVectorNumElements();
15725 int NumLanes = VT.getSizeInBits() / 128;
15726 int NumLaneElts = NumElts / NumLanes;
15727
15728 // On AVX2 we may be able to just shuffle the lowest elements and then
15729 // broadcast the result.
15730 if (Subtarget.hasAVX2()) {
15731 for (unsigned BroadcastSize : {16, 32, 64}) {
15732 if (BroadcastSize <= VT.getScalarSizeInBits())
15733 continue;
15734 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
15735
15736 // Attempt to match a repeating pattern every NumBroadcastElts,
15737 // accounting for UNDEFs, but only referencing the lowest 128-bit
15738 // lane of the inputs.
15739 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
15740 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15741 for (int j = 0; j != NumBroadcastElts; ++j) {
15742 int M = Mask[i + j];
15743 if (M < 0)
15744 continue;
15745 int &R = RepeatMask[j];
15746 if (0 != ((M % NumElts) / NumLaneElts))
15747 return false;
15748 if (0 <= R && R != M)
15749 return false;
15750 R = M;
15751 }
15752 return true;
15753 };
15754
15755 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
15756 if (!FindRepeatingBroadcastMask(RepeatMask))
15757 continue;
15758
15759 // Shuffle the (lowest) repeated elements in place for broadcast.
15760 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
15761
15762 // Shuffle the actual broadcast.
15763 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
15764 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15765 for (int j = 0; j != NumBroadcastElts; ++j)
15766 BroadcastMask[i + j] = j;
15767 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
15768 BroadcastMask);
15769 }
15770 }
15771
15772 // Bail if the shuffle mask doesn't cross 128-bit lanes.
15773 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
15774 return SDValue();
15775
15776 // Bail if we already have a repeated lane shuffle mask.
15777 SmallVector<int, 8> RepeatedShuffleMask;
15778 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
15779 return SDValue();
15780
15781 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
15782 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
15783 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
15784 int NumSubLanes = NumLanes * SubLaneScale;
15785 int NumSubLaneElts = NumLaneElts / SubLaneScale;
15786
15787 // Check that all the sources are coming from the same lane and see if we can
15788 // form a repeating shuffle mask (local to each sub-lane). At the same time,
15789 // determine the source sub-lane for each destination sub-lane.
15790 int TopSrcSubLane = -1;
15791 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
15792 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
15793 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
15794 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
15795
15796 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
15797 // Extract the sub-lane mask, check that it all comes from the same lane
15798 // and normalize the mask entries to come from the first lane.
15799 int SrcLane = -1;
15800 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
15801 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15802 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
15803 if (M < 0)
15804 continue;
15805 int Lane = (M % NumElts) / NumLaneElts;
15806 if ((0 <= SrcLane) && (SrcLane != Lane))
15807 return SDValue();
15808 SrcLane = Lane;
15809 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
15810 SubLaneMask[Elt] = LocalM;
15811 }
15812
15813 // Whole sub-lane is UNDEF.
15814 if (SrcLane < 0)
15815 continue;
15816
15817 // Attempt to match against the candidate repeated sub-lane masks.
15818 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
15819 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
15820 for (int i = 0; i != NumSubLaneElts; ++i) {
15821 if (M1[i] < 0 || M2[i] < 0)
15822 continue;
15823 if (M1[i] != M2[i])
15824 return false;
15825 }
15826 return true;
15827 };
15828
15829 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
15830 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
15831 continue;
15832
15833 // Merge the sub-lane mask into the matching repeated sub-lane mask.
15834 for (int i = 0; i != NumSubLaneElts; ++i) {
15835 int M = SubLaneMask[i];
15836 if (M < 0)
15837 continue;
15838 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
15839 "Unexpected mask element");
15840 RepeatedSubLaneMask[i] = M;
15841 }
15842
15843 // Track the top most source sub-lane - by setting the remaining to UNDEF
15844 // we can greatly simplify shuffle matching.
15845 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
15846 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
15847 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
15848 break;
15849 }
15850
15851 // Bail if we failed to find a matching repeated sub-lane mask.
15852 if (Dst2SrcSubLanes[DstSubLane] < 0)
15853 return SDValue();
15854 }
15855 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
15856 "Unexpected source lane");
15857
15858 // Create a repeating shuffle mask for the entire vector.
15859 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
15860 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
15861 int Lane = SubLane / SubLaneScale;
15862 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
15863 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15864 int M = RepeatedSubLaneMask[Elt];
15865 if (M < 0)
15866 continue;
15867 int Idx = (SubLane * NumSubLaneElts) + Elt;
15868 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
15869 }
15870 }
15871 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
15872
15873 // Shuffle each source sub-lane to its destination.
15874 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
15875 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
15876 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
15877 if (SrcSubLane < 0)
15878 continue;
15879 for (int j = 0; j != NumSubLaneElts; ++j)
15880 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
15881 }
15882
15883 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
15884 SubLaneMask);
15885}
15886
15887static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
15888 bool &ForceV1Zero, bool &ForceV2Zero,
15889 unsigned &ShuffleImm, ArrayRef<int> Mask,
15890 const APInt &Zeroable) {
15891 int NumElts = VT.getVectorNumElements();
15892 assert(VT.getScalarSizeInBits() == 64 &&
15893 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
15894 "Unexpected data type for VSHUFPD");
15895 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
15896 "Illegal shuffle mask");
15897
15898 bool ZeroLane[2] = { true, true };
15899 for (int i = 0; i < NumElts; ++i)
15900 ZeroLane[i & 1] &= Zeroable[i];
15901
15902 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
15903 // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
15904 ShuffleImm = 0;
15905 bool ShufpdMask = true;
15906 bool CommutableMask = true;
15907 for (int i = 0; i < NumElts; ++i) {
15908 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
15909 continue;
15910 if (Mask[i] < 0)
15911 return false;
15912 int Val = (i & 6) + NumElts * (i & 1);
15913 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
15914 if (Mask[i] < Val || Mask[i] > Val + 1)
15915 ShufpdMask = false;
15916 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
15917 CommutableMask = false;
15918 ShuffleImm |= (Mask[i] % 2) << i;
15919 }
15920
15921 if (!ShufpdMask && !CommutableMask)
15922 return false;
15923
15924 if (!ShufpdMask && CommutableMask)
15925 std::swap(V1, V2);
15926
15927 ForceV1Zero = ZeroLane[0];
15928 ForceV2Zero = ZeroLane[1];
15929 return true;
15930}
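The immediate computed by matchShuffleWithSHUFPD can also be worked through by hand. The sketch below (standalone, without the zeroable and commute handling of the real routine) recomputes it for the v4f64 mask <1, 4, 3, 6>, the classic SHUFPD pattern where destination element i may only take one of the two elements of its source pair.

#include <cstdio>
#include <vector>

// Bit i of the immediate picks the odd (1) or even (0) element of the source
// pair feeding destination element i, exactly as in the loop above.
int main() {
  const int NumElts = 4;                   // v4f64
  std::vector<int> Mask = {1, 4, 3, 6};    // <V1[1], V2[0], V1[3], V2[2]>
  unsigned ShuffleImm = 0;
  for (int i = 0; i < NumElts; ++i) {
    int Val = (i & 6) + NumElts * (i & 1); // legal range is [Val, Val + 1]
    if (Mask[i] < Val || Mask[i] > Val + 1)
      return 1;                            // not a SHUFPD pattern
    ShuffleImm |= (unsigned)(Mask[i] % 2) << i;
  }
  std::printf("vshufpd immediate: 0x%x\n", ShuffleImm);  // prints 0x5
  return 0;
}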
15931
15932static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
15933 SDValue V2, ArrayRef<int> Mask,
15934 const APInt &Zeroable,
15935 const X86Subtarget &Subtarget,
15936 SelectionDAG &DAG) {
15937 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
15938 "Unexpected data type for VSHUFPD");
15939
15940 unsigned Immediate = 0;
15941 bool ForceV1Zero = false, ForceV2Zero = false;
15942 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
15943 Mask, Zeroable))
15944 return SDValue();
15945
15946 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
15947 if (ForceV1Zero)
15948 V1 = getZeroVector(VT, Subtarget, DAG, DL);
15949 if (ForceV2Zero)
15950 V2 = getZeroVector(VT, Subtarget, DAG, DL);
15951
15952 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15953 DAG.getTargetConstant(Immediate, DL, MVT::i8));
15954}
15955
15956 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
15957 // by zeroable elements in the remaining 24 elements. Turn this into two
15958// vmovqb instructions shuffled together.
15959static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
15960 SDValue V1, SDValue V2,
15961 ArrayRef<int> Mask,
15962 const APInt &Zeroable,
15963 SelectionDAG &DAG) {
15964 assert(VT == MVT::v32i8 && "Unexpected type!");
15965
15966 // The first 8 indices should be every 8th element.
15967 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
15968 return SDValue();
15969
15970 // Remaining elements need to be zeroable.
15971 if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
15972 return SDValue();
15973
15974 V1 = DAG.getBitcast(MVT::v4i64, V1);
15975 V2 = DAG.getBitcast(MVT::v4i64, V2);
15976
15977 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
15978 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
15979
15980 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
15981 // the upper bits of the result using an unpckldq.
15982 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
15983 { 0, 1, 2, 3, 16, 17, 18, 19,
15984 4, 5, 6, 7, 20, 21, 22, 23 });
15985 // Insert the unpckldq into a zero vector to widen to v32i8.
15986 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
15987 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
15988 DAG.getIntPtrConstant(0, DL));
15989}
15990
15991
15992/// Handle lowering of 4-lane 64-bit floating point shuffles.
15993///
15994/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
15995/// isn't available.
15996static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15997 const APInt &Zeroable, SDValue V1, SDValue V2,
15998 const X86Subtarget &Subtarget,
15999 SelectionDAG &DAG) {
16000 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16001 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16002 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16003
16004 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16005 Subtarget, DAG))
16006 return V;
16007
16008 if (V2.isUndef()) {
16009 // Check for being able to broadcast a single element.
16010 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16011 Mask, Subtarget, DAG))
16012 return Broadcast;
16013
16014 // Use low duplicate instructions for masks that match their pattern.
16015 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
16016 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16017
16018 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16019 // Non-half-crossing single input shuffles can be lowered with an
16020 // interleaved permutation.
16021 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16022 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16023 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16024 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16025 }
16026
16027 // With AVX2 we have direct support for this permutation.
16028 if (Subtarget.hasAVX2())
16029 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16030 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16031
16032 // Try to create an in-lane repeating shuffle mask and then shuffle the
16033 // results into the target lanes.
16034 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16035 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16036 return V;
16037
16038 // Try to permute the lanes and then use a per-lane permute.
16039 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16040 Mask, DAG, Subtarget))
16041 return V;
16042
16043 // Otherwise, fall back.
16044 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16045 DAG, Subtarget);
16046 }
16047
16048 // Use dedicated unpack instructions for masks that match their pattern.
16049 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
16050 return V;
16051
16052 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16053 Zeroable, Subtarget, DAG))
16054 return Blend;
16055
16056 // Check if the blend happens to exactly fit that of SHUFPD.
16057 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16058 Zeroable, Subtarget, DAG))
16059 return Op;
16060
16061 // If we have lane crossing shuffles AND they don't all come from the lower
16062 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16063 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16064 // canonicalizes to a blend of splats, which isn't necessary for this combine.
16065 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16066 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16067 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16068 (V2.getOpcode() != ISD::BUILD_VECTOR))
16069 if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
16070 Mask, DAG))
16071 return Op;
16072
16073 // If we have one input in place, then we can permute the other input and
16074 // blend the result.
16075 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
16076 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
16077 Subtarget, DAG);
16078
16079 // Try to create an in-lane repeating shuffle mask and then shuffle the
16080 // results into the target lanes.
16081 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16082 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16083 return V;
16084
16085 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16086 // shuffle. However, if we have AVX2 and either input is already in place,
16087 // we can shuffle the other input even across lanes in a single
16088 // instruction, so skip this pattern.
16089 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
16090 isShuffleMaskInputInPlace(1, Mask))))
16091 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16092 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16093 return V;
16094
16095 // If we have VLX support, we can use VEXPAND.
16096 if (Subtarget.hasVLX())
16097 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
16098 DAG, Subtarget))
16099 return V;
16100
16101 // If we have AVX2 then we always want to lower with a blend because at v4 we
16102 // can fully permute the elements.
16103 if (Subtarget.hasAVX2())
16104 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
16105 Subtarget, DAG);
16106
16107 // Otherwise fall back on generic lowering.
16108 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
16109 Subtarget, DAG);
16110}
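For the single-input, non-lane-crossing branch of lowerV4F64Shuffle above, the VPERMILPD immediate is just one comparison per element. A standalone illustration (not part of the LLVM sources) for the in-lane swap mask <1, 0, 3, 2>:

#include <cstdio>
#include <vector>

// Bit i of the immediate picks the high (1) or low (0) element of the 128-bit
// lane feeding destination element i, mirroring the VPERMILPMask expression.
int main() {
  std::vector<int> Mask = {1, 0, 3, 2};  // swap the elements inside each lane
  unsigned Imm = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
  std::printf("vpermilpd immediate: 0x%x\n", Imm);  // prints 0x5
  return 0;
}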
16111
16112/// Handle lowering of 4-lane 64-bit integer shuffles.
16113///
16114/// This routine is only called when we have AVX2 and thus a reasonable
16115 // instruction set for v4i64 shuffling.
16116static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16117 const APInt &Zeroable, SDValue V1, SDValue V2,
16118 const X86Subtarget &Subtarget,
16119 SelectionDAG &DAG) {
16120 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16121 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16122 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16123 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16124
16125 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16126 Subtarget, DAG))
16127 return V;
16128
16129 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16130 Zeroable, Subtarget, DAG))
16131 return Blend;
16132
16133 // Check for being able to broadcast a single element.
16134 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16135 Subtarget, DAG))
16136 return Broadcast;
16137
16138 if (V2.isUndef()) {
16139 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16140 // can use lower latency instructions that will operate on both lanes.
16141 SmallVector<int, 2> RepeatedMask;
16142 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16143 SmallVector<int, 4> PSHUFDMask;
16144 scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
16145 return DAG.getBitcast(
16146 MVT::v4i64,
16147 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16148 DAG.getBitcast(MVT::v8i32, V1),
16149 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16150 }
16151
16152 // AVX2 provides a direct instruction for permuting a single input across
16153 // lanes.
16154 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16155 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16156 }
16157
16158 // Try to use shift instructions.
16159 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
16160 Zeroable, Subtarget, DAG))
16161 return Shift;
16162
16163 // If we have VLX support, we can use VALIGN or VEXPAND.
16164 if (Subtarget.hasVLX()) {
16165 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16166 Subtarget, DAG))
16167 return Rotate;
16168
16169 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
16170 DAG, Subtarget))
16171 return V;
16172 }
16173
16174 // Try to use PALIGNR.
16175 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16176 Subtarget, DAG))
16177 return Rotate;
16178
16179 // Use dedicated unpack instructions for masks that match their pattern.
16180 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
16181 return V;
16182
16183 // If we have one input in place, then we can permute the other input and
16184 // blend the result.
16185 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
16186 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
16187 Subtarget, DAG);
16188
16189 // Try to create an in-lane repeating shuffle mask and then shuffle the
16190 // results into the target lanes.
16191 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16192 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16193 return V;
16194
16195 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16196 // shuffle. However, if we have AVX2 and either input is already in place,
16197 // we can shuffle the other input even across lanes in a single
16198 // instruction, so skip this pattern.
16199 if (!isShuffleMaskInputInPlace(0, Mask) &&
16200 !isShuffleMaskInputInPlace(1, Mask))
16201 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16202 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16203 return Result;
16204
16205 // Otherwise fall back on generic blend lowering.
16206 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
16207 Subtarget, DAG);
16208}
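The lane-repeated v4i64 path above leans on PSHUFD by widening the repeated 2-element mask. Below is a standalone sketch (not part of the LLVM sources; it assumes getV4X86ShuffleImm8ForMask packs two bits per element, lowest element first) of that scaling and immediate packing.

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> RepeatedMask = {1, 0};  // swap the two i64s in each lane
  std::vector<int> PSHUFDMask;
  for (int M : RepeatedMask) {             // scaleShuffleMask with a factor of 2
    PSHUFDMask.push_back(2 * M);
    PSHUFDMask.push_back(2 * M + 1);
  }
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)             // assumed imm8 packing, 2 bits/element
    Imm |= (unsigned)PSHUFDMask[i] << (2 * i);
  std::printf("pshufd immediate: 0x%02x\n", Imm);  // {2,3,0,1} -> 0x4e
  return 0;
}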
16209
16210/// Handle lowering of 8-lane 32-bit floating point shuffles.
16211///
16212/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16213/// isn't available.
16214static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16215 const APInt &Zeroable, SDValue V1, SDValue V2,
16216 const X86Subtarget &Subtarget,
16217 SelectionDAG &DAG) {
16218 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16219 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16220 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16221
16222 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16223 Zeroable, Subtarget, DAG))
16224 return Blend;
16225
16226 // Check for being able to broadcast a single element.
16227 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16228 Subtarget, DAG))
16229 return Broadcast;
16230
16231 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16232 // options to efficiently lower the shuffle.
16233 SmallVector<int, 4> RepeatedMask;
16234 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16235 assert(RepeatedMask.size() == 4 &&
16236 "Repeated masks must be half the mask width!");
16237
16238 // Use even/odd duplicate instructions for masks that match their pattern.
16239 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
16240 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16241 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
16242 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16243
16244 if (V2.isUndef())
16245 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16246 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16247
16248 // Use dedicated unpack instructions for masks that match their pattern.
16249 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
16250 return V;
16251
16252 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16253 // have already handled any direct blends.
16254 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16255 }
16256
16257 // Try to create an in-lane repeating shuffle mask and then shuffle the
16258 // results into the target lanes.
16259 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16260 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16261 return V;
16262
16263 // If we have a single input shuffle with different shuffle patterns in the
16264 // two 128-bit lanes use the variable mask to VPERMILPS.
16265 if (V2.isUndef()) {
16266 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16267 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16268 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16269 }
16270 if (Subtarget.hasAVX2()) {
16271 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16272 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16273 }
16274 // Otherwise, fall back.
16275 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16276 DAG, Subtarget);
16277 }
16278
16279 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16280 // shuffle.
16281 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16282 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16283 return Result;
16284
16285 // If we have VLX support, we can use VEXPAND.
16286 if (Subtarget.hasVLX())
16287 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
16288 DAG, Subtarget))
16289 return V;
16290
16291 // For non-AVX512, if the Mask is of 16-bit elements in lane then try to split,
16292 // since after the split we get more efficient code using vpunpcklwd and
16293 // vpunpckhwd instructions than with vblend.
16294 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
16295 if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
16296 Subtarget, DAG))
16297 return V;
16298
16299 // If we have AVX2 then we always want to lower with a blend because at v8 we
16300 // can fully permute the elements.
16301 if (Subtarget.hasAVX2())
16302 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask,
16303 Subtarget, DAG);
16304
16305 // Otherwise fall back on generic lowering.
16306 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
16307 Subtarget, DAG);
16308}
16309
16310/// Handle lowering of 8-lane 32-bit integer shuffles.
16311///
16312/// This routine is only called when we have AVX2 and thus a reasonable
16313 // instruction set for v8i32 shuffling.
16314static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16315 const APInt &Zeroable, SDValue V1, SDValue V2,
16316 const X86Subtarget &Subtarget,
16317 SelectionDAG &DAG) {
16318 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16319 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16320 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16321 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16322
16323 // Whenever we can lower this as a zext, that instruction is strictly faster
16324 // than any alternative. It also allows us to fold memory operands into the
16325 // shuffle in many cases.
16326 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16327 Zeroable, Subtarget, DAG))
16328 return ZExt;
16329
16330 // For non-AVX512, if the Mask is of 16-bit elements in lane then try to split,
16331 // since after the split we get more efficient code than vblend by using
16332 // vpunpcklwd and vpunpckhwd instructions.
16333 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
16334 !Subtarget.hasAVX512())
16335 if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask,
16336 Subtarget, DAG))
16337 return V;
16338
16339 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16340 Zeroable, Subtarget, DAG))
16341 return Blend;
16342
16343 // Check for being able to broadcast a single element.
16344 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16345 Subtarget, DAG))
16346 return Broadcast;
16347
16348 // If the shuffle mask is repeated in each 128-bit lane we can use more
16349 // efficient instructions that mirror the shuffles across the two 128-bit
16350 // lanes.
16351 SmallVector<int, 4> RepeatedMask;
16352 bool Is128BitLaneRepeatedShuffle =
16353 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16354 if (Is128BitLaneRepeatedShuffle) {
16355 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16356 if (V2.isUndef())
16357 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16358 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16359
16360 // Use dedicated unpack instructions for masks that match their pattern.
16361 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
16362 return V;
16363 }
16364
16365 // Try to use shift instructions.
16366 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
16367 Zeroable, Subtarget, DAG))
16368 return Shift;
16369
16370 // If we have VLX support, we can use VALIGN or EXPAND.
16371 if (Subtarget.hasVLX()) {
16372 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16373 Subtarget, DAG))
16374 return Rotate;
16375
16376 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
16377 DAG, Subtarget))
16378 return V;
16379 }
16380
16381 // Try to use byte rotation instructions.
16382 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16383 Subtarget, DAG))
16384 return Rotate;
16385
16386 // Try to create an in-lane repeating shuffle mask and then shuffle the
16387 // results into the target lanes.
16388 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16389 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16390 return V;
16391
16392 if (V2.isUndef()) {
16393 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16394 // because that should be faster than the variable permute alternatives.
16395 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
16396 return V;
16397
16398 // If the shuffle patterns aren't repeated but it's a single input, directly
16399 // generate a cross-lane VPERMD instruction.
16400 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16401 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16402 }
16403
16404 // Assume that a single SHUFPS is faster than an alternative sequence of
16405 // multiple instructions (even if the CPU has a domain penalty).
16406 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16407 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16408 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16409 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16410 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16411 CastV1, CastV2, DAG);
16412 return DAG.getBitcast(MVT::v8i32, ShufPS);
16413 }
16414
16415 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16416 // shuffle.
16417 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16418 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16419 return Result;
16420
16421 // Otherwise fall back on generic blend lowering.
16422 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask,
16423 Subtarget, DAG);
16424}
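
// A minimal, self-contained sketch (plain C++, independent of LLVM's helpers)
// of the two checks the v8i32 path above leans on: whether the mask repeats
// per 128-bit lane, and how the repeated 4-element pattern folds into a
// PSHUFD-style imm8. Restricted to the single-input case for clarity; the
// function names below are invented for this illustration.
#include <array>
#include <cstdint>
#include <optional>

// If both 128-bit lanes of a single-input v8i32 mask use the same in-lane
// pattern (and no element crosses lanes), return that 4-element pattern.
// -1 means "undef" and matches anything.
static std::optional<std::array<int, 4>>
repeatedLaneMaskV8I32(const std::array<int, 8> &Mask) {
  std::array<int, 4> Repeated = {-1, -1, -1, -1};
  for (int i = 0; i < 8; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                 // Undef element: unconstrained.
    if (M / 4 != i / 4)
      return std::nullopt;      // Source crosses a 128-bit lane.
    int InLane = M % 4;         // Position of the source within its lane.
    int &Slot = Repeated[i % 4];
    if (Slot >= 0 && Slot != InLane)
      return std::nullopt;      // The two lanes disagree on the pattern.
    Slot = InLane;
  }
  return Repeated;
}

// Fold the repeated 4-element pattern into a PSHUFD/VPERMILPS-style imm8:
// two bits per destination element; undef slots default to 0 here.
static uint8_t shuffleImm8(const std::array<int, 4> &Repeated) {
  uint8_t Imm = 0;
  for (int i = 0; i < 4; ++i)
    Imm |= static_cast<uint8_t>(Repeated[i] < 0 ? 0 : Repeated[i]) << (2 * i);
  return Imm;
}
// e.g. the mask {3,2,1,0, 7,6,5,4} repeats as {3,2,1,0} -> imm8 0x1B.
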
16425
16426/// Handle lowering of 16-lane 16-bit integer shuffles.
16427///
16428/// This routine is only called when we have AVX2 and thus a reasonable
16429 /// instruction set for v16i16 shuffling.
16430static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16431 const APInt &Zeroable, SDValue V1, SDValue V2,
16432 const X86Subtarget &Subtarget,
16433 SelectionDAG &DAG) {
16434 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16435 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16436 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16437 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16438
16439 // Whenever we can lower this as a zext, that instruction is strictly faster
16440 // than any alternative. It also allows us to fold memory operands into the
16441 // shuffle in many cases.
16442 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16443 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16444 return ZExt;
16445
16446 // Check for being able to broadcast a single element.
16447 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16448 Subtarget, DAG))
16449 return Broadcast;
16450
16451 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16452 Zeroable, Subtarget, DAG))
16453 return Blend;
16454
16455 // Use dedicated unpack instructions for masks that match their pattern.
16456 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
16457 return V;
16458
16459 // Use dedicated pack instructions for masks that match their pattern.
16460 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
16461 Subtarget))
16462 return V;
16463
16464 // Try to use shift instructions.
16465 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
16466 Zeroable, Subtarget, DAG))
16467 return Shift;
16468
16469 // Try to use byte rotation instructions.
16470 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16471 Subtarget, DAG))
16472 return Rotate;
16473
16474 // Try to create an in-lane repeating shuffle mask and then shuffle the
16475 // results into the target lanes.
16476 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16477 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16478 return V;
16479
16480 if (V2.isUndef()) {
16481 // Try to use bit rotation instructions.
16482 if (SDValue Rotate =
16483 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16484 return Rotate;
16485
16486 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16487 // because that should be faster than the variable permute alternatives.
16488 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
16489 return V;
16490
16491 // There are no generalized cross-lane shuffle operations available on i16
16492 // element types.
16493 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
16494 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16495 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16496 return V;
16497
16498 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
16499 DAG, Subtarget);
16500 }
16501
16502 SmallVector<int, 8> RepeatedMask;
16503 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
16504 // As this is a single-input shuffle, the repeated mask should be
16505 // a strictly valid v8i16 mask that we can pass through to the v8i16
16506 // lowering to handle even the v16 case.
16507 return lowerV8I16GeneralSingleInputShuffle(
16508 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
16509 }
16510 }
16511
16512 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
16513 Zeroable, Subtarget, DAG))
16514 return PSHUFB;
16515
16516 // AVX512BWVL can lower to VPERMW.
16517 if (Subtarget.hasBWI() && Subtarget.hasVLX())
16518 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
16519
16520 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16521 // shuffle.
16522 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16523 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16524 return Result;
16525
16526 // Try to permute the lanes and then use a per-lane permute.
16527 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16528 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16529 return V;
16530
16531 // Otherwise fall back on generic lowering.
16532 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
16533 Subtarget, DAG);
16534}
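
// A hedged, standalone illustration (plain C++, not LLVM's actual helper) of
// the lane-crossing test that steers the v16i16 path above toward the
// lane-permute strategies: AVX2 has no general cross-lane word shuffle, so
// the mask must be checked for elements that leave their 128-bit lane. The
// function name is invented for this example.

// Returns true if any non-undef element of a v16i16 mask (values 0..31,
// -1 = undef) pulls its data from a different 128-bit lane than the one it
// lands in.
static bool crossesLanesV16I16(const int (&Mask)[16]) {
  constexpr int LaneSize = 8;           // 8 x i16 per 128-bit lane.
  for (int i = 0; i < 16; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int SrcLane = (M % 16) / LaneSize;  // Lane inside whichever input feeds it.
    int DstLane = i / LaneSize;
    if (SrcLane != DstLane)
      return true;
  }
  return false;
}
// e.g. {8,9,10,11,12,13,14,15, 0,1,2,3,4,5,6,7} swaps the two halves and
// therefore crosses lanes; an in-lane reverse does not.
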
16535
16536/// Handle lowering of 32-lane 8-bit integer shuffles.
16537///
16538/// This routine is only called when we have AVX2 and thus a reasonable
16539 /// instruction set for v32i8 shuffling.
16540static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16541 const APInt &Zeroable, SDValue V1, SDValue V2,
16542 const X86Subtarget &Subtarget,
16543 SelectionDAG &DAG) {
16544 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16545 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16546 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16547 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
16548
16549 // Whenever we can lower this as a zext, that instruction is strictly faster
16550 // than any alternative. It also allows us to fold memory operands into the
16551 // shuffle in many cases.
16552 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
16553 Zeroable, Subtarget, DAG))
16554 return ZExt;
16555
16556 // Check for being able to broadcast a single element.
16557 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
16558 Subtarget, DAG))
16559 return Broadcast;
16560
16561 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
16562 Zeroable, Subtarget, DAG))
16563 return Blend;
16564
16565 // Use dedicated unpack instructions for masks that match their pattern.
16566 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
16567 return V;
16568
16569 // Use dedicated pack instructions for masks that match their pattern.
16570 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
16571 Subtarget))
16572 return V;
16573
16574 // Try to use shift instructions.
16575 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
16576 Zeroable, Subtarget, DAG))
16577 return Shift;
16578
16579 // Try to use byte rotation instructions.
16580 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
16581 Subtarget, DAG))
16582 return Rotate;
16583
16584 // Try to use bit rotation instructions.
16585 if (V2.isUndef())
16586 if (SDValue Rotate =
16587 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
16588 return Rotate;
16589
16590 // Try to create an in-lane repeating shuffle mask and then shuffle the
16591 // results into the target lanes.
16592 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16593 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16594 return V;
16595
16596 // There are no generalized cross-lane shuffle operations available on i8
16597 // element types.
16598 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
16599 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16600 // because that should be faster than the variable permute alternatives.
16601 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
16602 return V;
16603
16604 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16605 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16606 return V;
16607
16608 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
16609 DAG, Subtarget);
16610 }
16611
16612 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
16613 Zeroable, Subtarget, DAG))
16614 return PSHUFB;
16615
16616 // AVX512VBMIVL can lower to VPERMB.
16617 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
16618 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
16619
16620 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16621 // shuffle.
16622 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16623 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16624 return Result;
16625
16626 // Try to permute the lanes and then use a per-lane permute.
16627 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16628 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16629 return V;
16630
16631 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16632 // by zeroable elements in the remaining 24 elements. Turn this into two
16633 // vmovqb instructions shuffled together.
16634 if (Subtarget.hasVLX())
16635 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
16636 Mask, Zeroable, DAG))
16637 return V;
16638
16639 // Otherwise fall back on generic lowering.
16640 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
16641 Subtarget, DAG);
16642}
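
// A standalone restatement (plain C++; the helper name and the plain-bitmask
// model of Zeroable are assumptions made for this sketch) of the mask shape
// the VTRUNC special case above looks for: the first 8 bytes gather every
// 8th byte of the two concatenated inputs, and the remaining 24 bytes must
// all be zeroable.
#include <cstdint>

static bool matchesVTruncPattern(const int (&Mask)[32], uint32_t ZeroableBits) {
  // First 8 elements must be {0, 8, 16, 24, 32, 40, 48, 56} (undef allowed).
  for (int i = 0; i < 8; ++i)
    if (Mask[i] >= 0 && Mask[i] != i * 8)
      return false;
  // The remaining 24 elements must all be known zero.
  for (int i = 8; i < 32; ++i)
    if (!((ZeroableBits >> i) & 1u))
      return false;
  return true;
}
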
16643
16644/// High-level routine to lower various 256-bit x86 vector shuffles.
16645///
16646/// This routine either breaks down the specific type of a 256-bit x86 vector
16647/// shuffle or splits it into two 128-bit shuffles and fuses the results back
16648/// together based on the available instructions.
16649static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
16650 SDValue V1, SDValue V2, const APInt &Zeroable,
16651 const X86Subtarget &Subtarget,
16652 SelectionDAG &DAG) {
16653 // If we have a single input to the zero element, insert that into V1 if we
16654 // can do so cheaply.
16655 int NumElts = VT.getVectorNumElements();
16656 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16657
16658 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16659 if (SDValue Insertion = lowerShuffleAsElementInsertion(
16660 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16661 return Insertion;
16662
16663 // Handle special cases where the lower or upper half is UNDEF.
16664 if (SDValue V =
16665 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16666 return V;
16667
16668 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16669 // can check for those subtargets here and avoid much of the subtarget
16670 // querying in the per-vector-type lowering routines. With AVX1 we have
16671 // essentially *zero* ability to manipulate a 256-bit vector with integer
16672 // types. Since we'll use floating point types there eventually, just
16673 // immediately cast everything to a float and operate entirely in that domain.
16674 if (VT.isInteger() && !Subtarget.hasAVX2()) {
16675 int ElementBits = VT.getScalarSizeInBits();
16676 if (ElementBits < 32) {
16677 // No floating point type available, if we can't use the bit operations
16678 // for masking/blending then decompose into 128-bit vectors.
16679 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
16680 Subtarget, DAG))
16681 return V;
16682 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
16683 return V;
16684 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16685 }
16686
16687 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
16688 VT.getVectorNumElements());
16689 V1 = DAG.getBitcast(FpVT, V1);
16690 V2 = DAG.getBitcast(FpVT, V2);
16691 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
16692 }
16693
16694 switch (VT.SimpleTy) {
16695 case MVT::v4f64:
16696 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16697 case MVT::v4i64:
16698 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16699 case MVT::v8f32:
16700 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16701 case MVT::v8i32:
16702 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16703 case MVT::v16i16:
16704 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16705 case MVT::v32i8:
16706 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16707
16708 default:
16709 llvm_unreachable("Not a valid 256-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 256-bit x86 vector type!"
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16709)
;
16710 }
16711}
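
// The AVX1 integer special case above reduces to a small decision, restated
// here as a hedged sketch (the enum and function are invented for the
// illustration): sub-32-bit integer elements have no 256-bit floating-point
// twin, so such shuffles are split into two 128-bit halves unless a bit
// mask/blend applies, while 32/64-bit integer shuffles are simply recast
// into the v8f32/v4f64 floating-point domain.
enum class Avx1IntStrategy {
  SplitInto128BitHalves,  // i8/i16 elements: no FP analogue at this width.
  BitcastToFloatDomain    // i32/i64 elements: shuffle as v8f32/v4f64.
};

static Avx1IntStrategy chooseAvx1IntegerStrategy(int ElementBits) {
  return ElementBits < 32 ? Avx1IntStrategy::SplitInto128BitHalves
                          : Avx1IntStrategy::BitcastToFloatDomain;
}
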
16712
16713 /// Try to lower a vector shuffle as a series of 128-bit shuffles.
16714static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
16715 const APInt &Zeroable, SDValue V1, SDValue V2,
16716 const X86Subtarget &Subtarget,
16717 SelectionDAG &DAG) {
16718 assert(VT.getScalarSizeInBits() == 64 &&
16719 "Unexpected element type size for 128bit shuffle.");
16720
16721 // Handling a 256-bit vector requires VLX, and the function
16722 // lowerV2X128VectorShuffle() is most probably the better solution for that.
16723 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
16724
16725 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
16726 SmallVector<int, 4> WidenedMask;
16727 if (!canWidenShuffleElements(Mask, WidenedMask))
16728 return SDValue();
16729
16730 // Try to use an insert into a zero vector.
16731 if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
16732 (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
16733 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
16734 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16735 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16736 DAG.getIntPtrConstant(0, DL));
16737 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16738 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16739 DAG.getIntPtrConstant(0, DL));
16740 }
16741
16742 // Check for patterns which can be matched with a single insert of a 256-bit
16743 // subvector.
16744 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
16745 {0, 1, 2, 3, 0, 1, 2, 3});
16746 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
16747 {0, 1, 2, 3, 8, 9, 10, 11})) {
16748 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
16749 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
16750 OnlyUsesV1 ? V1 : V2,
16751 DAG.getIntPtrConstant(0, DL));
16752 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16753 DAG.getIntPtrConstant(4, DL));
16754 }
16755
16756 assert(WidenedMask.size() == 4);
16757
16758 // See if this is an insertion of the lower 128-bits of V2 into V1.
16759 bool IsInsert = true;
16760 int V2Index = -1;
16761 for (int i = 0; i < 4; ++i) {
16762 assert(WidenedMask[i] >= -1);
16763 if (WidenedMask[i] < 0)
16764 continue;
16765
16766 // Make sure all V1 subvectors are in place.
16767 if (WidenedMask[i] < 4) {
16768 if (WidenedMask[i] != i) {
16769 IsInsert = false;
16770 break;
16771 }
16772 } else {
16773 // Make sure we only have a single V2 index and that it's the lowest 128 bits.
16774 if (V2Index >= 0 || WidenedMask[i] != 4) {
16775 IsInsert = false;
16776 break;
16777 }
16778 V2Index = i;
16779 }
16780 }
16781 if (IsInsert && V2Index >= 0) {
16782 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16783 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
16784 DAG.getIntPtrConstant(0, DL));
16785 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
16786 }
16787
16788 // Try to lower to vshuf64x2/vshuf32x4.
16789 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
16790 unsigned PermMask = 0;
16791 // Ensure elements came from the same Op.
16792 for (int i = 0; i < 4; ++i) {
16793 assert(WidenedMask[i] >= -1);
16794 if (WidenedMask[i] < 0)
16795 continue;
16796
16797 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
16798 unsigned OpIndex = i / 2;
16799 if (Ops[OpIndex].isUndef())
16800 Ops[OpIndex] = Op;
16801 else if (Ops[OpIndex] != Op)
16802 return SDValue();
16803
16804 // Convert the 128-bit shuffle mask selection values into 128-bit selection
16805 // bits defined by a vshuf64x2 instruction's immediate control byte.
16806 PermMask |= (WidenedMask[i] % 4) << (i * 2);
16807 }
16808
16809 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
16810 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16811}
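
// A minimal, self-contained sketch (plain C++; the struct and function names
// are invented here) of how the loop above packs a widened 4 x 128-bit
// selection into the vshuf64x2 control byte: two bits per destination chunk,
// with each pair of result chunks required to read from a single operand.
#include <cstdint>
#include <optional>

struct Shuf128Match {
  bool LowHalfFromV2;   // Operand feeding result chunks 0-1.
  bool HighHalfFromV2;  // Operand feeding result chunks 2-3.
  uint8_t Imm;          // vshuf64x2 / SHUF128 control byte.
};

static std::optional<Shuf128Match> matchShuf128(const int (&WidenedMask)[4]) {
  int OpForHalf[2] = {-1, -1};  // -1 = unset, 0 = V1, 1 = V2.
  uint8_t Imm = 0;
  for (int i = 0; i < 4; ++i) {
    int M = WidenedMask[i];
    if (M < 0)
      continue;                 // Undef chunk: unconstrained.
    int Op = M >= 4 ? 1 : 0;    // Which source vector this chunk uses.
    int &Slot = OpForHalf[i / 2];
    if (Slot < 0)
      Slot = Op;
    else if (Slot != Op)
      return std::nullopt;      // A result half mixes both operands.
    Imm |= static_cast<uint8_t>(M % 4) << (2 * i);
  }
  return Shuf128Match{OpForHalf[0] == 1, OpForHalf[1] == 1, Imm};
}
// e.g. the widened mask {0, 1, 4, 5} keeps V1's low 256 bits and V2's low
// 256 bits: Imm = 0b01000100 = 0x44, low half from V1, high half from V2.
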
16812
16813/// Handle lowering of 8-lane 64-bit floating point shuffles.
16814static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16815 const APInt &Zeroable, SDValue V1, SDValue V2,
16816 const X86Subtarget &Subtarget,
16817 SelectionDAG &DAG) {
16818 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16819 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16820 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16821
16822 if (V2.isUndef()) {
16823 // Use low duplicate instructions for masks that match their pattern.
16824 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
16825 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
16826
16827 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
16828 // Non-half-crossing single input shuffles can be lowered with an
16829 // interleaved permutation.
16830 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16831 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
16832 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
16833 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
16834 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
16835 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16836 }
16837
16838 SmallVector<int, 4> RepeatedMask;
16839 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
16840 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
16841 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16842 }
16843
16844 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
16845 V2, Subtarget, DAG))
16846 return Shuf128;
16847
16848 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
16849 return Unpck;
16850
16851 // Check if the blend happens to exactly fit that of SHUFPD.
16852 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
16853 Zeroable, Subtarget, DAG))
16854 return Op;
16855
16856 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
16857 DAG, Subtarget))
16858 return V;
16859
16860 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
16861 Zeroable, Subtarget, DAG))
16862 return Blend;
16863
16864 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
16865}
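
// For a single-input v8f64 mask that never crosses a 128-bit lane, each
// destination element can only take the low or the high double of its own
// lane, so the whole shuffle collapses into the 8-bit VPERMILPD immediate
// built above (bit i set when element i takes the high double). A standalone
// restatement of that computation; -1 marks undef and is treated as "take
// the low double". The function name is invented for this sketch.
#include <cstdint>

static uint8_t vpermilpdImmV8F64(const int (&Mask)[8]) {
  uint8_t Imm = 0;
  for (int i = 0; i < 8; ++i)
    if (Mask[i] == (i | 1))  // High double of element i's 128-bit pair.
      Imm |= static_cast<uint8_t>(1) << i;
  return Imm;
}
// e.g. {1,0, 3,2, 5,4, 7,6} swaps every pair: Imm = 0b01010101 = 0x55.
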
16866
16867/// Handle lowering of 16-lane 32-bit floating point shuffles.
16868static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16869 const APInt &Zeroable, SDValue V1, SDValue V2,
16870 const X86Subtarget &Subtarget,
16871 SelectionDAG &DAG) {
16872 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16873 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16874 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16875
16876 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16877 // options to efficiently lower the shuffle.
16878 SmallVector<int, 4> RepeatedMask;
16879 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
16880 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16881
16882 // Use even/odd duplicate instructions for masks that match their pattern.
16883 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
16884 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
16885 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
16886 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
16887
16888 if (V2.isUndef())
16889 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
16890 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16891
16892 // Use dedicated unpack instructions for masks that match their pattern.
16893 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
16894 return V;
16895
16896 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16897 Zeroable, Subtarget, DAG))
16898 return Blend;
16899
16900 // Otherwise, fall back to a SHUFPS sequence.
16901 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
16902 }
16903
16904 // If we have a single input shuffle with different shuffle patterns in the
16905 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
16906 if (V2.isUndef() &&
16907 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
16908 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
16909 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
16910 }
16911
16912 // If we have AVX512F support, we can use VEXPAND.
16913 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
16914 V1, V2, DAG, Subtarget))
16915 return V;
16916
16917 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
16918}
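
// The MOVSLDUP / MOVSHDUP fast paths above fire when the repeated 4-element
// lane mask is exactly "duplicate the even elements" or "duplicate the odd
// elements", with undefs allowed to match anything. A minimal standalone
// version of that equivalence check (the helper name is invented here):
#include <array>

static bool matchesDupPattern(const std::array<int, 4> &RepeatedMask,
                              bool OddElements /* MOVSHDUP vs. MOVSLDUP */) {
  for (int i = 0; i < 4; ++i) {
    int Expected = (i & ~1) + (OddElements ? 1 : 0);  // {0,0,2,2} or {1,1,3,3}
    if (RepeatedMask[i] >= 0 && RepeatedMask[i] != Expected)
      return false;                                   // Undef (-1) matches.
  }
  return true;
}
// matchesDupPattern({0,-1,2,2}, /*OddElements=*/false) -> true  (MOVSLDUP)
// matchesDupPattern({1,1,3,3},  /*OddElements=*/true)  -> true  (MOVSHDUP)
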
16919
16920/// Handle lowering of 8-lane 64-bit integer shuffles.
16921static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16922 const APInt &Zeroable, SDValue V1, SDValue V2,
16923 const X86Subtarget &Subtarget,
16924 SelectionDAG &DAG) {
16925 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16926 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16927 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16928
16929 if (V2.isUndef()) {
16930 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16931 // can use lower-latency instructions that will operate on all four
16932 // 128-bit lanes.
16933 SmallVector<int, 2> Repeated128Mask;
16934 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
16935 SmallVector<int, 4> PSHUFDMask;
16936 scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
16937 return DAG.getBitcast(
16938 MVT::v8i64,
16939 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
16940 DAG.getBitcast(MVT::v16i32, V1),
16941 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16942 }
16943
16944 SmallVector<int, 4> Repeated256Mask;
16945 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
16946 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
16947 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
16948 }
16949
16950 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
16951 V2, Subtarget, DAG))
16952 return Shuf128;
16953
16954 // Try to use shift instructions.
16955 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
16956 Zeroable, Subtarget, DAG))
16957 return Shift;
16958
16959 // Try to use VALIGN.
16960 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
16961 Subtarget, DAG))
16962 return Rotate;
16963
16964 // Try to use PALIGNR.
16965 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
16966 Subtarget, DAG))
16967 return Rotate;
16968
16969 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
16970 return Unpck;
16971 // If we have AVX512F support, we can use VEXPAND.
16972 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
16973 DAG, Subtarget))
16974 return V;
16975
16976 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
16977 Zeroable, Subtarget, DAG))
16978 return Blend;
16979
16980 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
16981}
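
// When the v8i64 mask repeats per 128-bit lane, the 2-element per-lane
// pattern is widened by a factor of two into a 4-element i32 pattern so a
// single PSHUFD on the v16i32 bitcast can implement it. A standalone
// restatement of that scaling step (undef = -1 propagates to both halves);
// the function name is invented for this sketch.
#include <array>

static std::array<int, 4> scaleMaskBy2(const std::array<int, 2> &Repeated) {
  std::array<int, 4> Scaled;
  for (int i = 0; i < 2; ++i) {
    int M = Repeated[i];
    Scaled[2 * i]     = M < 0 ? -1 : 2 * M;
    Scaled[2 * i + 1] = M < 0 ? -1 : 2 * M + 1;
  }
  return Scaled;
}
// e.g. the per-lane i64 pattern {1,0} becomes the i32 pattern {2,3,0,1},
// i.e. PSHUFD imm8 0x4E on the v16i32 view of the vector.
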
16982
16983/// Handle lowering of 16-lane 32-bit integer shuffles.
16984static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16985 const APInt &Zeroable, SDValue V1, SDValue V2,
16986 const X86Subtarget &Subtarget,
16987 SelectionDAG &DAG) {
16988 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16989 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16990 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16991
16992 // Whenever we can lower this as a zext, that instruction is strictly faster
16993 // than any alternative. It also allows us to fold memory operands into the
16994 // shuffle in many cases.
16995 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16996 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
16997 return ZExt;
16998
16999 // If the shuffle mask is repeated in each 128-bit lane we can use more
17000 // efficient instructions that mirror the shuffles across the four 128-bit
17001 // lanes.
17002 SmallVector<int, 4> RepeatedMask;
17003 bool Is128BitLaneRepeatedShuffle =
17004 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17005 if (Is128BitLaneRepeatedShuffle) {
17006 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17007 if (V2.isUndef())
17008 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17009 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17010
17011 // Use dedicated unpack instructions for masks that match their pattern.
17012 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
17013 return V;
17014 }
17015
17016 // Try to use shift instructions.
17017 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
17018 Zeroable, Subtarget, DAG))
17019 return Shift;
17020
17021 // Try to use VALIGN.
17022 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17023 Subtarget, DAG))
17024 return Rotate;
17025
17026 // Try to use byte rotation instructions.
17027 if (Subtarget.hasBWI())
17028 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17029 Subtarget, DAG))
17030 return Rotate;
17031
17032 // Assume that a single SHUFPS is faster than using a permv shuffle.
17033 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17034 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17035 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17036 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17037 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17038 CastV1, CastV2, DAG);
17039 return DAG.getBitcast(MVT::v16i32, ShufPS);
17040 }
17041 // If we have AVX512F support, we can use VEXPAND.
17042 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
17043 DAG, Subtarget))
17044 return V;
17045
17046 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17047 Zeroable, Subtarget, DAG))
17048 return Blend;
17049 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
17050}
17051
17052/// Handle lowering of 32-lane 16-bit integer shuffles.
17053static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17054 const APInt &Zeroable, SDValue V1, SDValue V2,
17055 const X86Subtarget &Subtarget,
17056 SelectionDAG &DAG) {
17057 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17058 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17059 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17060 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17061
17062 // Whenever we can lower this as a zext, that instruction is strictly faster
17063 // than any alternative. It also allows us to fold memory operands into the
17064 // shuffle in many cases.
17065 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17066 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17067 return ZExt;
17068
17069 // Use dedicated unpack instructions for masks that match their pattern.
17070 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
17071 return V;
17072
17073 // Try to use shift instructions.
17074 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
17075 Zeroable, Subtarget, DAG))
17076 return Shift;
17077
17078 // Try to use byte rotation instructions.
17079 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17080 Subtarget, DAG))
17081 return Rotate;
17082
17083 if (V2.isUndef()) {
17084 // Try to use bit rotation instructions.
17085 if (SDValue Rotate =
17086 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17087 return Rotate;
17088
17089 SmallVector<int, 8> RepeatedMask;
17090 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17091 // As this is a single-input shuffle, the repeated mask should be
17092 // a strictly valid v8i16 mask that we can pass through to the v8i16
17093 // lowering to handle even the v32 case.
17094 return lowerV8I16GeneralSingleInputShuffle(
17095 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
17096 }
17097 }
17098
17099 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17100 Zeroable, Subtarget, DAG))
17101 return Blend;
17102
17103 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17104 Zeroable, Subtarget, DAG))
17105 return PSHUFB;
17106
17107 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
17108}
17109
17110/// Handle lowering of 64-lane 8-bit integer shuffles.
17111static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17112 const APInt &Zeroable, SDValue V1, SDValue V2,
17113 const X86Subtarget &Subtarget,
17114 SelectionDAG &DAG) {
17115 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17116 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17117 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17118 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17119
17120 // Whenever we can lower this as a zext, that instruction is strictly faster
17121 // than any alternative. It also allows us to fold memory operands into the
17122 // shuffle in many cases.
17123 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17124 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17125 return ZExt;
17126
17127 // Use dedicated unpack instructions for masks that match their pattern.
17128 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
17129 return V;
17130
17131 // Use dedicated pack instructions for masks that match their pattern.
17132 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
17133 Subtarget))
17134 return V;
17135
17136 // Try to use shift instructions.
17137 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
17138 Zeroable, Subtarget, DAG))
17139 return Shift;
17140
17141 // Try to use byte rotation instructions.
17142 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17143 Subtarget, DAG))
17144 return Rotate;
17145
17146 // Try to use bit rotation instructions.
17147 if (V2.isUndef())
17148 if (SDValue Rotate =
17149 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17150 return Rotate;
17151
17152 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17153 Zeroable, Subtarget, DAG))
17154 return PSHUFB;
17155
17156 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17157 if (Subtarget.hasVBMI())
17158 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
17159
17160 // Try to create an in-lane repeating shuffle mask and then shuffle the
17161 // results into the target lanes.
17162 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17163 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17164 return V;
17165
17166 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17167 Zeroable, Subtarget, DAG))
17168 return Blend;
17169
17170 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17171 // shuffle.
17172 if (!V2.isUndef())
17173 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17174 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17175 return Result;
17176
17177 // FIXME: Implement direct support for this type!
17178 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
17179}
17180
17181/// High-level routine to lower various 512-bit x86 vector shuffles.
17182///
17183/// This routine either breaks down the specific type of a 512-bit x86 vector
17184/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17185/// together based on the available instructions.
17186static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17187 MVT VT, SDValue V1, SDValue V2,
17188 const APInt &Zeroable,
17189 const X86Subtarget &Subtarget,
17190 SelectionDAG &DAG) {
17191 assert(Subtarget.hasAVX512() &&
17192 "Cannot lower 512-bit vectors w/ basic ISA!");
17193
17194 // If we have a single input to the zero element, insert that into V1 if we
17195 // can do so cheaply.
17196 int NumElts = Mask.size();
17197 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17198
17199 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17200 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17201 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17202 return Insertion;
17203
17204 // Handle special cases where the lower or upper half is UNDEF.
17205 if (SDValue V =
17206 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17207 return V;
17208
17209 // Check for being able to broadcast a single element.
17210 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17211 Subtarget, DAG))
17212 return Broadcast;
17213
17214 // Dispatch to each element type for lowering. If we don't have support for
17215 // specific element type shuffles at 512 bits, immediately split them and
17216 // lower them. Each lowering routine of a given type is allowed to assume that
17217 // the requisite ISA extensions for that element type are available.
17218 switch (VT.SimpleTy) {
17219 case MVT::v8f64:
17220 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17221 case MVT::v16f32:
17222 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17223 case MVT::v8i64:
17224 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17225 case MVT::v16i32:
17226 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17227 case MVT::v32i16:
17228 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17229 case MVT::v64i8:
17230 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17231
17232 default:
17233 llvm_unreachable("Not a valid 512-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 512-bit x86 vector type!"
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17233)
;
17234 }
17235}
17236
17237static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17238 MVT VT, SDValue V1, SDValue V2,
17239 const X86Subtarget &Subtarget,
17240 SelectionDAG &DAG) {
17241 // Shuffle should be unary.
17242 if (!V2.isUndef())
17243 return SDValue();
17244
17245 int ShiftAmt = -1;
17246 int NumElts = Mask.size();
17247 for (int i = 0; i != NumElts; ++i) {
17248 int M = Mask[i];
17249 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17250 "Unexpected mask index.");
17251 if (M < 0)
17252 continue;
17253
17254 // The first non-undef element determines our shift amount.
17255 if (ShiftAmt < 0) {
17256 ShiftAmt = M - i;
17257 // Need to be shifting right.
17258 if (ShiftAmt <= 0)
17259 return SDValue();
17260 }
17261 // All non-undef elements must shift by the same amount.
17262 if (ShiftAmt != M - i)
17263 return SDValue();
17264 }
17265 assert(ShiftAmt >= 0 && "All undef?");
17266
17267 // Great, we found a right shift.
17268 MVT WideVT = VT;
17269 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
17270 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
17271 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
17272 DAG.getUNDEF(WideVT), V1,
17273 DAG.getIntPtrConstant(0, DL));
17274 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
17275 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17276 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17277 DAG.getIntPtrConstant(0, DL));
17278}
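
// The routine above recognizes a unary mask-register shuffle as a right
// shift: the first defined element fixes the shift amount and every other
// defined element must agree with it. A minimal standalone version of that
// scan (plain C++; returns -1 when the mask is not a positive, uniform right
// shift or is entirely undef):
#include <vector>

static int matchUniformRightShift(const std::vector<int> &Mask) {
  int ShiftAmt = -1;
  for (int i = 0, e = static_cast<int>(Mask.size()); i != e; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                 // Undef elements are unconstrained.
    if (ShiftAmt < 0) {
      ShiftAmt = M - i;         // First defined element sets the amount...
      if (ShiftAmt <= 0)
        return -1;              // ...which must actually shift right.
    } else if (ShiftAmt != M - i) {
      return -1;                // All defined elements must agree.
    }
  }
  return ShiftAmt;
}
// e.g. {2,3,4,5,6,7,-1,-1} on a v8i1 mask matches a KSHIFTR by 2.
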
17279
17280// Determine if this shuffle can be implemented with a KSHIFT instruction.
17281// Returns the shift amount if possible or -1 if not. This is a simplified
17282// version of matchShuffleAsShift.
17283static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17284 int MaskOffset, const APInt &Zeroable) {
17285 int Size = Mask.size();
17286
17287 auto CheckZeros = [&](int Shift, bool Left) {
17288 for (int j = 0; j < Shift; ++j)
17289 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17290 return false;
17291
17292 return true;
17293 };
17294
17295 auto MatchShift = [&](int Shift, bool Left) {
17296 unsigned Pos = Left ? Shift : 0;
17297 unsigned Low = Left ? 0 : Shift;
17298 unsigned Len = Size - Shift;
17299 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17300 };
17301
17302 for (int Shift = 1; Shift != Size; ++Shift)
17303 for (bool Left : {true, false})
17304 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17305 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17306 return Shift;
17307 }
17308
17309 return -1;
17310}
17311
17312
17313// Lower vXi1 vector shuffles.
17314 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17315 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17316 // vector, shuffle it, and then truncate it back.
17317static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17318 MVT VT, SDValue V1, SDValue V2,
17319 const APInt &Zeroable,
17320 const X86Subtarget &Subtarget,
17321 SelectionDAG &DAG) {
17322 assert(Subtarget.hasAVX512() &&
17323 "Cannot lower 512-bit vectors w/o basic ISA!");
17324
17325 int NumElts = Mask.size();
17326
17327 // Try to recognize shuffles that are just padding a subvector with zeros.
17328 int SubvecElts = 0;
17329 int Src = -1;
17330 for (int i = 0; i != NumElts; ++i) {
17331 if (Mask[i] >= 0) {
17332 // Grab the source from the first valid mask. All subsequent elements need
17333 // to use this same source.
17334 if (Src < 0)
17335 Src = Mask[i] / NumElts;
17336 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17337 break;
17338 }
17339
17340 ++SubvecElts;
17341 }
17342 assert(SubvecElts != NumElts && "Identity shuffle?");
17343
17344 // Clip to a power of 2.
17345 SubvecElts = PowerOf2Floor(SubvecElts);
17346
17347 // Make sure the number of zeroable bits in the top at least covers the bits
17348 // not covered by the subvector.
17349 if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
17350 assert(Src >= 0 && "Expected a source!");
17351 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
17352 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
17353 Src == 0 ? V1 : V2,
17354 DAG.getIntPtrConstant(0, DL));
17355 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17356 DAG.getConstant(0, DL, VT),
17357 Extract, DAG.getIntPtrConstant(0, DL));
17358 }
17359
17360 // Try a simple shift right with undef elements. Later we'll try with zeros.
17361 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
17362 DAG))
17363 return Shift;
17364
17365 // Try to match KSHIFTs.
17366 unsigned Offset = 0;
17367 for (SDValue V : { V1, V2 }) {
17368 unsigned Opcode;
17369 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
17370 if (ShiftAmt >= 0) {
17371 MVT WideVT = VT;
17372 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
17373 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
17374 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
17375 DAG.getUNDEF(WideVT), V,
17376 DAG.getIntPtrConstant(0, DL));
17377 // Widened right shifts need two shifts to ensure we shift in zeroes.
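// For example, widening v4i1 to v16i1 for a KSHIFTR by 1: first KSHIFTL by 12
// to place the original bits in the MSBs, then KSHIFTR by 13 so zeroes (not
// undef bits) are shifted into the low lanes.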
17378 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
17379 int WideElts = WideVT.getVectorNumElements();
17380 // Shift left to put the original vector in the MSBs of the new size.
17381 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
17382 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
17383 // Increase the shift amount to account for the left shift.
17384 ShiftAmt += WideElts - NumElts;
17385 }
17386
17387 Res = DAG.getNode(Opcode, DL, WideVT, Res,
17388 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17389 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17390 DAG.getIntPtrConstant(0, DL));
17391 }
17392 Offset += NumElts; // Increment for next iteration.
17393 }
17394
17395
17396
17397 MVT ExtVT;
17398 switch (VT.SimpleTy) {
17399 default:
17400 llvm_unreachable("Expected a vector of i1 elements")::llvm::llvm_unreachable_internal("Expected a vector of i1 elements"
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17400)
;
17401 case MVT::v2i1:
17402 ExtVT = MVT::v2i64;
17403 break;
17404 case MVT::v4i1:
17405 ExtVT = MVT::v4i32;
17406 break;
17407 case MVT::v8i1:
17408 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
17409 // shuffle.
17410 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
17411 break;
17412 case MVT::v16i1:
17413 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17414 // 256-bit operation available.
17415 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
17416 break;
17417 case MVT::v32i1:
17418 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17419 // 256-bit operation available.
17420     assert(Subtarget.hasBWI() && "Expected AVX512BW support");
17421 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
17422 break;
17423 case MVT::v64i1:
17424 // Fall back to scalarization. FIXME: We can do better if the shuffle
17425 // can be partitioned cleanly.
17426 if (!Subtarget.useBWIRegs())
17427 return SDValue();
17428 ExtVT = MVT::v64i8;
17429 break;
17430 }
17431
17432 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
17433 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
17434
17435 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
17436   // Since the i1 elements were sign-extended, a compare against zero recovers the mask.
17437 int NumElems = VT.getVectorNumElements();
17438 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
17439 (Subtarget.hasDQI() && (NumElems < 32)))
17440 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
17441 Shuffle, ISD::SETGT);
17442
17443 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
17444}
17445
17446/// Helper function that returns true if the shuffle mask should be
17447/// commuted to improve canonicalization.
17448static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
17449 int NumElements = Mask.size();
17450
17451 int NumV1Elements = 0, NumV2Elements = 0;
17452 for (int M : Mask)
17453 if (M < 0)
17454 continue;
17455 else if (M < NumElements)
17456 ++NumV1Elements;
17457 else
17458 ++NumV2Elements;
17459
17460 // Commute the shuffle as needed such that more elements come from V1 than
17461 // V2. This allows us to match the shuffle pattern strictly on how many
17462 // elements come from V1 without handling the symmetric cases.
17463 if (NumV2Elements > NumV1Elements)
17464 return true;
17465
17466   assert(NumV1Elements > 0 && "No V1 indices");
17467
17468 if (NumV2Elements == 0)
17469 return false;
17470
17471   // When the numbers of V1 and V2 elements are the same, try to minimize the
17472   // number of uses of V2 in the low half of the vector. When that is tied,
17473   // ensure that the sum of indices for V1 is equal to or lower than the sum of
17474   // indices for V2. When those are equal, try to ensure that the number of odd
17475 // indices for V1 is lower than the number of odd indices for V2.
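// For example, with four elements the mask <4,5,0,1> uses two lanes from each
// input but puts both V2 lanes in the low half, so the shuffle gets commuted.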
17476 if (NumV1Elements == NumV2Elements) {
17477 int LowV1Elements = 0, LowV2Elements = 0;
17478 for (int M : Mask.slice(0, NumElements / 2))
17479 if (M >= NumElements)
17480 ++LowV2Elements;
17481 else if (M >= 0)
17482 ++LowV1Elements;
17483 if (LowV2Elements > LowV1Elements)
17484 return true;
17485 if (LowV2Elements == LowV1Elements) {
17486 int SumV1Indices = 0, SumV2Indices = 0;
17487 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17488 if (Mask[i] >= NumElements)
17489 SumV2Indices += i;
17490 else if (Mask[i] >= 0)
17491 SumV1Indices += i;
17492 if (SumV2Indices < SumV1Indices)
17493 return true;
17494 if (SumV2Indices == SumV1Indices) {
17495 int NumV1OddIndices = 0, NumV2OddIndices = 0;
17496 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17497 if (Mask[i] >= NumElements)
17498 NumV2OddIndices += i % 2;
17499 else if (Mask[i] >= 0)
17500 NumV1OddIndices += i % 2;
17501 if (NumV2OddIndices < NumV1OddIndices)
17502 return true;
17503 }
17504 }
17505 }
17506
17507 return false;
17508}
17509
17510/// Top-level lowering for x86 vector shuffles.
17511///
17512/// This handles decomposition, canonicalization, and lowering of all x86
17513/// vector shuffles. Most of the specific lowering strategies are encapsulated
17514/// above in helper routines. The canonicalization attempts to widen shuffles
17515/// to involve fewer lanes of wider elements, consolidate symmetric patterns
17516/// s.t. only one of the two inputs needs to be tested, etc.
17517static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
17518 SelectionDAG &DAG) {
17519 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
17520 ArrayRef<int> OrigMask = SVOp->getMask();
17521 SDValue V1 = Op.getOperand(0);
17522 SDValue V2 = Op.getOperand(1);
17523 MVT VT = Op.getSimpleValueType();
17524 int NumElements = VT.getVectorNumElements();
17525 SDLoc DL(Op);
17526 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
17527
17528   assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
17529          "Can't lower MMX shuffles");
17530
17531 bool V1IsUndef = V1.isUndef();
17532 bool V2IsUndef = V2.isUndef();
17533 if (V1IsUndef && V2IsUndef)
17534 return DAG.getUNDEF(VT);
17535
17536   // When we create a shuffle node we put the UNDEF node in the second operand,
17537 // but in some cases the first operand may be transformed to UNDEF.
17538 // In this case we should just commute the node.
17539 if (V1IsUndef)
17540 return DAG.getCommutedVectorShuffle(*SVOp);
17541
17542 // Check for non-undef masks pointing at an undef vector and make the masks
17543 // undef as well. This makes it easier to match the shuffle based solely on
17544 // the mask.
17545 if (V2IsUndef &&
17546 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
17547 SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
17548 for (int &M : NewMask)
17549 if (M >= NumElements)
17550 M = -1;
17551 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17552 }
17553
17554 // Check for illegal shuffle mask element index values.
17555 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
17556 (void)MaskUpperLimit;
17557   assert(llvm::all_of(OrigMask,
17558                       [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17559          "Out of bounds shuffle index");
17560
17561 // We actually see shuffles that are entirely re-arrangements of a set of
17562 // zero inputs. This mostly happens while decomposing complex shuffles into
17563 // simple ones. Directly lower these as a buildvector of zeros.
17564 APInt KnownUndef, KnownZero;
17565 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
17566
17567 APInt Zeroable = KnownUndef | KnownZero;
17568 if (Zeroable.isAllOnesValue())
17569 return getZeroVector(VT, Subtarget, DAG, DL);
17570
17571 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
17572
17573 // Try to collapse shuffles into using a vector type with fewer elements but
17574 // wider element types. We cap this to not form integers or floating point
17575 // elements wider than 64 bits, but it might be interesting to form i128
17576 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
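// For example, a v4i32 shuffle with mask <0,1,4,5> can be widened to a v2i64
// shuffle with mask <0,2>, halving the number of lanes to lower.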
17577 SmallVector<int, 16> WidenedMask;
17578 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
17579 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
17580 // Shuffle mask widening should not interfere with a broadcast opportunity
17581 // by obfuscating the operands with bitcasts.
17582 // TODO: Avoid lowering directly from this top-level function: make this
17583 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
17584 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
17585 Subtarget, DAG))
17586 return Broadcast;
17587
17588 MVT NewEltVT = VT.isFloatingPoint()
17589 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
17590 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
17591 int NewNumElts = NumElements / 2;
17592 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
17593 // Make sure that the new vector type is legal. For example, v2f64 isn't
17594 // legal on SSE1.
17595 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
17596 if (V2IsZero) {
17597 // Modify the new Mask to take all zeros from the all-zero vector.
17598 // Choose indices that are blend-friendly.
17599 bool UsedZeroVector = false;
17600       assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
17601              "V2's non-undef elements are used?!");
17602 for (int i = 0; i != NewNumElts; ++i)
17603 if (WidenedMask[i] == SM_SentinelZero) {
17604 WidenedMask[i] = i + NewNumElts;
17605 UsedZeroVector = true;
17606 }
17607 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
17608 // some elements to be undef.
17609 if (UsedZeroVector)
17610 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
17611 }
17612 V1 = DAG.getBitcast(NewVT, V1);
17613 V2 = DAG.getBitcast(NewVT, V2);
17614 return DAG.getBitcast(
17615 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
17616 }
17617 }
17618
17619 // Commute the shuffle if it will improve canonicalization.
17620 SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
17621 if (canonicalizeShuffleMaskWithCommute(Mask)) {
17622 ShuffleVectorSDNode::commuteMask(Mask);
17623 std::swap(V1, V2);
17624 }
17625
17626 if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
17627 return V;
17628
17629 // For each vector width, delegate to a specialized lowering routine.
17630 if (VT.is128BitVector())
17631 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17632
17633 if (VT.is256BitVector())
17634 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17635
17636 if (VT.is512BitVector())
17637 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17638
17639 if (Is1BitVector)
17640 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17641
17642 llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17642)
;
17643}
17644
17645/// Try to lower a VSELECT instruction to a vector shuffle.
17646static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
17647 const X86Subtarget &Subtarget,
17648 SelectionDAG &DAG) {
17649 SDValue Cond = Op.getOperand(0);
17650 SDValue LHS = Op.getOperand(1);
17651 SDValue RHS = Op.getOperand(2);
17652 MVT VT = Op.getSimpleValueType();
17653
17654   // Only non-legal VSELECTs reach this lowering; convert those into generic
17655 // shuffles and re-use the shuffle lowering path for blends.
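// For example, a v4i32 vselect whose constant condition is <-1,0,-1,0> becomes
// the shuffle mask <0,5,2,7>: true lanes pick from LHS, false lanes from RHS.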
17656 SmallVector<int, 32> Mask;
17657 if (createShuffleMaskFromVSELECT(Mask, Cond))
17658 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
17659
17660 return SDValue();
17661}
17662
17663SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
17664 SDValue Cond = Op.getOperand(0);
17665 SDValue LHS = Op.getOperand(1);
17666 SDValue RHS = Op.getOperand(2);
17667
17668 // A vselect where all conditions and data are constants can be optimized into
17669 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
17670 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
17671 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
17672 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
17673 return SDValue();
17674
17675 // Try to lower this to a blend-style vector shuffle. This can handle all
17676 // constant condition cases.
17677 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
17678 return BlendOp;
17679
17680   // If this VSELECT has a vector of i1 as a mask, it will be directly matched
17681 // with patterns on the mask registers on AVX-512.
17682 MVT CondVT = Cond.getSimpleValueType();
17683 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
17684 if (CondEltSize == 1)
17685 return Op;
17686
17687 // Variable blends are only legal from SSE4.1 onward.
17688 if (!Subtarget.hasSSE41())
17689 return SDValue();
17690
17691 SDLoc dl(Op);
17692 MVT VT = Op.getSimpleValueType();
17693 unsigned EltSize = VT.getScalarSizeInBits();
17694 unsigned NumElts = VT.getVectorNumElements();
17695
17696 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
17697 // into an i1 condition so that we can use the mask-based 512-bit blend
17698 // instructions.
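// For example, a v16i32 select builds a v16i1 mask by comparing the condition
// against zero with SETNE and then re-issues the select on that mask.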
17699 if (VT.getSizeInBits() == 512) {
17700 // Build a mask by testing the condition against zero.
17701 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
17702 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
17703 DAG.getConstant(0, dl, CondVT),
17704 ISD::SETNE);
17705 // Now return a new VSELECT using the mask.
17706 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
17707 }
17708
17709 // SEXT/TRUNC cases where the mask doesn't match the destination size.
17710 if (CondEltSize != EltSize) {
17711 // If we don't have a sign splat, rely on the expansion.
17712 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
17713 return SDValue();
17714
17715 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
17716 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
17717 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
17718 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
17719 }
17720
17721 // Only some types will be legal on some subtargets. If we can emit a legal
17722   // VSELECT-matching blend, return Op; but if we need to expand, return
17723 // a null value.
17724 switch (VT.SimpleTy) {
17725 default:
17726 // Most of the vector types have blends past SSE4.1.
17727 return Op;
17728
17729 case MVT::v32i8:
17730 // The byte blends for AVX vectors were introduced only in AVX2.
17731 if (Subtarget.hasAVX2())
17732 return Op;
17733
17734 return SDValue();
17735
17736 case MVT::v8i16:
17737 case MVT::v16i16: {
17738 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
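// This relies on each i16 condition lane being an all-zeros/all-ones value, so
// both i8 halves of a lane select the same input after the bitcast.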
17739 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
17740 Cond = DAG.getBitcast(CastVT, Cond);
17741 LHS = DAG.getBitcast(CastVT, LHS);
17742 RHS = DAG.getBitcast(CastVT, RHS);
17743 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
17744 return DAG.getBitcast(VT, Select);
17745 }
17746 }
17747}
17748
17749static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
17750 MVT VT = Op.getSimpleValueType();
17751 SDLoc dl(Op);
17752
17753 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
17754 return SDValue();
17755
17756 if (VT.getSizeInBits() == 8) {
17757 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
17758 Op.getOperand(0), Op.getOperand(1));
17759 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
17760 }
17761
17762 if (VT == MVT::f32) {
17763 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
17764     // the result back to an FR32 register. It's only worth matching if the
17765 // result has a single use which is a store or a bitcast to i32. And in
17766 // the case of a store, it's not worth it if the index is a constant 0,
17767 // because a MOVSSmr can be used instead, which is smaller and faster.
17768 if (!Op.hasOneUse())
17769 return SDValue();
17770 SDNode *User = *Op.getNode()->use_begin();
17771 if ((User->getOpcode() != ISD::STORE ||
17772 isNullConstant(Op.getOperand(1))) &&
17773 (User->getOpcode() != ISD::BITCAST ||
17774 User->getValueType(0) != MVT::i32))
17775 return SDValue();
17776 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17777 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
17778 Op.getOperand(1));
17779 return DAG.getBitcast(MVT::f32, Extract);
17780 }
17781
17782 if (VT == MVT::i32 || VT == MVT::i64) {
17783     // ExtractPS/pextrq work with a constant index.
17784 if (isa<ConstantSDNode>(Op.getOperand(1)))
17785 return Op;
17786 }
17787
17788 return SDValue();
17789}
17790
17791/// Extract one bit from mask vector, like v16i1 or v8i1.
17792/// AVX-512 feature.
17793static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
17794 const X86Subtarget &Subtarget) {
17795 SDValue Vec = Op.getOperand(0);
17796 SDLoc dl(Vec);
17797 MVT VecVT = Vec.getSimpleValueType();
17798 SDValue Idx = Op.getOperand(1);
17799 MVT EltVT = Op.getSimpleValueType();
17800
17801   assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
17802          "Unexpected vector type in ExtractBitFromMaskVector");
17803
17804   // A variable index can't be handled in mask registers,
17805   // so extend the vector to VR512/VR128.
17806 if (!isa<ConstantSDNode>(Idx)) {
17807 unsigned NumElts = VecVT.getVectorNumElements();
17808     // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
17809     // than extending to 128/256-bit.
17810 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
17811 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
17812 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
17813 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
17814 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
17815 }
17816
17817 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
17818 if (IdxVal == 0) // the operation is legal
17819 return Op;
17820
17821 // Extend to natively supported kshift.
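// For example, a v4i1 source is first widened to v16i1 (v8i1 with DQI), since
// KSHIFTR only exists for the natively supported mask widths.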
17822 unsigned NumElems = VecVT.getVectorNumElements();
17823 MVT WideVecVT = VecVT;
17824 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
17825 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
17826 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
17827 DAG.getUNDEF(WideVecVT), Vec,
17828 DAG.getIntPtrConstant(0, dl));
17829 }
17830
17831 // Use kshiftr instruction to move to the lower element.
17832 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
17833 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17834
17835 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
17836 DAG.getIntPtrConstant(0, dl));
17837}
17838
17839SDValue
17840X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
17841 SelectionDAG &DAG) const {
17842 SDLoc dl(Op);
17843 SDValue Vec = Op.getOperand(0);
17844 MVT VecVT = Vec.getSimpleValueType();
17845 SDValue Idx = Op.getOperand(1);
17846
17847 if (VecVT.getVectorElementType() == MVT::i1)
17848 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
17849
17850 if (!isa<ConstantSDNode>(Idx)) {
17851     // It's more profitable to go through memory (1 cycle throughput)
17852     // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
17853     // The IACA tool was used to get the performance estimate
17854 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
17855 //
17856 // example : extractelement <16 x i8> %a, i32 %i
17857 //
17858 // Block Throughput: 3.00 Cycles
17859 // Throughput Bottleneck: Port5
17860 //
17861 // | Num Of | Ports pressure in cycles | |
17862 // | Uops | 0 - DV | 5 | 6 | 7 | |
17863 // ---------------------------------------------
17864 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
17865 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
17866 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
17867 // Total Num Of Uops: 4
17868 //
17869 //
17870 // Block Throughput: 1.00 Cycles
17871 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
17872 //
17873 // | | Ports pressure in cycles | |
17874 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
17875 // ---------------------------------------------------------
17876 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
17877 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
17878 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
17879 // Total Num Of Uops: 4
17880
17881 return SDValue();
17882 }
17883
17884 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
17885
17886 // If this is a 256-bit vector result, first extract the 128-bit vector and
17887 // then extract the element from the 128-bit vector.
17888 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
17889 // Get the 128-bit vector.
17890 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
17891 MVT EltVT = VecVT.getVectorElementType();
17892
17893 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
17894     assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
17895
17896 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
17897 // this can be done with a mask.
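// For example, extracting element 13 of a v16i16 takes the upper 128-bit half
// and then extracts element 13 & 7 == 5 from that v8i16.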
17898 IdxVal &= ElemsPerChunk - 1;
17899 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
17900 DAG.getIntPtrConstant(IdxVal, dl));
17901 }
17902
17903   assert(VecVT.is128BitVector() && "Unexpected vector length");
17904
17905 MVT VT = Op.getSimpleValueType();
17906
17907 if (VT.getSizeInBits() == 16) {
17908 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
17909 // we're going to zero extend the register or fold the store (SSE41 only).
17910 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
17911 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
17912 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
17913 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17914 DAG.getBitcast(MVT::v4i32, Vec), Idx));
17915
17916     // Transform it so it matches pextrw, which produces a 32-bit result.
17917 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
17918 Op.getOperand(0), Op.getOperand(1));
17919 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
17920 }
17921
17922 if (Subtarget.hasSSE41())
17923 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
17924 return Res;
17925
17926   // TODO: We only extract a single element from v16i8, so we can probably afford
17927   // to be more aggressive here before using the default approach of spilling to
17928   // the stack.
17929 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
17930 // Extract either the lowest i32 or any i16, and extract the sub-byte.
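// For example, extracting byte 5 reads i16 element 2 (5 / 2), shifts right by
// 8 to reach the odd byte, and truncates the result to i8.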
17931 int DWordIdx = IdxVal / 4;
17932 if (DWordIdx == 0) {
17933 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17934 DAG.getBitcast(MVT::v4i32, Vec),
17935 DAG.getIntPtrConstant(DWordIdx, dl));
17936 int ShiftVal = (IdxVal % 4) * 8;
17937 if (ShiftVal != 0)
17938 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
17939 DAG.getConstant(ShiftVal, dl, MVT::i8));
17940 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
17941 }
17942
17943 int WordIdx = IdxVal / 2;
17944 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
17945 DAG.getBitcast(MVT::v8i16, Vec),
17946 DAG.getIntPtrConstant(WordIdx, dl));
17947 int ShiftVal = (IdxVal % 2) * 8;
17948 if (ShiftVal != 0)
17949 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
17950 DAG.getConstant(ShiftVal, dl, MVT::i8));
17951 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
17952 }
17953
17954 if (VT.getSizeInBits() == 32) {
17955 if (IdxVal == 0)
17956 return Op;
17957
17958 // SHUFPS the element to the lowest double word, then movss.
17959 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
17960 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
17961 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
17962 DAG.getIntPtrConstant(0, dl));
17963 }
17964
17965 if (VT.getSizeInBits() == 64) {
17966 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
17967 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
17968 // to match extract_elt for f64.
17969 if (IdxVal == 0)
17970 return Op;
17971
17972 // UNPCKHPD the element to the lowest double word, then movsd.
17973 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
17974 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
17975 int Mask[2] = { 1, -1 };
17976 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
17977 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
17978 DAG.getIntPtrConstant(0, dl));
17979 }
17980
17981 return SDValue();
17982}
17983
17984/// Insert one bit to mask vector, like v16i1 or v8i1.
17985/// AVX-512 feature.
17986static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
17987 const X86Subtarget &Subtarget) {
17988 SDLoc dl(Op);
17989 SDValue Vec = Op.getOperand(0);
17990 SDValue Elt = Op.getOperand(1);
17991 SDValue Idx = Op.getOperand(2);
17992 MVT VecVT = Vec.getSimpleValueType();
17993
17994 if (!isa<ConstantSDNode>(Idx)) {
17995     // Non-constant index. Extend source and destination,
17996 // insert element and then truncate the result.
17997 unsigned NumElts = VecVT.getVectorNumElements();
17998 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
17999 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18000 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18001 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18002 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18003 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18004 }
18005
18006 // Copy into a k-register, extract to v1i1 and insert_subvector.
18007 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18008
18009 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
18010 Op.getOperand(2));
18011}
18012
18013SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18014 SelectionDAG &DAG) const {
18015 MVT VT = Op.getSimpleValueType();
18016 MVT EltVT = VT.getVectorElementType();
18017 unsigned NumElts = VT.getVectorNumElements();
18018
18019 if (EltVT == MVT::i1)
18020 return InsertBitToMaskVector(Op, DAG, Subtarget);
18021
18022 SDLoc dl(Op);
18023 SDValue N0 = Op.getOperand(0);
18024 SDValue N1 = Op.getOperand(1);
18025 SDValue N2 = Op.getOperand(2);
18026
18027 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18028 if (!N2C || N2C->getAPIntValue().uge(NumElts))
18029 return SDValue();
18030 uint64_t IdxVal = N2C->getZExtValue();
18031
18032 bool IsZeroElt = X86::isZeroNode(N1);
18033 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18034
18035   // If we are inserting an element, see if we can do this more efficiently with
18036 // a blend shuffle with a rematerializable vector than a costly integer
18037 // insertion.
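// For example, inserting zero into lane 2 of a v8i16 becomes a shuffle with
// mask <0,1,10,3,4,5,6,7> against an all-zeros vector.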
18038 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
18039 16 <= EltVT.getSizeInBits()) {
18040 SmallVector<int, 8> BlendMask;
18041 for (unsigned i = 0; i != NumElts; ++i)
18042 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18043 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18044 : getOnesVector(VT, DAG, dl);
18045 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18046 }
18047
18048 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18049 // into that, and then insert the subvector back into the result.
18050 if (VT.is256BitVector() || VT.is512BitVector()) {
18051 // With a 256-bit vector, we can insert into the zero element efficiently
18052 // using a blend if we have AVX or AVX2 and the right data type.
18053 if (VT.is256BitVector() && IdxVal == 0) {
18054 // TODO: It is worthwhile to cast integer to floating point and back
18055 // and incur a domain crossing penalty if that's what we'll end up
18056 // doing anyway after extracting to a 128-bit vector.
18057 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18058 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
18059 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18060 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18061 DAG.getTargetConstant(1, dl, MVT::i8));
18062 }
18063 }
18064
18065 // Get the desired 128-bit vector chunk.
18066 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
18067
18068 // Insert the element into the desired chunk.
18069 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
18070     assert(isPowerOf2_32(NumEltsIn128));
18071 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
18072 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18073
18074 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
18075 DAG.getIntPtrConstant(IdxIn128, dl));
18076
18077 // Insert the changed part back into the bigger vector
18078 return insert128BitVector(N0, V, IdxVal, DAG, dl);
18079 }
18080   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18081
18082 // This will be just movd/movq/movss/movsd.
18083 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
18084 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18085 EltVT == MVT::i64) {
18086 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18087 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18088 }
18089
18090 // We can't directly insert an i8 or i16 into a vector, so zero extend
18091 // it to i32 first.
18092 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18093 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
18094 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
18095 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
18096 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18097 return DAG.getBitcast(VT, N1);
18098 }
18099 }
18100
18101   // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
18102 // argument. SSE41 required for pinsrb.
18103 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
18104 unsigned Opc;
18105 if (VT == MVT::v8i16) {
18106       assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
18107 Opc = X86ISD::PINSRW;
18108 } else {
18109       assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
18110       assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
18111 Opc = X86ISD::PINSRB;
18112 }
18113
18114 if (N1.getValueType() != MVT::i32)
18115 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
18116 if (N2.getValueType() != MVT::i32)
18117 N2 = DAG.getIntPtrConstant(IdxVal, dl);
18118 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
18119 }
18120
18121 if (Subtarget.hasSSE41()) {
18122 if (EltVT == MVT::f32) {
18123 // Bits [7:6] of the constant are the source select. This will always be
18124 // zero here. The DAG Combiner may combine an extract_elt index into
18125 // these bits. For example (insert (extract, 3), 2) could be matched by
18126 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
18127 // Bits [5:4] of the constant are the destination select. This is the
18128 // value of the incoming immediate.
18129 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
18130 // combine either bitwise AND or insert of float 0.0 to set these bits.
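// For example, inserting into lane 2 encodes the immediate as 2 << 4 == 0x20,
// i.e. destination select bits [5:4] = 2 with an all-zero zero mask.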
18131
18132 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
18133 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
18134 // If this is an insertion of 32-bits into the low 32-bits of
18135 // a vector, we prefer to generate a blend with immediate rather
18136 // than an insertps. Blends are simpler operations in hardware and so
18137 // will always have equal or better performance than insertps.
18138 // But if optimizing for size and there's a load folding opportunity,
18139 // generate insertps because blendps does not have a 32-bit memory
18140 // operand form.
18141 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18142 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
18143 DAG.getTargetConstant(1, dl, MVT::i8));
18144 }
18145     // Create this as a SCALAR_TO_VECTOR.
18146 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18147 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
18148 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
18149 }
18150
18151     // PINSR* works with a constant index.
18152 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18153 return Op;
18154 }
18155
18156 return SDValue();
18157}
18158
18159static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
18160 SelectionDAG &DAG) {
18161 SDLoc dl(Op);
18162 MVT OpVT = Op.getSimpleValueType();
18163
18164   // It's always cheaper to replace a xor+movd with xorps, and doing so
18165   // simplifies further combines.
18166 if (X86::isZeroNode(Op.getOperand(0)))
18167 return getZeroVector(OpVT, Subtarget, DAG, dl);
18168
18169 // If this is a 256-bit vector result, first insert into a 128-bit
18170 // vector and then insert into the 256-bit vector.
18171 if (!OpVT.is128BitVector()) {
18172 // Insert into a 128-bit vector.
18173 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
18174 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
18175 OpVT.getVectorNumElements() / SizeFactor);
18176
18177 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
18178
18179 // Insert the 128-bit vector.
18180 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
18181 }
18182   assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
18183          "Expected an SSE type!");
18184
18185 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
18186 if (OpVT == MVT::v4i32)
18187 return Op;
18188
18189 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
18190 return DAG.getBitcast(
18191 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
18192}
18193
18194// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
18195// simple superregister reference or explicit instructions to insert
18196// the upper bits of a vector.
18197static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18198 SelectionDAG &DAG) {
18199   assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
18200
18201 return insert1BitVector(Op, DAG, Subtarget);
18202}
18203
18204static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18205 SelectionDAG &DAG) {
18206   assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
18207          "Only vXi1 extract_subvectors need custom lowering");
18208
18209 SDLoc dl(Op);
18210 SDValue Vec = Op.getOperand(0);
18211 SDValue Idx = Op.getOperand(1);
18212
18213 if (!isa<ConstantSDNode>(Idx))
18214 return SDValue();
18215
18216 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
18217 if (IdxVal == 0) // the operation is legal
18218 return Op;
18219
18220 MVT VecVT = Vec.getSimpleValueType();
18221 unsigned NumElems = VecVT.getVectorNumElements();
18222
18223 // Extend to natively supported kshift.
18224 MVT WideVecVT = VecVT;
18225 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
18226 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18227 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
18228 DAG.getUNDEF(WideVecVT), Vec,
18229 DAG.getIntPtrConstant(0, dl));
18230 }
18231
18232 // Shift to the LSB.
18233 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
18234 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18235
18236 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
18237 DAG.getIntPtrConstant(0, dl));
18238}
18239
18240// Returns the appropriate wrapper opcode for a global reference.
18241unsigned X86TargetLowering::getGlobalWrapperKind(
18242 const GlobalValue *GV, const unsigned char OpFlags) const {
18243 // References to absolute symbols are never PC-relative.
18244 if (GV && GV->isAbsoluteSymbolRef())
18245 return X86ISD::Wrapper;
18246
18247 CodeModel::Model M = getTargetMachine().getCodeModel();
18248 if (Subtarget.isPICStyleRIPRel() &&
18249 (M == CodeModel::Small || M == CodeModel::Kernel))
18250 return X86ISD::WrapperRIP;
18251
18252 // GOTPCREL references must always use RIP.
18253 if (OpFlags == X86II::MO_GOTPCREL)
18254 return X86ISD::WrapperRIP;
18255
18256 return X86ISD::Wrapper;
18257}
18258
18259// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
18260// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
18261// one of the above-mentioned nodes. It has to be wrapped because otherwise
18262// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
18263// be used to form an addressing mode. These wrapped nodes will be selected
18264// into MOV32ri.
18265SDValue
18266X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
18267 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
18268
18269 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18270 // global base reg.
18271 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18272
18273 auto PtrVT = getPointerTy(DAG.getDataLayout());
18274 SDValue Result = DAG.getTargetConstantPool(
18275 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
18276 SDLoc DL(CP);
18277 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
18278 // With PIC, the address is actually $g + Offset.
18279 if (OpFlag) {
18280 Result =
18281 DAG.getNode(ISD::ADD, DL, PtrVT,
18282 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18283 }
18284
18285 return Result;
18286}
18287
18288SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
18289 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
18290
18291 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18292 // global base reg.
18293 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18294
18295 auto PtrVT = getPointerTy(DAG.getDataLayout());
18296 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18297 SDLoc DL(JT);
18298 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
18299
18300 // With PIC, the address is actually $g + Offset.
18301 if (OpFlag)
18302 Result =
18303 DAG.getNode(ISD::ADD, DL, PtrVT,
18304 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18305
18306 return Result;
18307}
18308
18309SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
18310 SelectionDAG &DAG) const {
18311 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18312}
18313
18314SDValue
18315X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
18316   // Create the TargetBlockAddress node.
18317 unsigned char OpFlags =
18318 Subtarget.classifyBlockAddressReference();
18319 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18320 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18321 SDLoc dl(Op);
18322 auto PtrVT = getPointerTy(DAG.getDataLayout());
18323 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
18324 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
18325
18326 // With PIC, the address is actually $g + Offset.
18327 if (isGlobalRelativeToPICBase(OpFlags)) {
18328 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18329 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18330 }
18331
18332 return Result;
18333}
18334
18335/// Creates target global address or external symbol nodes for calls or
18336/// other uses.
18337SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
18338 bool ForCall) const {
18339 // Unpack the global address or external symbol.
18340 const SDLoc &dl = SDLoc(Op);
18341 const GlobalValue *GV = nullptr;
18342 int64_t Offset = 0;
18343 const char *ExternalSym = nullptr;
18344 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
18345 GV = G->getGlobal();
18346 Offset = G->getOffset();
18347 } else {
18348 const auto *ES = cast<ExternalSymbolSDNode>(Op);
18349 ExternalSym = ES->getSymbol();
18350 }
18351
18352 // Calculate some flags for address lowering.
18353 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
18354 unsigned char OpFlags;
18355 if (ForCall)
18356 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
18357 else
18358 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
18359 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
18360 bool NeedsLoad = isGlobalStubReference(OpFlags);
18361
18362 CodeModel::Model M = DAG.getTarget().getCodeModel();
18363 auto PtrVT = getPointerTy(DAG.getDataLayout());
18364 SDValue Result;
18365
18366 if (GV) {
18367 // Create a target global address if this is a global. If possible, fold the
18368 // offset into the global address reference. Otherwise, ADD it on later.
18369 int64_t GlobalOffset = 0;
18370 if (OpFlags == X86II::MO_NO_FLAG &&
18371 X86::isOffsetSuitableForCodeModel(Offset, M)) {
18372 std::swap(GlobalOffset, Offset);
18373 }
18374 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
18375 } else {
18376 // If this is not a global address, this must be an external symbol.
18377 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
18378 }
18379
18380 // If this is a direct call, avoid the wrapper if we don't need to do any
18381 // loads or adds. This allows SDAG ISel to match direct calls.
18382 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
18383 return Result;
18384
18385 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
18386
18387 // With PIC, the address is actually $g + Offset.
18388 if (HasPICReg) {
18389 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18390 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18391 }
18392
18393 // For globals that require a load from a stub to get the address, emit the
18394 // load.
18395 if (NeedsLoad)
18396 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
18397 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18398
18399 // If there was a non-zero offset that we didn't fold, create an explicit
18400 // addition for it.
18401 if (Offset != 0)
18402 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
18403 DAG.getConstant(Offset, dl, PtrVT));
18404
18405 return Result;
18406}
18407
18408SDValue
18409X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
18410 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18411}
18412
18413static SDValue
18414GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
18415 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
18416 unsigned char OperandFlags, bool LocalDynamic = false) {
18417 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18418 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18419 SDLoc dl(GA);
18420 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18421 GA->getValueType(0),
18422 GA->getOffset(),
18423 OperandFlags);
18424
18425 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
18426 : X86ISD::TLSADDR;
18427
18428 if (InFlag) {
18429 SDValue Ops[] = { Chain, TGA, *InFlag };
18430 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18431 } else {
18432 SDValue Ops[] = { Chain, TGA };
18433 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18434 }
18435
18436   // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
18437 MFI.setAdjustsStack(true);
18438 MFI.setHasCalls(true);
18439
18440 SDValue Flag = Chain.getValue(1);
18441 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
18442}
18443
18444// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
18445static SDValue
18446LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18447 const EVT PtrVT) {
18448 SDValue InFlag;
18449 SDLoc dl(GA); // ? function entry point might be better
18450 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18451 DAG.getNode(X86ISD::GlobalBaseReg,
18452 SDLoc(), PtrVT), InFlag);
18453 InFlag = Chain.getValue(1);
18454
18455 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
18456}
18457
18458// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
18459static SDValue
18460LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18461 const EVT PtrVT) {
18462 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
18463 X86::RAX, X86II::MO_TLSGD);
18464}
18465
18466static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
18467 SelectionDAG &DAG,
18468 const EVT PtrVT,
18469 bool is64Bit) {
18470 SDLoc dl(GA);
18471
18472 // Get the start address of the TLS block for this module.
18473 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
18474 .getInfo<X86MachineFunctionInfo>();
18475 MFI->incNumLocalDynamicTLSAccesses();
18476
18477 SDValue Base;
18478 if (is64Bit) {
18479 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
18480 X86II::MO_TLSLD, /*LocalDynamic=*/true);
18481 } else {
18482 SDValue InFlag;
18483 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18484 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
18485 InFlag = Chain.getValue(1);
18486 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
18487 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
18488 }
18489
18490 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
18491 // of Base.
18492
18493 // Build x@dtpoff.
18494 unsigned char OperandFlags = X86II::MO_DTPOFF;
18495 unsigned WrapperKind = X86ISD::Wrapper;
18496 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18497 GA->getValueType(0),
18498 GA->getOffset(), OperandFlags);
18499 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18500
18501 // Add x@dtpoff with the base.
18502 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
18503}
18504
18505// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
18506static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18507 const EVT PtrVT, TLSModel::Model model,
18508 bool is64Bit, bool isPIC) {
18509 SDLoc dl(GA);
18510
18511 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
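// Address space 256 maps to %gs and 257 to %fs on x86, so a load from the
// null pointer in that address space reads the thread pointer at %gs:0/%fs:0.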
18512 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
18513 is64Bit ? 257 : 256));
18514
18515 SDValue ThreadPointer =
18516 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
18517 MachinePointerInfo(Ptr));
18518
18519 unsigned char OperandFlags = 0;
18520 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
18521 // initialexec.
18522 unsigned WrapperKind = X86ISD::Wrapper;
18523 if (model == TLSModel::LocalExec) {
18524 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
18525 } else if (model == TLSModel::InitialExec) {
18526 if (is64Bit) {
18527 OperandFlags = X86II::MO_GOTTPOFF;
18528 WrapperKind = X86ISD::WrapperRIP;
18529 } else {
18530 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
18531 }
18532 } else {
18533 llvm_unreachable("Unexpected model")::llvm::llvm_unreachable_internal("Unexpected model", "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18533)
;
18534 }
18535
18536 // emit "addl x@ntpoff,%eax" (local exec)
18537 // or "addl x@indntpoff,%eax" (initial exec)
18538 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
18539 SDValue TGA =
18540 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18541 GA->getOffset(), OperandFlags);
18542 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18543
18544 if (model == TLSModel::InitialExec) {
18545 if (isPIC && !is64Bit) {
18546 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
18547 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18548 Offset);
18549 }
18550
18551 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
18552 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18553 }
18554
18555 // The address of the thread local variable is the add of the thread
18556 // pointer with the offset of the variable.
18557 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
18558}
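
For readers following the exec-model lowering above, a minimal C++ sketch (not part of this file) of the address computation it produces; ReadThreadPointer() is a stand-in for the %gs:0 / %fs:0 load, and Tpoff for the @tpoff/@gottpoff offset that OperandFlags selects.

    #include <cstdint>

    extern uint8_t *ReadThreadPointer();   // models the %gs:0 (32-bit) / %fs:0 (64-bit) load

    uint8_t *TLSExecAddress(int64_t Tpoff) {
      return ReadThreadPointer() + Tpoff;  // the final ISD::ADD of ThreadPointer and Offset
    }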
18559
18560SDValue
18561X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
18562
18563 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
18564
18565 if (DAG.getTarget().useEmulatedTLS())
18566 return LowerToTLSEmulatedModel(GA, DAG);
18567
18568 const GlobalValue *GV = GA->getGlobal();
18569 auto PtrVT = getPointerTy(DAG.getDataLayout());
18570 bool PositionIndependent = isPositionIndependent();
18571
18572 if (Subtarget.isTargetELF()) {
18573 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
18574 switch (model) {
18575 case TLSModel::GeneralDynamic:
18576 if (Subtarget.is64Bit())
18577 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
18578 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
18579 case TLSModel::LocalDynamic:
18580 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
18581 Subtarget.is64Bit());
18582 case TLSModel::InitialExec:
18583 case TLSModel::LocalExec:
18584 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
18585 PositionIndependent);
18586 }
18587 llvm_unreachable("Unknown TLS model.")::llvm::llvm_unreachable_internal("Unknown TLS model.", "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18587)
;
18588 }
18589
18590 if (Subtarget.isTargetDarwin()) {
18591 // Darwin only has one model of TLS. Lower to that.
18592 unsigned char OpFlag = 0;
18593 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
18594 X86ISD::WrapperRIP : X86ISD::Wrapper;
18595
18596 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18597 // global base reg.
18598 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
18599 if (PIC32)
18600 OpFlag = X86II::MO_TLVP_PIC_BASE;
18601 else
18602 OpFlag = X86II::MO_TLVP;
18603 SDLoc DL(Op);
18604 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
18605 GA->getValueType(0),
18606 GA->getOffset(), OpFlag);
18607 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
18608
18609 // With PIC32, the address is actually $g + Offset.
18610 if (PIC32)
18611 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
18612 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18613 Offset);
18614
18615 // Lowering the machine ISD node will make sure everything is in the right
18616 // location.
18617 SDValue Chain = DAG.getEntryNode();
18618 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18619 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
18620 SDValue Args[] = { Chain, Offset };
18621 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
18622 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
18623 DAG.getIntPtrConstant(0, DL, true),
18624 Chain.getValue(1), DL);
18625
18626 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
18627 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18628 MFI.setAdjustsStack(true);
18629
18630 // And our return value (tls address) is in the standard call return value
18631 // location.
18632 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
18633 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
18634 }
18635
18636 if (Subtarget.isOSWindows()) {
18637 // Just use the implicit TLS architecture
18638 // Need to generate something similar to:
18639 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
18640 // ; from TEB
18641 // mov ecx, dword [rel _tls_index] ; Load index (from C runtime)
18642 // mov rcx, qword [rdx+rcx*8]
18643 // mov eax, .tls$:tlsvar
18644 // [rax+rcx] contains the address
18645 // Windows 64bit: gs:0x58
18646 // Windows 32bit: fs:__tls_array
18647
18648 SDLoc dl(GA);
18649 SDValue Chain = DAG.getEntryNode();
18650
18651 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
18652 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
18653 // use its literal value of 0x2C.
18654 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
18655 ? Type::getInt8PtrTy(*DAG.getContext(),
18656 256)
18657 : Type::getInt32PtrTy(*DAG.getContext(),
18658 257));
18659
18660 SDValue TlsArray = Subtarget.is64Bit()
18661 ? DAG.getIntPtrConstant(0x58, dl)
18662 : (Subtarget.isTargetWindowsGNU()
18663 ? DAG.getIntPtrConstant(0x2C, dl)
18664 : DAG.getExternalSymbol("_tls_array", PtrVT));
18665
18666 SDValue ThreadPointer =
18667 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
18668
18669 SDValue res;
18670 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
18671 res = ThreadPointer;
18672 } else {
18673 // Load the _tls_index variable
18674 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
18675 if (Subtarget.is64Bit())
18676 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
18677 MachinePointerInfo(), MVT::i32);
18678 else
18679 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
18680
18681 auto &DL = DAG.getDataLayout();
18682 SDValue Scale =
18683 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
18684 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
18685
18686 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
18687 }
18688
18689 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
18690
18691 // Get the offset of start of .tls section
18692 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18693 GA->getValueType(0),
18694 GA->getOffset(), X86II::MO_SECREL);
18695 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
18696
18697 // The address of the thread local variable is the add of the thread
18698 // pointer with the offset of the variable.
18699 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
18700 }
18701
18702 llvm_unreachable("TLS not implemented for this target.")::llvm::llvm_unreachable_internal("TLS not implemented for this target."
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18702)
;
18703}
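
To make the Windows implicit-TLS branch above easier to follow, here is a minimal x86-64 sketch of the same address walk in plain C++ (illustration only; TEB, TlsIndex and SecrelOffset are stand-ins for the values the emitted code loads from gs:0x58, _tls_index and the @SECREL relocation).

    #include <cstdint>

    uint8_t *Win64TLSAddress(uint8_t *TEB, uint32_t TlsIndex, uint64_t SecrelOffset) {
      // mov rdx, qword [gs:0x58]   ; ThreadLocalStoragePointer in the TEB
      uint8_t **TlsArray = *reinterpret_cast<uint8_t ***>(TEB + 0x58);
      // mov rcx, qword [rdx+rcx*8] ; this module's TLS block (the SHL by
      //                            ; Log2(pointer size) plus ADD above)
      uint8_t *TlsBlock = TlsArray[TlsIndex];
      // add the variable's offset within the .tls section (X86II::MO_SECREL)
      return TlsBlock + SecrelOffset;
    }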
18704
18705/// Lower SRA_PARTS and friends, which return two i32 values
18706/// and take a 2 x i32 value to shift plus a shift amount.
18707/// TODO: Can this be moved to general expansion code?
18708static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
18709 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
18710 MVT VT = Op.getSimpleValueType();
18711 unsigned VTBits = VT.getSizeInBits();
18712 SDLoc dl(Op);
18713 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
18714 SDValue ShOpLo = Op.getOperand(0);
18715 SDValue ShOpHi = Op.getOperand(1);
18716 SDValue ShAmt = Op.getOperand(2);
18717 // ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and
18718 // ISD::SRA/L nodes haven't. Insert an AND to be safe, it's optimized away
18719 // during isel.
18720 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
18721 DAG.getConstant(VTBits - 1, dl, MVT::i8));
18722 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
18723 DAG.getConstant(VTBits - 1, dl, MVT::i8))
18724 : DAG.getConstant(0, dl, VT);
18725
18726 SDValue Tmp2, Tmp3;
18727 if (Op.getOpcode() == ISD::SHL_PARTS) {
18728 Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt);
18729 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
18730 } else {
18731 Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt);
18732 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
18733 }
18734
18735 // If the shift amount is larger or equal than the width of a part we can't
18736 // rely on the results of shld/shrd. Insert a test and select the appropriate
18737 // values for large shift amounts.
18738 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
18739 DAG.getConstant(VTBits, dl, MVT::i8));
18740 SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
18741 DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);
18742
18743 SDValue Hi, Lo;
18744 if (Op.getOpcode() == ISD::SHL_PARTS) {
18745 Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
18746 Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
18747 } else {
18748 Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
18749 Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
18750 }
18751
18752 return DAG.getMergeValues({ Lo, Hi }, dl);
18753}
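
As a scalar model of the SHL_PARTS case above (a sketch assuming 32-bit parts, not code from this file): the funnel shift covers amounts below the part width, and the final select swaps in the other values when the amount is 32 or more.

    #include <cstdint>

    void ShiftLeftParts(uint32_t Lo, uint32_t Hi, unsigned Amt,
                        uint32_t &OutLo, uint32_t &OutHi) {
      unsigned Safe = Amt & 31;                                      // the AND on ShAmt
      uint32_t Tmp2 = (Hi << Safe) | (Safe ? Lo >> (32 - Safe) : 0); // ISD::FSHL
      uint32_t Tmp3 = Lo << Safe;                                    // ISD::SHL
      if (Amt & 32) {        // shift amount >= part width (the SETNE + SELECT)
        OutHi = Tmp3;
        OutLo = 0;           // Tmp1 for the non-SRA case
      } else {
        OutHi = Tmp2;
        OutLo = Tmp3;
      }
    }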
18754
18755static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
18756 SelectionDAG &DAG) {
18757 MVT VT = Op.getSimpleValueType();
18758 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
18759 "Unexpected funnel shift opcode!");
18760
18761 SDLoc DL(Op);
18762 SDValue Op0 = Op.getOperand(0);
18763 SDValue Op1 = Op.getOperand(1);
18764 SDValue Amt = Op.getOperand(2);
18765
18766 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
18767
18768 if (VT.isVector()) {
18769 assert(Subtarget.hasVBMI2() && "Expected VBMI2");
18770
18771 if (IsFSHR)
18772 std::swap(Op0, Op1);
18773
18774 APInt APIntShiftAmt;
18775 if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
18776 uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
18777 return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0,
18778 Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18779 }
18780
18781 return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
18782 Op0, Op1, Amt);
18783 }
18784
18785 assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
18786 "Unexpected funnel shift type!");
18787
18788 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
18789 bool OptForSize = DAG.shouldOptForSize();
18790 if (!OptForSize && Subtarget.isSHLDSlow())
18791 return SDValue();
18792
18793 if (IsFSHR)
18794 std::swap(Op0, Op1);
18795
18796 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
18797 if (VT == MVT::i16)
18798 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
18799 DAG.getConstant(15, DL, Amt.getValueType()));
18800
18801 unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD);
18802 return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt);
18803}
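
For reference, a scalar model of the i16 case handled above, including the explicit modulo that i16 needs because only the 32/64-bit SHLD/SHRD forms reduce the amount implicitly (a sketch, not the production path).

    #include <cstdint>

    uint16_t FunnelShiftLeft16(uint16_t Hi, uint16_t Lo, unsigned Amt) {
      Amt &= 15;                                    // the AND with 15 above
      if (Amt == 0)
        return Hi;
      uint32_t Concat = (uint32_t(Hi) << 16) | Lo;  // the pair SHLD reads
      return uint16_t(Concat >> (16 - Amt));        // == (Hi << Amt) | (Lo >> (16 - Amt))
    }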
18804
18805// Try to use a packed vector operation to handle i64 on 32-bit targets when
18806// AVX512DQ is enabled.
18807static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
18808 const X86Subtarget &Subtarget) {
18809 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
18810 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
18811 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
18812 Op.getOpcode() == ISD::UINT_TO_FP) &&
18813 "Unexpected opcode!");
18814 bool IsStrict = Op->isStrictFPOpcode();
18815 unsigned OpNo = IsStrict ? 1 : 0;
18816 SDValue Src = Op.getOperand(OpNo);
18817 MVT SrcVT = Src.getSimpleValueType();
18818 MVT VT = Op.getSimpleValueType();
18819
18820 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
18821 (VT != MVT::f32 && VT != MVT::f64))
18822 return SDValue();
18823
18824 // Pack the i64 into a vector, do the operation and extract.
18825
18826 // Using 256-bit to ensure result is 128-bits for f32 case.
18827 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
18828 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
18829 MVT VecVT = MVT::getVectorVT(VT, NumElts);
18830
18831 SDLoc dl(Op);
18832 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
18833 if (IsStrict) {
18834 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
18835 {Op.getOperand(0), InVec});
18836 SDValue Chain = CvtVec.getValue(1);
18837 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
18838 DAG.getIntPtrConstant(0, dl));
18839 return DAG.getMergeValues({Value, Chain}, dl);
18840 }
18841
18842 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
18843
18844 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
18845 DAG.getIntPtrConstant(0, dl));
18846}
18847
18848static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
18849 const X86Subtarget &Subtarget) {
18850 switch (Opcode) {
18851 case ISD::SINT_TO_FP:
18852 // TODO: Handle wider types with AVX/AVX512.
18853 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
18854 return false;
18855 // CVTDQ2PS or (V)CVTDQ2PD
18856 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
18857
18858 case ISD::UINT_TO_FP:
18859 // TODO: Handle wider types and i64 elements.
18860 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
18861 return false;
18862 // VCVTUDQ2PS or VCVTUDQ2PD
18863 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
18864
18865 default:
18866 return false;
18867 }
18868}
18869
18870/// Given a scalar cast operation that is extracted from a vector, try to
18871/// vectorize the cast op followed by extraction. This will avoid an expensive
18872/// round-trip between XMM and GPR.
18873static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
18874 const X86Subtarget &Subtarget) {
18875 // TODO: This could be enhanced to handle smaller integer types by peeking
18876 // through an extend.
18877 SDValue Extract = Cast.getOperand(0);
18878 MVT DestVT = Cast.getSimpleValueType();
18879 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
18880 !isa<ConstantSDNode>(Extract.getOperand(1)))
18881 return SDValue();
18882
18883 // See if we have a 128-bit vector cast op for this type of cast.
18884 SDValue VecOp = Extract.getOperand(0);
18885 MVT FromVT = VecOp.getSimpleValueType();
18886 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
18887 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
18888 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
18889 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
18890 return SDValue();
18891
18892 // If we are extracting from a non-zero element, first shuffle the source
18893 // vector to allow extracting from element zero.
18894 SDLoc DL(Cast);
18895 if (!isNullConstant(Extract.getOperand(1))) {
18896 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
18897 Mask[0] = Extract.getConstantOperandVal(1);
18898 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
18899 }
18900 // If the source vector is wider than 128-bits, extract the low part. Do not
18901 // create an unnecessarily wide vector cast op.
18902 if (FromVT != Vec128VT)
18903 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
18904
18905 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
18906 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
18907 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
18908 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
18909 DAG.getIntPtrConstant(0, DL));
18910}
18911
18912static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
18913 const X86Subtarget &Subtarget) {
18914 SDLoc DL(Op);
18915 bool IsStrict = Op->isStrictFPOpcode();
18916 MVT VT = Op->getSimpleValueType(0);
18917 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
18918
18919 if (Subtarget.hasDQI()) {
18920 assert(!Subtarget.hasVLX() && "Unexpected features");
18921
18922 assert((Src.getSimpleValueType() == MVT::v2i64 ||
18923 Src.getSimpleValueType() == MVT::v4i64) &&
18924 "Unsupported custom type");
18925
18926 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
18927 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
18928 "Unexpected VT!");
18929 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
18930
18931 // Need to concat with zero vector for strict fp to avoid spurious
18932 // exceptions.
18933 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
18934 : DAG.getUNDEF(MVT::v8i64);
18935 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
18936 DAG.getIntPtrConstant(0, DL));
18937 SDValue Res, Chain;
18938 if (IsStrict) {
18939 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
18940 {Op->getOperand(0), Src});
18941 Chain = Res.getValue(1);
18942 } else {
18943 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
18944 }
18945
18946 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18947 DAG.getIntPtrConstant(0, DL));
18948
18949 if (IsStrict)
18950 return DAG.getMergeValues({Res, Chain}, DL);
18951 return Res;
18952 }
18953
18954 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
18955 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
18956 if (VT != MVT::v4f32 || IsSigned)
18957 return SDValue();
18958
18959 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
18960 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
18961 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
18962 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
18963 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
18964 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
18965 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
18966 SmallVector<SDValue, 4> SignCvts(4);
18967 SmallVector<SDValue, 4> Chains(4);
18968 for (int i = 0; i != 4; ++i) {
18969 SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
18970 DAG.getIntPtrConstant(i, DL));
18971 if (IsStrict) {
18972 SignCvts[i] =
18973 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
18974 {Op.getOperand(0), Src});
18975 Chains[i] = SignCvts[i].getValue(1);
18976 } else {
18977 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Src);
18978 }
18979 }
18980 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
18981
18982 SDValue Slow, Chain;
18983 if (IsStrict) {
18984 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18985 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
18986 {Chain, SignCvt, SignCvt});
18987 Chain = Slow.getValue(1);
18988 } else {
18989 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
18990 }
18991
18992 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
18993 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
18994
18995 if (IsStrict)
18996 return DAG.getMergeValues({Cvt, Chain}, DL);
18997
18998 return Cvt;
18999}
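
The unsigned v4i64-to-v4f32 fallback above uses the usual halve/convert/double trick; a scalar sketch of one lane (illustration only, not this file's code):

    #include <cstdint>

    float U64LaneToF32(uint64_t X) {
      if (int64_t(X) >= 0)                 // sign bit clear: plain signed convert
        return float(int64_t(X));
      uint64_t Half = (X >> 1) | (X & 1);  // keep bit 0 so the rounding is unchanged
      float F = float(int64_t(Half));      // the per-lane SINT_TO_FP above
      return F + F;                        // the "Slow" FADD, selected when IsNeg
    }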
19000
19001SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
19002 SelectionDAG &DAG) const {
19003 bool IsStrict = Op->isStrictFPOpcode();
19004 unsigned OpNo = IsStrict ? 1 : 0;
19005 SDValue Src = Op.getOperand(OpNo);
19006 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19007 MVT SrcVT = Src.getSimpleValueType();
19008 MVT VT = Op.getSimpleValueType();
19009 SDLoc dl(Op);
19010
19011 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
19012 return Extract;
19013
19014 if (SrcVT.isVector()) {
19015 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
19016 // Note: Since v2f64 is a legal type, we don't need to zero extend the
19017 // source for strict FP.
19018 if (IsStrict)
19019 return DAG.getNode(
19020 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
19021 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19022 DAG.getUNDEF(SrcVT))});
19023 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
19024 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19025 DAG.getUNDEF(SrcVT)));
19026 }
19027 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19028 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
19029
19030 return SDValue();
19031 }
19032
19033 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19034 "Unknown SINT_TO_FP to lower!");
19035
19036 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
19037
19038 // These are really Legal; return the operand so the caller accepts it as
19039 // Legal.
19040 if (SrcVT == MVT::i32 && UseSSEReg)
19041 return Op;
19042 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
19043 return Op;
19044
19045 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
19046 return V;
19047
19048 // SSE doesn't have an i16 conversion so we need to promote.
19049 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
19050 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
19051 if (IsStrict)
19052 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
19053 {Chain, Ext});
19054
19055 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
19056 }
19057
19058 if (VT == MVT::f128)
19059 return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
19060
19061 SDValue ValueToStore = Src;
19062 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
19063 // Bitcasting to f64 here allows us to do a single 64-bit store from
19064 // an SSE register, avoiding the store forwarding penalty that would come
19065 // with two 32-bit stores.
19066 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19067
19068 unsigned Size = SrcVT.getSizeInBits()/8;
19069 MachineFunction &MF = DAG.getMachineFunction();
19070 auto PtrVT = getPointerTy(MF.getDataLayout());
19071 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
19072 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19073 Chain = DAG.getStore(
19074 Chain, dl, ValueToStore, StackSlot,
19075 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
19076 std::pair<SDValue, SDValue> Tmp = BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
19077
19078 if (IsStrict)
19079 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19080
19081 return Tmp.first;
19082}
19083
19084std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
19085 SDValue StackSlot,
19086 SelectionDAG &DAG) const {
19087 // Build the FILD
19088 SDLoc DL(Op);
19089 SDVTList Tys;
19090 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
19091 if (useSSE)
19092 Tys = DAG.getVTList(MVT::f80, MVT::Other);
19093 else
19094 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
19095
19096 unsigned ByteSize = SrcVT.getSizeInBits() / 8;
19097
19098 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
19099 MachineMemOperand *LoadMMO;
19100 if (FI) {
19101 int SSFI = FI->getIndex();
19102 LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
19103 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
19104 MachineMemOperand::MOLoad, ByteSize, ByteSize);
19105 } else {
19106 LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
19107 StackSlot = StackSlot.getOperand(1);
19108 }
19109 SDValue FILDOps[] = {Chain, StackSlot};
19110 SDValue Result =
19111 DAG.getMemIntrinsicNode(X86ISD::FILD, DL,
19112 Tys, FILDOps, SrcVT, LoadMMO);
19113 Chain = Result.getValue(1);
19114
19115 if (useSSE) {
19116 MachineFunction &MF = DAG.getMachineFunction();
19117 unsigned SSFISize = Op.getValueSizeInBits() / 8;
19118 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
19119 auto PtrVT = getPointerTy(MF.getDataLayout());
19120 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19121 Tys = DAG.getVTList(MVT::Other);
19122 SDValue FSTOps[] = {Chain, Result, StackSlot};
19123 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
19124 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
19125 MachineMemOperand::MOStore, SSFISize, SSFISize);
19126
19127 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps,
19128 Op.getValueType(), StoreMMO);
19129 Result = DAG.getLoad(
19130 Op.getValueType(), DL, Chain, StackSlot,
19131 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
19132 Chain = Result.getValue(1);
19133 }
19134
19135 return { Result, Chain };
19136}
19137
19138/// Horizontal vector math instructions may be slower than normal math with
19139/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19140/// implementation, and likely shuffle complexity of the alternate sequence.
19141static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
19142 const X86Subtarget &Subtarget) {
19143 bool IsOptimizingSize = DAG.shouldOptForSize();
19144 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
19145 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
19146}
19147
19148/// 64-bit unsigned integer to double expansion.
19149static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
19150 const X86Subtarget &Subtarget) {
19151 // This algorithm is not obvious. Here is what we're trying to output:
19152 /*
19153 movq %rax, %xmm0
19154 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
19155 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
19156 #ifdef __SSE3__
19157 haddpd %xmm0, %xmm0
19158 #else
19159 pshufd $0x4e, %xmm0, %xmm1
19160 addpd %xmm1, %xmm0
19161 #endif
19162 */
19163
19164 bool IsStrict = Op->isStrictFPOpcode();
19165 unsigned OpNo = IsStrict ? 1 : 0;
19166 SDLoc dl(Op);
19167 LLVMContext *Context = DAG.getContext();
19168
19169 // Build some magic constants.
19170 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
19171 Constant *C0 = ConstantDataVector::get(*Context, CV0);
19172 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19173 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
19174
19175 SmallVector<Constant*,2> CV1;
19176 CV1.push_back(
19177 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19178 APInt(64, 0x4330000000000000ULL))));
19179 CV1.push_back(
19180 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19181 APInt(64, 0x4530000000000000ULL))));
19182 Constant *C1 = ConstantVector::get(CV1);
19183 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
19184
19185 // Load the 64-bit value into an XMM register.
19186 SDValue XR1 =
19187 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo));
19188 SDValue CLod0 =
19189 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
19190 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
19191 /* Alignment = */ 16);
19192 SDValue Unpck1 =
19193 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
19194
19195 SDValue CLod1 =
19196 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
19197 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
19198 /* Alignment = */ 16);
19199 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
19200 SDValue Sub;
19201 SDValue Chain;
19202 // TODO: Are there any fast-math-flags to propagate here?
19203 if (IsStrict) {
19204 Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
19205 {Op.getOperand(0), XR2F, CLod1});
19206 Chain = Sub.getValue(1);
19207 } else
19208 Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
19209 SDValue Result;
19210
19211 if (!IsStrict && Subtarget.hasSSE3() &&
19212 shouldUseHorizontalOp(true, DAG, Subtarget)) {
19213 // FIXME: Do we need a STRICT version of FHADD?
19214 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
19215 } else {
19216 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
19217 if (IsStrict) {
19218 Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other},
19219 {Chain, Shuffle, Sub});
19220 Chain = Result.getValue(1);
19221 } else
19222 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
19223 }
19224 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
19225 DAG.getIntPtrConstant(0, dl));
19226 if (IsStrict)
19227 return DAG.getMergeValues({Result, Chain}, dl);
19228
19229 return Result;
19230}
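
A scalar C++ model of the punpckldq/subpd/haddpd sequence documented in the comment above may help (sketch only; std::memcpy stands in for the bitcasts).

    #include <cstdint>
    #include <cstring>

    double U64ToDouble(uint64_t X) {
      // Glue the exponent words from c0 onto the two 32-bit halves.
      uint64_t LoBits = (uint64_t(0x43300000) << 32) | (X & 0xffffffffu);
      uint64_t HiBits = (uint64_t(0x45300000) << 32) | (X >> 32);
      double Lo, Hi;
      std::memcpy(&Lo, &LoBits, sizeof(Lo));  // == 2^52 + low half
      std::memcpy(&Hi, &HiBits, sizeof(Hi));  // == 2^84 + high half * 2^32
      Lo -= 0x1.0p52;                         // subpd with c1[0]
      Hi -= 0x1.0p84;                         // subpd with c1[1] (0x1.0p52 * 0x1.0p32)
      return Lo + Hi;                         // haddpd / addpd
    }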
19231
19232/// 32-bit unsigned integer to float expansion.
19233static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
19234 const X86Subtarget &Subtarget) {
19235 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19236 SDLoc dl(Op);
19237 // FP constant to bias correct the final result.
19238 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
19239 MVT::f64);
19240
19241 // Load the 32-bit value into an XMM register.
19242 SDValue Load =
19243 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
19244
19245 // Zero out the upper parts of the register.
19246 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
19247
19248 // Or the load with the bias.
19249 SDValue Or = DAG.getNode(
19250 ISD::OR, dl, MVT::v2i64,
19251 DAG.getBitcast(MVT::v2i64, Load),
19252 DAG.getBitcast(MVT::v2i64,
19253 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
19254 Or =
19255 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
19256 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
19257
19258 if (Op.getNode()->isStrictFPOpcode()) {
19259 // Subtract the bias.
19260 // TODO: Are there any fast-math-flags to propagate here?
19261 SDValue Chain = Op.getOperand(0);
19262 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
19263 {Chain, Or, Bias});
19264
19265 if (Op.getValueType() == Sub.getValueType())
19266 return Sub;
19267
19268 // Handle final rounding.
19269 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
19270 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
19271
19272 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
19273 }
19274
19275 // Subtract the bias.
19276 // TODO: Are there any fast-math-flags to propagate here?
19277 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
19278
19279 // Handle final rounding.
19280 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
19281}
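
The 32-bit variant relies on the same 2^52 bias; a minimal scalar sketch (illustration only):

    #include <cstdint>
    #include <cstring>

    double U32ToDouble(uint32_t X) {
      uint64_t Bits = 0x4330000000000000ULL | X;  // OR the value into 2^52's mantissa
      double D;
      std::memcpy(&D, &Bits, sizeof(D));          // the v2i64 OR + bitcast above
      return D - 0x1.0p52;                        // FSUB of the Bias constant
    }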
19282
19283static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
19284 const X86Subtarget &Subtarget,
19285 const SDLoc &DL) {
19286 if (Op.getSimpleValueType() != MVT::v2f64)
19287 return SDValue();
19288
19289 bool IsStrict = Op->isStrictFPOpcode();
19290
19291 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
19292 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
19293
19294 if (Subtarget.hasAVX512()) {
19295 if (!Subtarget.hasVLX()) {
19296 // Let generic type legalization widen this.
19297 if (!IsStrict)
19298 return SDValue();
19299 // Otherwise pad the integer input with 0s and widen the operation.
19300 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19301 DAG.getConstant(0, DL, MVT::v2i32));
19302 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
19303 {Op.getOperand(0), N0});
19304 SDValue Chain = Res.getValue(1);
19305 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
19306 DAG.getIntPtrConstant(0, DL));
19307 return DAG.getMergeValues({Res, Chain}, DL);
19308 }
19309
19310 // Legalize to v4i32 type.
19311 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19312 DAG.getUNDEF(MVT::v2i32));
19313 if (IsStrict)
19314 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
19315 {Op.getOperand(0), N0});
19316 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
19317 }
19318
19319 // Zero extend to 2i64, OR with the floating point representation of 2^52.
19320 // This gives us the floating point equivalent of 2^52 + the i32 integer
19321 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
19322 // point leaving just our i32 integers in double format.
19323 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
19324 SDValue VBias =
19325 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
19326 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
19327 DAG.getBitcast(MVT::v2i64, VBias));
19328 Or = DAG.getBitcast(MVT::v2f64, Or);
19329
19330 if (IsStrict)
19331 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
19332 {Op.getOperand(0), Or, VBias});
19333 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
19334}
19335
19336static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
19337 const X86Subtarget &Subtarget) {
19338 SDLoc DL(Op);
19339 bool IsStrict = Op->isStrictFPOpcode();
19340 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
19341 MVT VecIntVT = V.getSimpleValueType();
19342 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
19343 "Unsupported custom type");
19344
19345 if (Subtarget.hasAVX512()) {
19346 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
19347 assert(!Subtarget.hasVLX() && "Unexpected features");
19348 MVT VT = Op->getSimpleValueType(0);
19349
19350 // v8i32->v8f64 is legal with AVX512 so just return it.
19351 if (VT == MVT::v8f64)
19352 return Op;
19353
19354 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
19355 "Unexpected VT!");
19356 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
19357 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
19358 // Need to concat with zero vector for strict fp to avoid spurious
19359 // exceptions.
19360 SDValue Tmp =
19361 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
19362 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
19363 DAG.getIntPtrConstant(0, DL));
19364 SDValue Res, Chain;
19365 if (IsStrict) {
19366 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
19367 {Op->getOperand(0), V});
19368 Chain = Res.getValue(1);
19369 } else {
19370 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
19371 }
19372
19373 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19374 DAG.getIntPtrConstant(0, DL));
19375
19376 if (IsStrict)
19377 return DAG.getMergeValues({Res, Chain}, DL);
19378 return Res;
19379 }
19380
19381 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
19382 Op->getSimpleValueType(0) == MVT::v4f64) {
19383 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
19384 Constant *Bias = ConstantFP::get(
19385 *DAG.getContext(),
19386 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
19387 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19388 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, /*Alignment*/ 8);
19389 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
19390 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
19391 SDValue VBias = DAG.getMemIntrinsicNode(
19392 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
19393 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
19394 /*Alignment*/ 8, MachineMemOperand::MOLoad);
19395
19396 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
19397 DAG.getBitcast(MVT::v4i64, VBias));
19398 Or = DAG.getBitcast(MVT::v4f64, Or);
19399
19400 if (IsStrict)
19401 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
19402 {Op.getOperand(0), Or, VBias});
19403 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
19404 }
19405
19406 // The algorithm is the following:
19407 // #ifdef __SSE4_1__
19408 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19409 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19410 // (uint4) 0x53000000, 0xaa);
19411 // #else
19412 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19413 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19414 // #endif
19415 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19416 // return (float4) lo + fhi;
19417
19418 bool Is128 = VecIntVT == MVT::v4i32;
19419 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
19420 // If we convert to something else than the supported type, e.g., to v4f64,
19421 // abort early.
19422 if (VecFloatVT != Op->getSimpleValueType(0))
19423 return SDValue();
19424
19425 // In the #ifdef/#else code, we have in common:
19426 // - The vector of constants:
19427 // -- 0x4b000000
19428 // -- 0x53000000
19429 // - A shift:
19430 // -- v >> 16
19431
19432 // Create the splat vector for 0x4b000000.
19433 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
19434 // Create the splat vector for 0x53000000.
19435 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
19436
19437 // Create the right shift.
19438 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
19439 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
19440
19441 SDValue Low, High;
19442 if (Subtarget.hasSSE41()) {
19443 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
19444 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19445 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
19446 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
19447 // Low will be bitcasted right away, so do not bother bitcasting back to its
19448 // original type.
19449 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
19450 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19451 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19452 // (uint4) 0x53000000, 0xaa);
19453 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
19454 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
19455 // High will be bitcasted right away, so do not bother bitcasting back to
19456 // its original type.
19457 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
19458 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19459 } else {
19460 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
19461 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19462 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
19463 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
19464
19465 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19466 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
19467 }
19468
19469 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
19470 SDValue VecCstFSub = DAG.getConstantFP(
19471 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
19472
19473 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19474 // NOTE: By using fsub of a positive constant instead of fadd of a negative
19475 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
19476 // enabled. See PR24512.
19477 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
19478 // TODO: Are there any fast-math-flags to propagate here?
19479 // (float4) lo;
19480 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
19481 // return (float4) lo + fhi;
19482 if (IsStrict) {
19483 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
19484 {Op.getOperand(0), HighBitcast, VecCstFSub});
19485 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
19486 {FHigh.getValue(1), LowBitcast, FHigh});
19487 }
19488
19489 SDValue FHigh =
19490 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
19491 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
19492}
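
A scalar model of the algorithm quoted in the comment above (one lane; the real code works on whole vectors and uses BLENDI on SSE4.1). Sketch only, not this file's code.

    #include <cstdint>
    #include <cstring>

    float U32ToFloat(uint32_t V) {
      uint32_t LoBits = (V & 0xffffu) | 0x4b000000u;  // 2^23 + low 16 bits
      uint32_t HiBits = (V >> 16)     | 0x53000000u;  // 2^39 + high 16 bits * 2^16
      float Lo, Hi;
      std::memcpy(&Lo, &LoBits, sizeof(Lo));
      std::memcpy(&Hi, &HiBits, sizeof(Hi));
      float FHi = Hi - (0x1.0p39f + 0x1.0p23f);       // the 0x53000080 constant (VecCstFSub)
      return Lo + FHi;                                // return (float4) lo + fhi;
    }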
19493
19494static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
19495 const X86Subtarget &Subtarget) {
19496 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19497 SDValue N0 = Op.getOperand(OpNo);
19498 MVT SrcVT = N0.getSimpleValueType();
19499 SDLoc dl(Op);
19500
19501 switch (SrcVT.SimpleTy) {
19502 default:
19503 llvm_unreachable("Custom UINT_TO_FP is not supported!")::llvm::llvm_unreachable_internal("Custom UINT_TO_FP is not supported!"
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 19503)
;
19504 case MVT::v2i32:
19505 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
19506 case MVT::v4i32:
19507 case MVT::v8i32:
19508 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
19509 case MVT::v2i64:
19510 case MVT::v4i64:
19511 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
19512 }
19513}
19514
19515SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
19516 SelectionDAG &DAG) const {
19517 bool IsStrict = Op->isStrictFPOpcode();
19518 unsigned OpNo = IsStrict ? 1 : 0;
19519 SDValue Src = Op.getOperand(OpNo);
19520 SDLoc dl(Op);
19521 auto PtrVT = getPointerTy(DAG.getDataLayout());
19522 MVT SrcVT = Src.getSimpleValueType();
19523 MVT DstVT = Op->getSimpleValueType(0);
19524 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
19525
19526 if (DstVT == MVT::f128)
19527 return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT));
19528
19529 if (DstVT.isVector())
19530 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
19531
19532 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
19533 return Extract;
19534
19535 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
19536 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
19537 // Conversions from unsigned i32 to f32/f64 are legal,
19538 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
19539 return Op;
19540 }
19541
19542 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
19543 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
19544 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
19545 if (IsStrict)
19546 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
19547 {Chain, Src});
19548 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
19549 }
19550
19551 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
19552 return V;
19553
19554 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
19555 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
19556 if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
19557 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
19558 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
19559 return SDValue();
19560
19561 // Make a 64-bit buffer, and use it to build an FILD.
19562 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
19563 if (SrcVT == MVT::i32) {
19564 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
19565 SDValue Store1 =
19566 DAG.getStore(Chain, dl, Src, StackSlot, MachinePointerInfo());
19567 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
19568 OffsetSlot, MachinePointerInfo());
19569 std::pair<SDValue, SDValue> Tmp =
19570 BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
19571 if (IsStrict)
19572 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19573
19574 return Tmp.first;
19575 }
19576
19577 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
19578 SDValue ValueToStore = Src;
19579 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
19580 // Bitcasting to f64 here allows us to do a single 64-bit store from
19581 // an SSE register, avoiding the store forwarding penalty that would come
19582 // with two 32-bit stores.
19583 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19584 }
19585 SDValue Store =
19586 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MachinePointerInfo());
19587 // For i64 source, we need to add the appropriate power of 2 if the input
19588 // was negative. This is the same as the optimization in
19589 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
19590 // we must be careful to do the computation in x87 extended precision, not
19591 // in SSE. (The generic code can't know it's OK to do this, or how to.)
19592 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
19593 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
19594 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
19595 MachineMemOperand::MOLoad, 8, 8);
19596
19597 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
19598 SDValue Ops[] = { Store, StackSlot };
19599 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
19600 MVT::i64, MMO);
19601 Chain = Fild.getValue(1);
19602
19603
19604 // Check whether the sign bit is set.
19605 SDValue SignSet = DAG.getSetCC(
19606 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
19607 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
19608
19609 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
19610 APInt FF(64, 0x5F80000000000000ULL);
19611 SDValue FudgePtr = DAG.getConstantPool(
19612 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
19613
19614 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
19615 SDValue Zero = DAG.getIntPtrConstant(0, dl);
19616 SDValue Four = DAG.getIntPtrConstant(4, dl);
19617 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
19618 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
19619
19620 // Load the value out, extending it from f32 to f80.
19621 SDValue Fudge = DAG.getExtLoad(
19622 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
19623 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
19624 /* Alignment = */ 4);
19625 Chain = Fudge.getValue(1);
19626 // Extend everything to 80 bits to force it to be done on x87.
19627 // TODO: Are there any fast-math-flags to propagate here?
19628 if (IsStrict) {
19629 SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
19630 {Chain, Fild, Fudge});
19631 // STRICT_FP_ROUND can't handle equal types.
19632 if (DstVT == MVT::f80)
19633 return Add;
19634 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
19635 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
19636 }
19637 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
19638 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
19639 DAG.getIntPtrConstant(0, dl));
19640}
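
The i64 path above converts through the x87 stack; a scalar sketch of the fudge-factor idea (long double stands in for the f80 temporaries; illustration only).

    #include <cstdint>

    long double U64ViaFILD(uint64_t X) {
      long double Fild = (long double)(int64_t)X;    // FILD of the stored 64-bit slot
      // 0x5F800000 is 2^64 as an f32; the select picks it only when the sign bit was set.
      long double Fudge = (int64_t)X < 0 ? 0x1.0p64L : 0.0L;
      return Fild + Fudge;                           // FADD, then FP_ROUND to DstVT
    }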
19641
19642// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
19643// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
19644// just return an SDValue().
19645// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
19646// to i16, i32 or i64, and we lower it to a legal sequence and return the
19647// result.
19648SDValue
19649X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
19650 bool IsSigned, SDValue &Chain) const {
19651 bool IsStrict = Op->isStrictFPOpcode();
19652 SDLoc DL(Op);
19653
19654 EVT DstTy = Op.getValueType();
19655 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
19656 EVT TheVT = Value.getValueType();
19657 auto PtrVT = getPointerTy(DAG.getDataLayout());
19658
19659 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
19660 // f16 must be promoted before using the lowering in this routine.
19661 // fp128 does not use this lowering.
19662 return SDValue();
19663 }
19664
19665 // If using FIST to compute an unsigned i64, we'll need some fixup
19666 // to handle values above the maximum signed i64. A FIST is always
19667 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
19668 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
19669
19670 // FIXME: This does not generate an invalid exception if the input does not
19671 // fit in i32. PR44019
19672 if (!IsSigned && DstTy != MVT::i64) {
19673 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
19674 // The low 32 bits of the fist result will have the correct uint32 result.
19675 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
19676 DstTy = MVT::i64;
19677 }
19678
19679 assert(DstTy.getSimpleVT() <= MVT::i64 &&
19680 DstTy.getSimpleVT() >= MVT::i16 &&
19681 "Unknown FP_TO_INT to lower!");
19682
19683 // We lower FP->int64 into FISTP64 followed by a load from a temporary
19684 // stack slot.
19685 MachineFunction &MF = DAG.getMachineFunction();
19686 unsigned MemSize = DstTy.getStoreSize();
19687 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
19688 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19689
19690 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
19691
19692 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
19693
19694 if (UnsignedFixup) {
19695 //
19696 // Conversion to unsigned i64 is implemented with a select,
19697 // depending on whether the source value fits in the range
19698 // of a signed i64. Let Thresh be the FP equivalent of
19699 // 0x8000000000000000ULL.
19700 //
19701 // Adjust = (Value < Thresh) ? 0 : 0x80000000;
19702 // FltOfs = (Value < Thresh) ? 0 : Thresh;
19703 // FistSrc = (Value - FltOfs);
19704 // Fist-to-mem64 FistSrc
19705 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
19706 // to XOR'ing the high 32 bits with Adjust.
19707 //
19708 // Being a power of 2, Thresh is exactly representable in all FP formats.
19709 // For X87 we'd like to use the smallest FP type for this constant, but
19710 // for DAG type consistency we have to match the FP operand type.
19711
19712 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
19713 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
19714 bool LosesInfo = false;
19715 if (TheVT == MVT::f64)
19716 // The rounding mode is irrelevant as the conversion should be exact.
19717 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
19718 &LosesInfo);
19719 else if (TheVT == MVT::f80)
19720 Status = Thresh.convert(APFloat::x87DoubleExtended(),
19721 APFloat::rmNearestTiesToEven, &LosesInfo);
19722
19723 assert(Status == APFloat::opOK && !LosesInfo &&
19724        "FP conversion should have been exact");
19725
19726 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
19727
19728 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
19729 *DAG.getContext(), TheVT);
19730 SDValue Cmp;
19731 if (IsStrict) {
19732 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT,
19733 Chain, /*IsSignaling*/ true);
19734 Chain = Cmp.getValue(1);
19735 } else {
19736 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT);
19737 }
19738
19739 Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
19740 DAG.getConstant(0, DL, MVT::i64),
19741 DAG.getConstant(APInt::getSignMask(64),
19742 DL, MVT::i64));
19743 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp,
19744 DAG.getConstantFP(0.0, DL, TheVT),
19745 ThreshVal);
19746
19747 if (IsStrict) {
19748 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
19749 { Chain, Value, FltOfs });
19750 Chain = Value.getValue(1);
19751 } else
19752 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
19753 }
19754
19755 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
19756
19757 // FIXME This causes a redundant load/store if the SSE-class value is already
19758 // in memory, such as if it is on the callstack.
19759 if (isScalarFPTypeInSSEReg(TheVT)) {
19760 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
19761 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
19762 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
19763 SDValue Ops[] = { Chain, StackSlot };
19764
19765 unsigned FLDSize = TheVT.getStoreSize();
19766 assert(FLDSize <= MemSize && "Stack slot not big enough");
19767 MachineMemOperand *MMO = MF.getMachineMemOperand(
19768 MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize);
19769 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
19770 Chain = Value.getValue(1);
19771 }
19772
19773 // Build the FP_TO_INT*_IN_MEM
19774 MachineMemOperand *MMO = MF.getMachineMemOperand(
19775 MPI, MachineMemOperand::MOStore, MemSize, MemSize);
19776 SDValue Ops[] = { Chain, Value, StackSlot };
19777 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
19778 DAG.getVTList(MVT::Other),
19779 Ops, DstTy, MMO);
19780
19781 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
19782 Chain = Res.getValue(1);
19783
19784 // If we need an unsigned fixup, XOR the result with adjust.
19785 if (UnsignedFixup)
19786 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
19787
19788 return Res;
19789}
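// [Editorial sketch, not part of X86ISelLowering.cpp] A scalar model of the
// UnsignedFixup path above, using a hypothetical helper name: values of 2^63
// or more do not fit a signed FIST, so Thresh (2^63) is subtracted first and
// the sign bit is restored afterwards by XOR'ing the integer result, which is
// what the Adjust/FltOfs selects arrange.
#include <cstdint>

static uint64_t fpToUint64Sketch(long double Value) {
  const long double Thresh = 9223372036854775808.0L; // 2^63, exact in FP
  uint64_t Adjust = (Value < Thresh) ? 0 : 0x8000000000000000ULL;
  long double FltOfs = (Value < Thresh) ? 0.0L : Thresh;
  // Stand-in for the signed FIST of the offset source.
  int64_t Fist = static_cast<int64_t>(Value - FltOfs);
  return static_cast<uint64_t>(Fist) ^ Adjust;
}
// E.g. Value = 2^63 + 5.0: FistSrc = 5.0, Fist = 5, and 5 ^ 2^63 = 2^63 + 5.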
19790
19791static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
19792 const X86Subtarget &Subtarget) {
19793 MVT VT = Op.getSimpleValueType();
19794 SDValue In = Op.getOperand(0);
19795 MVT InVT = In.getSimpleValueType();
19796 SDLoc dl(Op);
19797 unsigned Opc = Op.getOpcode();
19798
19799 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
19800 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
19801        "Unexpected extension opcode");
19802 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
19803        "Expected same number of elements");
19804 assert((VT.getVectorElementType() == MVT::i16 ||
19805         VT.getVectorElementType() == MVT::i32 ||
19806         VT.getVectorElementType() == MVT::i64) &&
19807        "Unexpected element type");
19808 assert((InVT.getVectorElementType() == MVT::i8 ||
19809         InVT.getVectorElementType() == MVT::i16 ||
19810         InVT.getVectorElementType() == MVT::i32) &&
19811        "Unexpected element type");
19812
19813 unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
19814
19815 // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
19816 if (InVT == MVT::v8i8) {
19817 if (VT != MVT::v8i64)
19818 return SDValue();
19819
19820 In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
19821 MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
19822 return DAG.getNode(ExtendInVecOpc, dl, VT, In);
19823 }
19824
19825 if (Subtarget.hasInt256())
19826 return Op;
19827
19828 // Optimize vectors in AVX mode:
19829 //
19830 // v8i16 -> v8i32
19831 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
19832 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
19833 // Concat upper and lower parts.
19834 //
19835 // v4i32 -> v4i64
19836 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
19837 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
19838 // Concat upper and lower parts.
19839 //
19840 MVT HalfVT = VT.getHalfNumVectorElementsVT();
19841 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
19842
19843 // Short-circuit if we can determine that each 128-bit half is the same value.
19844 // Otherwise, this is difficult to match and optimize.
19845 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
19846 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
19847 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
19848
19849 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
19850 SDValue Undef = DAG.getUNDEF(InVT);
19851 bool NeedZero = Opc == ISD::ZERO_EXTEND;
19852 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
19853 OpHi = DAG.getBitcast(HalfVT, OpHi);
19854
19855 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
19856}
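// [Editorial sketch, not part of X86ISelLowering.cpp] A scalar model of the
// AVX1 strategy described above for v8i16 -> v8i32 zero extension: the upper
// four words are interleaved with a zero vector (vpunpckhwd), which places a
// zero word above each source word, so reinterpreting the interleaved lanes
// as little-endian i32 is itself a zero extension of In[4..7]. The lower
// half, handled by vpmovzxwd, is a plain per-lane widen and is omitted here.
#include <array>
#include <cstdint>

static std::array<uint32_t, 4>
unpackHighWithZeroSketch(const std::array<uint16_t, 8> &In) {
  // vpunpckhwd In, Zero -> {In[4], 0, In[5], 0, In[6], 0, In[7], 0}.
  std::array<uint16_t, 8> Interleaved;
  for (int I = 0; I < 4; ++I) {
    Interleaved[2 * I] = In[4 + I];
    Interleaved[2 * I + 1] = 0;
  }
  // Bitcast to i32 lanes: low 16 bits = In[4 + I], high 16 bits = 0.
  std::array<uint32_t, 4> Out;
  for (int I = 0; I < 4; ++I)
    Out[I] = static_cast<uint32_t>(Interleaved[2 * I]) |
             (static_cast<uint32_t>(Interleaved[2 * I + 1]) << 16);
  return Out;
}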
19857
19858// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
19859static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
19860 const SDLoc &dl, SelectionDAG &DAG) {
19861 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
19862 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
19863 DAG.getIntPtrConstant(0, dl));
19864 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
19865 DAG.getIntPtrConstant(8, dl));
19866 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
19867 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
19868 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
19869 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19870}
19871
19872static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
19873 const X86Subtarget &Subtarget,
19874 SelectionDAG &DAG) {
19875 MVT VT = Op->getSimpleValueType(0);
19876 SDValue In = Op->getOperand(0);
19877 MVT InVT = In.getSimpleValueType();
19878 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
19879 SDLoc DL(Op);
19880 unsigned NumElts = VT.getVectorNumElements();
19881
19882 // For all vectors except vXi8 we can just emit a sign_extend and a shift.
19883 // This avoids a constant pool load.
19884 if (VT.getVectorElementType() != MVT::i8) {
19885 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
19886 return DAG.getNode(ISD::SRL, DL, VT, Extend,
19887 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
19888 }
19889
19890 // Extend VT if BWI is not supported.
19891 MVT ExtVT = VT;
19892 if (!Subtarget.hasBWI()) {
19893 // If v16i32 is to be avoided, we'll need to split and concatenate.
19894 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
19895 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
19896
19897 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
19898 }
19899
19900 // Widen to 512-bits if VLX is not supported.
19901 MVT WideVT = ExtVT;
19902 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
19903 NumElts *= 512 / ExtVT.getSizeInBits();
19904 InVT = MVT::getVectorVT(MVT::i1, NumElts);
19905 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
19906 In, DAG.getIntPtrConstant(0, DL));
19907 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
19908 NumElts);
19909 }
19910
19911 SDValue One = DAG.getConstant(1, DL, WideVT);
19912 SDValue Zero = DAG.getConstant(0, DL, WideVT);
19913
19914 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
19915
19916 // Truncate if we had to extend above.
19917 if (VT != ExtVT) {
19918 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
19919 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
19920 }
19921
19922 // Extract back to 128/256-bit if we widened.
19923 if (WideVT != VT)
19924 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
19925 DAG.getIntPtrConstant(0, DL));
19926
19927 return SelectedVal;
19928}
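// [Editorial sketch, not part of X86ISelLowering.cpp] Why the non-vXi8 path
// above can zero-extend an i1 mask with a sign_extend followed by a logical
// shift right, shown per scalar lane: sign-extending a 1-bit value gives 0 or
// all-ones, and shifting that right by (width - 1) leaves 0 or 1.
#include <cstdint>

static uint32_t zextI1ViaSextSrlSketch(bool MaskBit) {
  // SIGN_EXTEND i1 -> i32: 0 stays 0, 1 becomes 0xFFFFFFFF.
  uint32_t Sext = MaskBit ? 0xFFFFFFFFu : 0u;
  // SRL by 31 keeps only the sign bit, i.e. the zero-extended mask bit.
  return Sext >> 31;
}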
19929
19930static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
19931 SelectionDAG &DAG) {
19932 SDValue In = Op.getOperand(0);
19933 MVT SVT = In.getSimpleValueType();
19934
19935 if (SVT.getVectorElementType() == MVT::i1)
19936 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
19937
19938 assert(Subtarget.hasAVX() && "Expected AVX support");
19939 return LowerAVXExtend(Op, DAG, Subtarget);
19940}
19941
19942/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
19943/// It makes use of the fact that vectors with enough leading sign/zero bits
19944/// prevent the PACKSS/PACKUS from saturating the results.
19945/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
19946/// within each 128-bit lane.
19947static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
19948 const SDLoc &DL, SelectionDAG &DAG,
19949 const X86Subtarget &Subtarget) {
19950 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
19951        "Unexpected PACK opcode");
19952 assert(DstVT.isVector() && "VT not a vector?");
19953
19954 // Requires SSE2 but AVX512 has fast vector truncate.
19955 if (!Subtarget.hasSSE2())
19956 return SDValue();
19957
19958 EVT SrcVT = In.getValueType();
19959
19960 // No truncation required, we might get here due to recursive calls.
19961 if (SrcVT == DstVT)
19962 return In;
19963
19964 // We only support vector truncation to 64bits or greater from a
19965 // 128bits or greater source.
19966 unsigned DstSizeInBits = DstVT.getSizeInBits();
19967 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
19968 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
19969 return SDValue();
19970
19971 unsigned NumElems = SrcVT.getVectorNumElements();
19972 if (!isPowerOf2_32(NumElems))
19973 return SDValue();
19974
19975 LLVMContext &Ctx = *DAG.getContext();
19976 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
19977 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
19978
19979 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
19980
19981 // Pack to the largest type possible:
19982 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
19983 EVT InVT = MVT::i16, OutVT = MVT::i8;
19984 if (SrcVT.getScalarSizeInBits() > 16 &&
19985 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
19986 InVT = MVT::i32;
19987 OutVT = MVT::i16;
19988 }
19989
19990 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
19991 if (SrcVT.is128BitVector()) {
19992 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
19993 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
19994 In = DAG.getBitcast(InVT, In);
19995 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
19996 Res = extractSubVector(Res, 0, DAG, DL, 64);
19997 return DAG.getBitcast(DstVT, Res);
19998 }
19999
20000 // Extract lower/upper subvectors.
20001 unsigned NumSubElts = NumElems / 2;
20002 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
20003 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
20004
20005 unsigned SubSizeInBits = SrcSizeInBits / 2;
20006 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
20007 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
20008
20009 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20010 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
20011 Lo = DAG.getBitcast(InVT, Lo);
20012 Hi = DAG.getBitcast(InVT, Hi);
20013 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20014 return DAG.getBitcast(DstVT, Res);
20015 }
20016
20017 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20018 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20019 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
20020 Lo = DAG.getBitcast(InVT, Lo);
20021 Hi = DAG.getBitcast(InVT, Hi);
20022 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20023
20024 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20025 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
20026 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
20027 SmallVector<int, 64> Mask;
20028 int Scale = 64 / OutVT.getScalarSizeInBits();
20029 scaleShuffleMask<int>(Scale, ArrayRef<int>({ 0, 2, 1, 3 }), Mask);
20030 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
20031
20032 if (DstVT.is256BitVector())
20033 return DAG.getBitcast(DstVT, Res);
20034
20035 // If 512bit -> 128bit truncate another stage.
20036 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20037 Res = DAG.getBitcast(PackedVT, Res);
20038 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20039 }
20040
20041 // Recursively pack lower/upper subvectors, concat result and pack again.
20042 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20043 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
20044 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
20045 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
20046
20047 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20048 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
20049 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20050}
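// [Editorial sketch, not part of X86ISelLowering.cpp] A per-lane model of why
// PACKUS can stand in for a plain truncate when the callers have proven enough
// leading zero bits: the pack saturates, but a lane whose upper half is already
// zero is within the unsigned 8-bit range, so the clamps never fire and the
// result equals ordinary truncation. PACKSS works the same way for sign bits.
#include <cstdint>

// PACKUSWB viewed on one lane: signed 16-bit input, unsigned-saturated to 8 bits.
static uint8_t packusLaneSketch(int16_t V) {
  if (V < 0)
    return 0;
  if (V > 255)
    return 255;
  return static_cast<uint8_t>(V);
}
// For 0 <= V <= 255 (upper byte known zero), packusLaneSketch(V) == (uint8_t)V,
// which is exactly the KnownBits precondition checked before calling
// truncateVectorWithPACK with X86ISD::PACKUS.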
20051
20052static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
20053 const X86Subtarget &Subtarget) {
20054
20055 SDLoc DL(Op);
20056 MVT VT = Op.getSimpleValueType();
20057 SDValue In = Op.getOperand(0);
20058 MVT InVT = In.getSimpleValueType();
20059
20060 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
20061
20062 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
20063 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
20064 if (InVT.getScalarSizeInBits() <= 16) {
20065 if (Subtarget.hasBWI()) {
20066 // legal, will go to VPMOVB2M, VPMOVW2M
20067 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20068 // We need to shift to get the lsb into sign position.
20069 // Shift packed bytes not supported natively, bitcast to word
20070 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
20071 In = DAG.getNode(ISD::SHL, DL, ExtVT,
20072 DAG.getBitcast(ExtVT, In),
20073 DAG.getConstant(ShiftInx, DL, ExtVT));
20074 In = DAG.getBitcast(InVT, In);
20075 }
20076 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
20077 In, ISD::SETGT);
20078 }
20079 // Use TESTD/Q, extending the vector to packed dword/qword.
20080 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
20081        "Unexpected vector type.");
20082 unsigned NumElts = InVT.getVectorNumElements();
20083 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
20084 // We need to change to a wider element type that we have support for.
20085 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
20086 // For 16 element vectors we extend to v16i32 unless we are explicitly
20087 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
20088 // we need to split into two 8 element vectors which we can extend to v8i32,
20089 // truncate and concat the results. There's an additional complication if
20090 // the original type is v16i8. In that case we can't split the v16i8 so
20091 // first we pre-extend it to v16i16 which we can split to v8i16, then extend
20092 // to v8i32, truncate that to v8i1 and concat the two halves.
20093 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
20094 if (InVT == MVT::v16i8) {
20095 // First we need to sign extend up to 256-bits so we can split that.
20096 InVT = MVT::v16i16;
20097 In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
20098 }
20099 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
20100 SDValue Hi = extract128BitVector(In, 8, DAG, DL);
20101 // We're split now, just emit two truncates and a concat. The two
20102 // truncates will trigger legalization to come back to this function.
20103 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
20104 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
20105 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20106 }
20107 // We either have 8 elements or we're allowed to use 512-bit vectors.
20108 // If we have VLX, we want to use the narrowest vector that can get the
20109 // job done so we use vXi32.
20110 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
20111 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
20112 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
20113 InVT = ExtVT;
20114 ShiftInx = InVT.getScalarSizeInBits() - 1;
20115 }
20116
20117 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20118 // We need to shift to get the lsb into sign position.
20119 In = DAG.getNode(ISD::SHL, DL, InVT, In,
20120 DAG.getConstant(ShiftInx, DL, InVT));
20121 }
20122 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
20123 if (Subtarget.hasDQI())
20124 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
20125 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
20126}
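// [Editorial sketch, not part of X86ISelLowering.cpp] The truncate-to-i1
// lowering above keeps only the low bit of each lane by shifting it into the
// sign position and then doing a signed compare; per scalar lane:
#include <cstdint>

static bool truncLaneToI1Sketch(uint32_t Lane) {
  // SHL by (bits - 1): the lsb now occupies the sign bit. Done on the
  // unsigned bit pattern here to sidestep C++ signed-shift pitfalls.
  int32_t Shifted = static_cast<int32_t>(Lane << 31);
  // SETGT(0, Shifted) is true exactly when the sign bit is set,
  // i.e. when the original lsb was 1.
  return 0 > Shifted;
}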
20127
20128SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
20129 SDLoc DL(Op);
20130 MVT VT = Op.getSimpleValueType();
20131 SDValue In = Op.getOperand(0);
20132 MVT InVT = In.getSimpleValueType();
20133 unsigned InNumEltBits = InVT.getScalarSizeInBits();
20134
20135 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20136        "Invalid TRUNCATE operation");
20137
20138 // If we're called by the type legalizer, handle a few cases.
20139 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20140 if (!TLI.isTypeLegal(InVT)) {
20141 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
20142 VT.is128BitVector()) {
20143 assert(Subtarget.hasVLX() && "Unexpected subtarget!");
20144 // The default behavior is to truncate one step, concatenate, and then
20145 // truncate the remainder. We'd rather produce two 64-bit results and
20146 // concatenate those.
20147 SDValue Lo, Hi;
20148 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
20149
20150 EVT LoVT, HiVT;
20151 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
20152
20153 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
20154 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
20155 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20156 }
20157
20158 // Otherwise let default legalization handle it.
20159 return SDValue();
20160 }
20161
20162 if (VT.getVectorElementType() == MVT::i1)
20163 return LowerTruncateVecI1(Op, DAG, Subtarget);
20164
20165 // vpmovqb/w/d, vpmovdb/w, vpmovwb
20166 if (Subtarget.hasAVX512()) {
20167 // word to byte only under BWI. Otherwise we have to promote to v16i32
20168 // and then truncate that. But we should only do that if we haven't been
20169 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
20170 // handled by isel patterns.
20171 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
20172 Subtarget.canExtendTo512DQ())
20173 return Op;
20174 }
20175
20176 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
20177 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
20178
20179 // Truncate with PACKUS if we are truncating a vector with leading zero bits
20180 // that extend all the way to the packed/truncated value.
20181 // Pre-SSE41 we can only use PACKUSWB.
20182 KnownBits Known = DAG.computeKnownBits(In);
20183 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
20184 if (SDValue V =
20185 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
20186 return V;
20187
20188 // Truncate with PACKSS if we are truncating a vector with sign-bits that
20189 // extend all the way to the packed/truncated value.
20190 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
20191 if (SDValue V =
20192 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
20193 return V;
20194
20195 // Handle truncation of V256 to V128 using shuffles.
20196 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
20197
20198 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
20199 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
20200 if (Subtarget.hasInt256()) {
20201 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
20202 In = DAG.getBitcast(MVT::v8i32, In);
20203 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
20204 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
20205 DAG.getIntPtrConstant(0, DL));
20206 }
20207
20208 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20209 DAG.getIntPtrConstant(0, DL));
20210 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20211 DAG.getIntPtrConstant(2, DL));
20212 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
20213 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
20214 static const int ShufMask[] = {0, 2, 4, 6};
20215 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
20216 }
20217
20218 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
20219 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
20220 if (Subtarget.hasInt256()) {
20221 In = DAG.getBitcast(MVT::v32i8, In);
20222
20223 // The PSHUFB mask:
20224 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
20225 -1, -1, -1, -1, -1, -1, -1, -1,
20226 16, 17, 20, 21, 24, 25, 28, 29,
20227 -1, -1, -1, -1, -1, -1, -1, -1 };
20228 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
20229 In = DAG.getBitcast(MVT::v4i64, In);
20230
20231 static const int ShufMask2[] = {0, 2, -1, -1};
20232 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
20233 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20234 DAG.getIntPtrConstant(0, DL));
20235 return DAG.getBitcast(VT, In);
20236 }
20237
20238 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
20239 DAG.getIntPtrConstant(0, DL));
20240
20241 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
20242 DAG.getIntPtrConstant(4, DL));
20243
20244 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
20245 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
20246
20247 // The PSHUFB mask:
20248 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
20249 -1, -1, -1, -1, -1, -1, -1, -1};
20250
20251 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
20252 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
20253
20254 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
20255 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
20256
20257 // The MOVLHPS Mask:
20258 static const int ShufMask2[] = {0, 1, 4, 5};
20259 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
20260 return DAG.getBitcast(MVT::v8i16, res);
20261 }
20262
20263 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
20264 // Use an AND to zero upper bits for PACKUS.
20265 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
20266
20267 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
20268 DAG.getIntPtrConstant(0, DL));
20269 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
20270 DAG.getIntPtrConstant(8, DL));
20271 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
20272 }
20273
20274 llvm_unreachable("All 256->128 cases should have been handled above!");
20275}
20276
20277SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
20278 bool IsStrict = Op->isStrictFPOpcode();
20279 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
20280 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
20281 MVT VT = Op->getSimpleValueType(0);
20282 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20283 MVT SrcVT = Src.getSimpleValueType();
20284 SDLoc dl(Op);
20285
20286 if (VT.isVector()) {
20287 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
20288 MVT ResVT = MVT::v4i32;
20289 MVT TruncVT = MVT::v4i1;
20290 unsigned Opc;
20291 if (IsStrict)
20292 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
20293 else
20294 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
20295
20296 if (!IsSigned && !Subtarget.hasVLX()) {
20297 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
20298 // Widen to 512-bits.
20299 ResVT = MVT::v8i32;
20300 TruncVT = MVT::v8i1;
20301 Opc = Op.getOpcode();
20302 // Need to concat with zero vector for strict fp to avoid spurious
20303 // exceptions.
20304 // TODO: Should we just do this for non-strict as well?
20305 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
20306 : DAG.getUNDEF(MVT::v8f64);
20307 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
20308 DAG.getIntPtrConstant(0, dl));
20309 }
20310 SDValue Res, Chain;
20311 if (IsStrict) {
20312 Res =
20313 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
20314 Chain = Res.getValue(1);
20315 } else {
20316 Res = DAG.getNode(Opc, dl, ResVT, Src);
20317 }
20318
20319 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
20320 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
20321 DAG.getIntPtrConstant(0, dl));
20322 if (IsStrict)
20323 return DAG.getMergeValues({Res, Chain}, dl);
20324 return Res;
20325 }
20326
20327 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
20328 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
20329 assert(!IsSigned && "Expected unsigned conversion!");
20330 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
20331 return Op;
20332 }
20333
20334 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
20335 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
20336 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) {
20337 assert(!IsSigned && "Expected unsigned conversion!");
20338 assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
20339        "Unexpected features!");
20340 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20341 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20342 // Need to concat with zero vector for strict fp to avoid spurious
20343 // exceptions.
20344 // TODO: Should we just do this for non-strict as well?
20345 SDValue Tmp =
20346 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
20347 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
20348 DAG.getIntPtrConstant(0, dl));
20349
20350 SDValue Res, Chain;
20351 if (IsStrict) {
20352 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
20353 {Op->getOperand(0), Src});
20354 Chain = Res.getValue(1);
20355 } else {
20356 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
20357 }
20358
20359 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20360 DAG.getIntPtrConstant(0, dl));
20361
20362 if (IsStrict)
20363 return DAG.getMergeValues({Res, Chain}, dl);
20364 return Res;
20365 }
20366
20367 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
20368 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
20369 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) {
20370 assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
20371        !Subtarget.hasVLX() && "Unexpected features!");
20372 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20373 // Need to concat with zero vector for strict fp to avoid spurious
20374 // exceptions.
20375 // TODO: Should we just do this for non-strict as well?
20376 SDValue Tmp =
20377 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
20378 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
20379 DAG.getIntPtrConstant(0, dl));
20380
20381 SDValue Res, Chain;
20382 if (IsStrict) {
20383 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
20384 {Op->getOperand(0), Src});
20385 Chain = Res.getValue(1);
20386 } else {
20387 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
20388 }
20389
20390 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20391 DAG.getIntPtrConstant(0, dl));
20392
20393 if (IsStrict)
20394 return DAG.getMergeValues({Res, Chain}, dl);
20395 return Res;
20396 }
20397
20398 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
20399 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
20400 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
20401 DAG.getUNDEF(MVT::v2f32));
20402 if (IsStrict) {
20403 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
20404 : X86ISD::STRICT_CVTTP2UI;
20405 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
20406 }
20407 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
20408 return DAG.getNode(Opc, dl, VT, Tmp);
20409 }
20410
20411 return SDValue();
20412 }
20413
20414 assert(!VT.isVector());
20415
20416 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
20417
20418 if (!IsSigned && UseSSEReg) {
20419 // Conversions from f32/f64 with AVX512 should be legal.
20420 if (Subtarget.hasAVX512())
20421 return Op;
20422
20423 // Use default expansion for i64.
20424 if (VT == MVT::i64)
20425 return SDValue();
20426
20427 assert(VT == MVT::i32 && "Unexpected VT!");
20428
20429 // Promote i32 to i64 and use a signed operation on 64-bit targets.
20430 // FIXME: This does not generate an invalid exception if the input does not
20431 // fit in i32. PR44019
20432 if (Subtarget.is64Bit()) {
20433 SDValue Res, Chain;
20434 if (IsStrict) {
20435 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
20436 { Op.getOperand(0), Src });
20437 Chain = Res.getValue(1);
20438 } else
20439 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
20440
20441 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20442 if (IsStrict)
20443 return DAG.getMergeValues({ Res, Chain }, dl);
20444 return Res;
20445 }
20446
20447 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
20448 // use fisttp which will be handled later.
20449 if (!Subtarget.hasSSE3())
20450 return SDValue();
20451 }
20452
20453 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
20454 // FIXME: This does not generate an invalid exception if the input does not
20455 // fit in i16. PR44019
20456 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
20457 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
20458 SDValue Res, Chain;
20459 if (IsStrict) {
20460 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
20461 { Op.getOperand(0), Src });
20462 Chain = Res.getValue(1);
20463 } else
20464 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
20465
20466 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20467 if (IsStrict)
20468 return DAG.getMergeValues({ Res, Chain }, dl);
20469 return Res;
20470 }
20471
20472 // If this is a FP_TO_SINT using SSEReg we're done.
20473 if (UseSSEReg && IsSigned)
20474 return Op;
20475
20476 // fp128 needs to use a libcall.
20477 if (SrcVT == MVT::f128) {
20478 RTLIB::Libcall LC;
20479 if (IsSigned)
20480 LC = RTLIB::getFPTOSINT(SrcVT, VT);
20481 else
20482 LC = RTLIB::getFPTOUINT(SrcVT, VT);
20483
20484 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20485 MakeLibCallOptions CallOptions;
20486 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
20487 SDLoc(Op), Chain);
20488
20489 if (IsStrict)
20490 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
20491
20492 return Tmp.first;
20493 }
20494
20495 // Fall back to X87.
20496 SDValue Chain;
20497 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
20498 if (IsStrict)
20499 return DAG.getMergeValues({V, Chain}, dl);
20500 return V;
20501 }
20502
20503 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
20504}
20505
20506SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
20507 SelectionDAG &DAG) const {
20508 SDValue Src = Op.getOperand(0);
20509 MVT SrcVT = Src.getSimpleValueType();
20510
20511 // If the source is in an SSE register, the node is Legal.
20512 if (isScalarFPTypeInSSEReg(SrcVT))
20513 return Op;
20514
20515 return LRINT_LLRINTHelper(Op.getNode(), DAG);
20516}
20517
20518SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
20519 SelectionDAG &DAG) const {
20520 EVT DstVT = N->getValueType(0);
20521 SDValue Src = N->getOperand(0);
20522 EVT SrcVT = Src.getValueType();
20523
20524 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
20525 // f16 must be promoted before using the lowering in this routine.
20526 // fp128 does not use this lowering.
20527 return SDValue();
20528 }
20529
20530 SDLoc DL(N);
20531 SDValue Chain = DAG.getEntryNode();
20532
20533 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
20534
20535 // If we're converting from SSE, the stack slot needs to hold both types.
20536 // Otherwise it only needs to hold the DstVT.
20537 EVT OtherVT = UseSSE ? SrcVT : DstVT;
20538 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
20539 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
20540 MachinePointerInfo MPI =
20541 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
20542
20543 if (UseSSE) {
20544 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
20545 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
20546 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20547 SDValue Ops[] = { Chain, StackPtr };
20548
20549 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
20550 /*Align*/0, MachineMemOperand::MOLoad);
20551 Chain = Src.getValue(1);
20552 }
20553
20554 SDValue StoreOps[] = { Chain, Src, StackPtr };
20555 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL,
20556 DAG.getVTList(MVT::Other), StoreOps,
20557 DstVT, MPI, /*Align*/0,
20558 MachineMemOperand::MOStore);
20559
20560 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
20561}
20562
20563SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
20564 bool IsStrict = Op->isStrictFPOpcode();
20565
20566 SDLoc DL(Op);
20567 MVT VT = Op.getSimpleValueType();
20568 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
20569 MVT SVT = In.getSimpleValueType();
20570
20571 if (VT == MVT::f128) {
20572 RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
20573 return LowerF128Call(Op, DAG, LC);
20574 }
20575
20576 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
20577
20578 SDValue Res =
20579 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
20580 if (IsStrict)
20581 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
20582 {Op->getOperand(0), Res});
20583 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
20584}
20585
20586SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
20587 bool IsStrict = Op->isStrictFPOpcode();
20588
20589 MVT VT = Op.getSimpleValueType();
20590 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
20591 MVT SVT = In.getSimpleValueType();
20592
20593 // It's legal except when f128 is involved
20594 if (SVT != MVT::f128)
20595 return Op;
20596
20597 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, VT);
20598
20599 // FP_ROUND node has a second operand indicating whether it is known to be
20600 // precise. That doesn't take part in the LibCall so we can't directly use
20601 // LowerF128Call.
20602
20603 SDLoc dl(Op);
20604 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20605 MakeLibCallOptions CallOptions;
20606 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, In, CallOptions,
20607 dl, Chain);
20608
20609 if (IsStrict)
20610 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
20611
20612 return Tmp.first;
20613}
20614
20615static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
20616 bool IsStrict = Op->isStrictFPOpcode();
20617 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20618 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
20619        "Unexpected VT!");
20620
20621 SDLoc dl(Op);
20622 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
20623 DAG.getConstant(0, dl, MVT::v8i16), Src,
20624 DAG.getIntPtrConstant(0, dl));
20625
20626 SDValue Chain;
20627 if (IsStrict) {
20628 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
20629 {Op.getOperand(0), Res});
20630 Chain = Res.getValue(1);
20631 } else {
20632 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
20633 }
20634
20635 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
20636 DAG.getIntPtrConstant(0, dl));
20637
20638 if (IsStrict)
20639 return DAG.getMergeValues({Res, Chain}, dl);
20640
20641 return Res;
20642}
20643
20644static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
20645 bool IsStrict = Op->isStrictFPOpcode();
20646 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20647  assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
20648         "Unexpected VT!");
20649
20650 SDLoc dl(Op);
20651 SDValue Res, Chain;
20652 if (IsStrict) {
20653 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
20654 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
20655 DAG.getIntPtrConstant(0, dl));
20656 Res = DAG.getNode(
20657 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
20658 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
20659 Chain = Res.getValue(1);
20660 } else {
20661 // FIXME: Should we use zeros for upper elements for non-strict?
20662 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
20663 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
20664 DAG.getTargetConstant(4, dl, MVT::i32));
20665 }
20666
20667 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
20668 DAG.getIntPtrConstant(0, dl));
20669
20670 if (IsStrict)
20671 return DAG.getMergeValues({Res, Chain}, dl);
20672
20673 return Res;
20674}
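The two helpers above lower scalar f16/f32 conversions by widening the scalar into a vector, running CVTPH2PS/CVTPS2PH, and extracting lane 0. As a hedged illustration only (not part of this file), the same round trip can be written with F16C intrinsics; the helper names are invented and compiling with F16C support (e.g. -mf16c) is assumed:

#include <immintrin.h>
#include <cstdint>

// Illustrative only. Imm 4 = use the MXCSR rounding mode, matching the
// constant passed to CVTPS2PH in the lowering above.
static uint16_t FloatToHalf(float F) {
  __m128i H = _mm_cvtps_ph(_mm_set_ss(F), 4);
  return static_cast<uint16_t>(_mm_extract_epi16(H, 0));
}

static float HalfToFloat(uint16_t H) {
  // Insert into lane 0 of a zero vector, CVTPH2PS, extract lane 0.
  __m128i V = _mm_insert_epi16(_mm_setzero_si128(), H, 0);
  return _mm_cvtss_f32(_mm_cvtph_ps(V));
}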
20675
20676/// Depending on uarch and/or optimizing for size, we might prefer to use a
20677/// vector operation in place of the typical scalar operation.
20678static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
20679 const X86Subtarget &Subtarget) {
20680 // If both operands have other uses, this is probably not profitable.
20681 SDValue LHS = Op.getOperand(0);
20682 SDValue RHS = Op.getOperand(1);
20683 if (!LHS.hasOneUse() && !RHS.hasOneUse())
20684 return Op;
20685
20686 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
20687 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
20688 if (IsFP && !Subtarget.hasSSE3())
20689 return Op;
20690 if (!IsFP && !Subtarget.hasSSSE3())
20691 return Op;
20692
20693 // Extract from a common vector.
20694 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20695 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20696 LHS.getOperand(0) != RHS.getOperand(0) ||
20697 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
20698 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
20699 !shouldUseHorizontalOp(true, DAG, Subtarget))
20700 return Op;
20701
20702 // Allow commuted 'hadd' ops.
20703 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
20704 unsigned HOpcode;
20705 switch (Op.getOpcode()) {
20706 case ISD::ADD: HOpcode = X86ISD::HADD; break;
20707 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
20708 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
20709 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
20710 default:
20711    llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
20712 }
20713 unsigned LExtIndex = LHS.getConstantOperandVal(1);
20714 unsigned RExtIndex = RHS.getConstantOperandVal(1);
20715 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
20716 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
20717 std::swap(LExtIndex, RExtIndex);
20718
20719 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
20720 return Op;
20721
20722 SDValue X = LHS.getOperand(0);
20723 EVT VecVT = X.getValueType();
20724 unsigned BitWidth = VecVT.getSizeInBits();
20725 unsigned NumLanes = BitWidth / 128;
20726 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
20727  assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
20728         "Not expecting illegal vector widths here");
20729
20730 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
20731 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
20732 SDLoc DL(Op);
20733 if (BitWidth == 256 || BitWidth == 512) {
20734 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
20735 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
20736 LExtIndex %= NumEltsPerLane;
20737 }
20738
20739 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
20740 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
20741 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
20742 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
20743 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
20744 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
20745 DAG.getIntPtrConstant(LExtIndex / 2, DL));
20746}
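To make the mapping in the comments above concrete, here is a hedged SSE3 intrinsics sketch (not part of this file) of the scalar pattern add(extractelt(X,0), extractelt(X,1)) expressed as a horizontal add; the function name is invented and SSE3 support is assumed:

#include <pmmintrin.h>

static float SumLanes0And1(__m128 X) {
  __m128 H = _mm_hadd_ps(X, X);  // haddps: lane 0 of H becomes X[0] + X[1]
  return _mm_cvtss_f32(H);       // extractelt (hadd X, X), 0
}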
20747
20748/// Depending on uarch and/or optimizing for size, we might prefer to use a
20749/// vector operation in place of the typical scalar operation.
20750SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
20751  assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
20752         "Only expecting float/double");
20753 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
20754}
20755
20756/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
20757/// This mode isn't supported in hardware on X86. But as long as we aren't
20758/// compiling with trapping math, we can emulate this with
20759/// floor(X + copysign(nextafter(0.5, 0.0), X)).
20760static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
20761 SDValue N0 = Op.getOperand(0);
20762 SDLoc dl(Op);
20763 MVT VT = Op.getSimpleValueType();
20764
20765 // N0 += copysign(nextafter(0.5, 0.0), N0)
20766 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
20767 bool Ignored;
20768 APFloat Point5Pred = APFloat(0.5f);
20769 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
20770 Point5Pred.next(/*nextDown*/true);
20771
20772 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
20773 DAG.getConstantFP(Point5Pred, dl, VT), N0);
20774 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
20775
20776 // Truncate the result to remove fraction.
20777 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
20778}
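As a hedged scalar sketch of the emulation described above (not part of this file), the floor(X + copysign(nextafter(0.5, 0.0), X)) idea looks like this in plain C++; the function name is invented:

#include <cmath>

static double RoundHalfAwayFromZero(double X) {
  // nextafter(0.5, 0.0) is the largest double strictly below 0.5.
  double Adder = std::copysign(std::nextafter(0.5, 0.0), X);
  return std::trunc(X + Adder);  // FADD followed by FTRUNC, as in the lowering
}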
20779
20780/// The only differences between FABS and FNEG are the mask and the logic op.
20781/// FNEG also has a folding opportunity for FNEG(FABS(x)).
20782static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
20783  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
20784         "Wrong opcode for lowering FABS or FNEG.");
20785
20786 bool IsFABS = (Op.getOpcode() == ISD::FABS);
20787
20788 // If this is a FABS and it has an FNEG user, bail out to fold the combination
20789 // into an FNABS. We'll lower the FABS after that if it is still in use.
20790 if (IsFABS)
20791 for (SDNode *User : Op->uses())
20792 if (User->getOpcode() == ISD::FNEG)
20793 return Op;
20794
20795 SDLoc dl(Op);
20796 MVT VT = Op.getSimpleValueType();
20797
20798 bool IsF128 = (VT == MVT::f128);
20799  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
20800          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
20801          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
20802         "Unexpected type in LowerFABSorFNEG");
20803
20804 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
20805 // decide if we should generate a 16-byte constant mask when we only need 4 or
20806 // 8 bytes for the scalar case.
20807
20808 // There are no scalar bitwise logical SSE/AVX instructions, so we
20809 // generate a 16-byte vector constant and logic op even for the scalar case.
20810 // Using a 16-byte mask allows folding the load of the mask with
20811 // the logic op, so it can save (~4 bytes) on code size.
20812 bool IsFakeVector = !VT.isVector() && !IsF128;
20813 MVT LogicVT = VT;
20814 if (IsFakeVector)
20815 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
20816
20817 unsigned EltBits = VT.getScalarSizeInBits();
20818 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
20819 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
20820 APInt::getSignMask(EltBits);
20821 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
20822 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
20823
20824 SDValue Op0 = Op.getOperand(0);
20825 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
20826 unsigned LogicOp = IsFABS ? X86ISD::FAND :
20827 IsFNABS ? X86ISD::FOR :
20828 X86ISD::FXOR;
20829 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
20830
20831 if (VT.isVector() || IsF128)
20832 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
20833
20834 // For the scalar case extend to a 128-bit vector, perform the logic op,
20835 // and extract the scalar result back out.
20836 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
20837 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
20838 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
20839 DAG.getIntPtrConstant(0, dl));
20840}
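A hedged bit-level sketch of the mask idea above (not part of this file): FABS clears the sign bit with an AND mask of 0x7f..., FNEG flips it with an XOR mask of 0x80...; the function names are invented:

#include <cstdint>
#include <cstring>

static float FabsViaMask(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits &= 0x7fffffffu;              // corresponds to FAND with the FABS mask
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}

static float FnegViaMask(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u;              // corresponds to FXOR with the sign mask
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}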
20841
20842static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
20843 SDValue Mag = Op.getOperand(0);
20844 SDValue Sign = Op.getOperand(1);
20845 SDLoc dl(Op);
20846
20847 // If the sign operand is smaller, extend it first.
20848 MVT VT = Op.getSimpleValueType();
20849 if (Sign.getSimpleValueType().bitsLT(VT))
20850 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
20851
20852 // And if it is bigger, shrink it first.
20853 if (Sign.getSimpleValueType().bitsGT(VT))
20854 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
20855
20856 // At this point the operands and the result should have the same
20857 // type, and that won't be f80 since that is not custom lowered.
20858 bool IsF128 = (VT == MVT::f128);
20859  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
20860          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
20861          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
20862         "Unexpected type in LowerFCOPYSIGN");
20863
20864 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
20865
20866 // Perform all scalar logic operations as 16-byte vectors because there are no
20867 // scalar FP logic instructions in SSE.
20868 // TODO: This isn't necessary. If we used scalar types, we might avoid some
20869 // unnecessary splats, but we might miss load folding opportunities. Should
20870 // this decision be based on OptimizeForSize?
20871 bool IsFakeVector = !VT.isVector() && !IsF128;
20872 MVT LogicVT = VT;
20873 if (IsFakeVector)
20874 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
20875
20876 // The mask constants are automatically splatted for vector types.
20877 unsigned EltSizeInBits = VT.getScalarSizeInBits();
20878 SDValue SignMask = DAG.getConstantFP(
20879 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
20880 SDValue MagMask = DAG.getConstantFP(
20881 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
20882
20883 // First, clear all bits but the sign bit from the second operand (sign).
20884 if (IsFakeVector)
20885 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
20886 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
20887
20888 // Next, clear the sign bit from the first operand (magnitude).
20889 // TODO: If we had general constant folding for FP logic ops, this check
20890 // wouldn't be necessary.
20891 SDValue MagBits;
20892 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
20893 APFloat APF = Op0CN->getValueAPF();
20894 APF.clearSign();
20895 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
20896 } else {
20897 // If the magnitude operand wasn't a constant, we need to AND out the sign.
20898 if (IsFakeVector)
20899 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
20900 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
20901 }
20902
20903 // OR the magnitude value with the sign bit.
20904 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
20905 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
20906 DAG.getIntPtrConstant(0, dl));
20907}
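A hedged bit-level sketch of the AND/AND/OR sequence above for a scalar float (not part of this file); the function name is invented:

#include <cstdint>
#include <cstring>

static float CopySignViaMasks(float Mag, float Sign) {
  uint32_t M, S;
  std::memcpy(&M, &Mag, sizeof(M));
  std::memcpy(&S, &Sign, sizeof(S));
  uint32_t SignBit = S & 0x80000000u;   // FAND with SignMask
  uint32_t MagBits = M & 0x7fffffffu;   // FAND with MagMask
  uint32_t Merged  = MagBits | SignBit; // FOR
  float Res;
  std::memcpy(&Res, &Merged, sizeof(Res));
  return Res;
}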
20908
20909static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
20910 SDValue N0 = Op.getOperand(0);
20911 SDLoc dl(Op);
20912 MVT VT = Op.getSimpleValueType();
20913
20914 MVT OpVT = N0.getSimpleValueType();
20915  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
20916         "Unexpected type for FGETSIGN");
20917
20918 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
20919 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
20920 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
20921 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
20922 Res = DAG.getZExtOrTrunc(Res, dl, VT);
20923 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
20924 return Res;
20925}
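A hedged intrinsics sketch of the MOVMSK-based lowering above (not part of this file); the function name is invented and SSE support is assumed:

#include <xmmintrin.h>

static int GetSignBit(float X) {
  // movmskps packs the sign bits of all four lanes; lane 0 holds X.
  return _mm_movemask_ps(_mm_set_ss(X)) & 1;
}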
20926
20927/// Helper for creating a X86ISD::SETCC node.
20928static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
20929 SelectionDAG &DAG) {
20930 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
20931 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
20932}
20933
20934/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
20935/// style scalarized (associative) reduction patterns.
20936static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
20937 SmallVectorImpl<SDValue> &SrcOps) {
20938 SmallVector<SDValue, 8> Opnds;
20939 DenseMap<SDValue, APInt> SrcOpMap;
20940 EVT VT = MVT::Other;
20941
20942 // Recognize a special case where a vector is casted into wide integer to
20943 // test all 0s.
20944  assert(Op.getOpcode() == unsigned(BinOp) &&
20945         "Unexpected bit reduction opcode");
20946 Opnds.push_back(Op.getOperand(0));
20947 Opnds.push_back(Op.getOperand(1));
20948
20949 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
20950 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
20951 // BFS traverse all BinOp operands.
20952 if (I->getOpcode() == unsigned(BinOp)) {
20953 Opnds.push_back(I->getOperand(0));
20954 Opnds.push_back(I->getOperand(1));
20955 // Re-evaluate the number of nodes to be traversed.
20956 e += 2; // 2 more nodes (LHS and RHS) are pushed.
20957 continue;
20958 }
20959
20960    // Quit if this is not an EXTRACT_VECTOR_ELT.
20961 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
20962 return false;
20963
20964 // Quit if without a constant index.
20965 SDValue Idx = I->getOperand(1);
20966 if (!isa<ConstantSDNode>(Idx))
20967 return false;
20968
20969 SDValue Src = I->getOperand(0);
20970 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
20971 if (M == SrcOpMap.end()) {
20972 VT = Src.getValueType();
20973 // Quit if not the same type.
20974 if (SrcOpMap.begin() != SrcOpMap.end() &&
20975 VT != SrcOpMap.begin()->first.getValueType())
20976 return false;
20977 unsigned NumElts = VT.getVectorNumElements();
20978 APInt EltCount = APInt::getNullValue(NumElts);
20979 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
20980 SrcOps.push_back(Src);
20981 }
20982 // Quit if element already used.
20983 unsigned CIdx = cast<ConstantSDNode>(Idx)->getZExtValue();
20984 if (M->second[CIdx])
20985 return false;
20986 M->second.setBit(CIdx);
20987 }
20988
20989 // Quit if not all elements are used.
20990 for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
20991 E = SrcOpMap.end();
20992 I != E; ++I) {
20993 if (!I->second.isAllOnesValue())
20994 return false;
20995 }
20996
20997 return true;
20998}
20999
21000// Check whether an OR'd tree is PTEST-able.
21001static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
21002 const X86Subtarget &Subtarget,
21003 SelectionDAG &DAG, SDValue &X86CC) {
21004  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
21005
21006 if (!Subtarget.hasSSE41() || !Op->hasOneUse())
21007 return SDValue();
21008
21009 SmallVector<SDValue, 8> VecIns;
21010 if (!matchScalarReduction(Op, ISD::OR, VecIns))
21011 return SDValue();
21012
21013 // Quit if not 128/256-bit vector.
21014 EVT VT = VecIns[0].getValueType();
21015 if (!VT.is128BitVector() && !VT.is256BitVector())
21016 return SDValue();
21017
21018 SDLoc DL(Op);
21019 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
21020
21021 // Cast all vectors into TestVT for PTEST.
21022 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
21023 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
21024
21025 // If more than one full vector is evaluated, OR them first before PTEST.
21026 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
21027 // Each iteration will OR 2 nodes and append the result until there is only
21028 // 1 node left, i.e. the final OR'd value of all vectors.
21029 SDValue LHS = VecIns[Slot];
21030 SDValue RHS = VecIns[Slot + 1];
21031 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
21032 }
21033
21034 X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE,
21035 DL, MVT::i8);
21036 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
21037}
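A hedged SSE4.1 intrinsics sketch of the resulting PTEST (not part of this file): two 128-bit sources are OR'd first and then tested for all-zero, as in the loop above; the function name is invented and SSE4.1 support is assumed:

#include <smmintrin.h>

static bool AllBitsZero(__m128i A, __m128i B) {
  __m128i Or = _mm_or_si128(A, B);      // OR the full vectors before PTEST
  return _mm_testz_si128(Or, Or) != 0;  // ptest sets ZF iff (Or & Or) == 0
}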
21038
21039/// return true if \c Op has a use that doesn't just read flags.
21040static bool hasNonFlagsUse(SDValue Op) {
21041 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
21042 ++UI) {
21043 SDNode *User = *UI;
21044 unsigned UOpNo = UI.getOperandNo();
21045 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
21046      // Look past the truncate.
21047 UOpNo = User->use_begin().getOperandNo();
21048 User = *User->use_begin();
21049 }
21050
21051 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
21052 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
21053 return true;
21054 }
21055 return false;
21056}
21057
21058// Transform to an x86-specific ALU node with flags if there is a chance of
21059// using an RMW op or only the flags are used. Otherwise, leave
21060// the node alone and emit a 'cmp' or 'test' instruction.
21061static bool isProfitableToUseFlagOp(SDValue Op) {
21062 for (SDNode *U : Op->uses())
21063 if (U->getOpcode() != ISD::CopyToReg &&
21064 U->getOpcode() != ISD::SETCC &&
21065 U->getOpcode() != ISD::STORE)
21066 return false;
21067
21068 return true;
21069}
21070
21071/// Emit nodes that will be selected as "test Op0,Op0", or something
21072/// equivalent.
21073static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
21074 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
21075 // CF and OF aren't always set the way we want. Determine which
21076 // of these we need.
21077 bool NeedCF = false;
21078 bool NeedOF = false;
21079 switch (X86CC) {
21080 default: break;
21081 case X86::COND_A: case X86::COND_AE:
21082 case X86::COND_B: case X86::COND_BE:
21083 NeedCF = true;
21084 break;
21085 case X86::COND_G: case X86::COND_GE:
21086 case X86::COND_L: case X86::COND_LE:
21087 case X86::COND_O: case X86::COND_NO: {
21088 // Check if we really need to set the
21089 // Overflow flag. If NoSignedWrap is present
21090 // that is not actually needed.
21091 switch (Op->getOpcode()) {
21092 case ISD::ADD:
21093 case ISD::SUB:
21094 case ISD::MUL:
21095 case ISD::SHL:
21096 if (Op.getNode()->getFlags().hasNoSignedWrap())
21097 break;
21098      LLVM_FALLTHROUGH;
21099 default:
21100 NeedOF = true;
21101 break;
21102 }
21103 break;
21104 }
21105 }
21106 // See if we can use the EFLAGS value from the operand instead of
21107 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
21108 // we prove that the arithmetic won't overflow, we can't use OF or CF.
21109 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
21110 // Emit a CMP with 0, which is the TEST pattern.
21111 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
21112 DAG.getConstant(0, dl, Op.getValueType()));
21113 }
21114 unsigned Opcode = 0;
21115 unsigned NumOperands = 0;
21116
21117 SDValue ArithOp = Op;
21118
21119 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
21120 // which may be the result of a CAST. We use the variable 'Op', which is the
21121 // non-casted variable when we check for possible users.
21122 switch (ArithOp.getOpcode()) {
21123 case ISD::AND:
21124 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
21125 // because a TEST instruction will be better.
21126 if (!hasNonFlagsUse(Op))
21127 break;
21128
21129    LLVM_FALLTHROUGH;
21130 case ISD::ADD:
21131 case ISD::SUB:
21132 case ISD::OR:
21133 case ISD::XOR:
21134 if (!isProfitableToUseFlagOp(Op))
21135 break;
21136
21137 // Otherwise use a regular EFLAGS-setting instruction.
21138 switch (ArithOp.getOpcode()) {
21139    default: llvm_unreachable("unexpected operator!");
21140 case ISD::ADD: Opcode = X86ISD::ADD; break;
21141 case ISD::SUB: Opcode = X86ISD::SUB; break;
21142 case ISD::XOR: Opcode = X86ISD::XOR; break;
21143 case ISD::AND: Opcode = X86ISD::AND; break;
21144 case ISD::OR: Opcode = X86ISD::OR; break;
21145 }
21146
21147 NumOperands = 2;
21148 break;
21149 case X86ISD::ADD:
21150 case X86ISD::SUB:
21151 case X86ISD::OR:
21152 case X86ISD::XOR:
21153 case X86ISD::AND:
21154 return SDValue(Op.getNode(), 1);
21155 case ISD::SSUBO:
21156 case ISD::USUBO: {
21157    // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
21158 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
21159 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
21160 Op->getOperand(1)).getValue(1);
21161 }
21162 default:
21163 break;
21164 }
21165
21166 if (Opcode == 0) {
21167 // Emit a CMP with 0, which is the TEST pattern.
21168 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
21169 DAG.getConstant(0, dl, Op.getValueType()));
21170 }
21171 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
21172 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
21173
21174 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
21175 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
21176 return SDValue(New.getNode(), 1);
21177}
21178
21179/// Emit nodes that will be selected as "cmp Op0,Op1", or something
21180/// equivalent.
21181static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
21182 const SDLoc &dl, SelectionDAG &DAG,
21183 const X86Subtarget &Subtarget) {
21184 if (isNullConstant(Op1))
21185 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
21186
21187 EVT CmpVT = Op0.getValueType();
21188
21189  assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
21190          CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
21191
21192 // Only promote the compare up to I32 if it is a 16 bit operation
21193 // with an immediate. 16 bit immediates are to be avoided.
21194 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
21195 !DAG.getMachineFunction().getFunction().hasMinSize()) {
21196 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
21197 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
21198 // Don't do this if the immediate can fit in 8-bits.
21199 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
21200 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
21201 unsigned ExtendOp =
21202 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
21203 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
21204 // For equality comparisons try to use SIGN_EXTEND if the input was
21205 // truncate from something with enough sign bits.
21206 if (Op0.getOpcode() == ISD::TRUNCATE) {
21207 SDValue In = Op0.getOperand(0);
21208 unsigned EffBits =
21209 In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
21210 if (EffBits <= 16)
21211 ExtendOp = ISD::SIGN_EXTEND;
21212 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
21213 SDValue In = Op1.getOperand(0);
21214 unsigned EffBits =
21215 In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
21216 if (EffBits <= 16)
21217 ExtendOp = ISD::SIGN_EXTEND;
21218 }
21219 }
21220
21221 CmpVT = MVT::i32;
21222 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
21223 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
21224 }
21225 }
21226
21227 // Try to shrink i64 compares if the input has enough zero bits.
21228 // FIXME: Do this for non-constant compares for constant on LHS?
21229 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
21230 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
21231 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
21232 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
21233 CmpVT = MVT::i32;
21234 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
21235 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
21236 }
21237
21238 // 0-x == y --> x+y == 0
21239 // 0-x != y --> x+y != 0
21240 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
21241 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
21242 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
21243 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
21244 return Add.getValue(1);
21245 }
21246
21247 // x == 0-y --> x+y == 0
21248 // x != 0-y --> x+y != 0
21249 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
21250 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
21251 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
21252 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
21253 return Add.getValue(1);
21254 }
21255
21256 // Use SUB instead of CMP to enable CSE between SUB and CMP.
21257 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
21258 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
21259 return Sub.getValue(1);
21260}
21261
21262/// Check if replacement of SQRT with RSQRT should be disabled.
21263bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
21264 EVT VT = Op.getValueType();
21265
21266 // We never want to use both SQRT and RSQRT instructions for the same input.
21267 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
21268 return false;
21269
21270 if (VT.isVector())
21271 return Subtarget.hasFastVectorFSQRT();
21272 return Subtarget.hasFastScalarFSQRT();
21273}
21274
21275/// The minimum architected relative accuracy is 2^-12. We need one
21276/// Newton-Raphson step to have a good float result (24 bits of precision).
21277SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
21278 SelectionDAG &DAG, int Enabled,
21279 int &RefinementSteps,
21280 bool &UseOneConstNR,
21281 bool Reciprocal) const {
21282 EVT VT = Op.getValueType();
21283
21284 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
21285 // It is likely not profitable to do this for f64 because a double-precision
21286 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
21287 // instructions: convert to single, rsqrtss, convert back to double, refine
21288 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
21289 // along with FMA, this could be a throughput win.
21290 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
21291 // after legalize types.
21292 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
21293 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
21294 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
21295 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
21296 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
21297 if (RefinementSteps == ReciprocalEstimate::Unspecified)
21298 RefinementSteps = 1;
21299
21300 UseOneConstNR = false;
21301 // There is no FSQRT for 512-bits, but there is RSQRT14.
21302 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
21303 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
21304 }
21305 return SDValue();
21306}
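For reference, the single refinement step mentioned above corresponds to the standard Newton-Raphson iteration for 1/sqrt(a); a hedged scalar sketch (not part of this file, helper name invented):

static float RefineRsqrt(float A, float Est) {
  // One Newton-Raphson step: y1 = y0 * (1.5 - 0.5 * a * y0 * y0)
  return Est * (1.5f - 0.5f * A * Est * Est);
}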
21307
21308/// The minimum architected relative accuracy is 2^-12. We need one
21309/// Newton-Raphson step to have a good float result (24 bits of precision).
21310SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
21311 int Enabled,
21312 int &RefinementSteps) const {
21313 EVT VT = Op.getValueType();
21314
21315 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
21316 // It is likely not profitable to do this for f64 because a double-precision
21317 // reciprocal estimate with refinement on x86 prior to FMA requires
21318 // 15 instructions: convert to single, rcpss, convert back to double, refine
21319 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
21320 // along with FMA, this could be a throughput win.
21321
21322 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
21323 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
21324 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
21325 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
21326 // Enable estimate codegen with 1 refinement step for vector division.
21327 // Scalar division estimates are disabled because they break too much
21328 // real-world code. These defaults are intended to match GCC behavior.
21329 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
21330 return SDValue();
21331
21332 if (RefinementSteps == ReciprocalEstimate::Unspecified)
21333 RefinementSteps = 1;
21334
21335    // There is no FRCP for 512-bits, but there is RCP14.
21336 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
21337 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
21338 }
21339 return SDValue();
21340}
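Likewise, one refinement step for the reciprocal estimate is the standard Newton-Raphson iteration for 1/a; a hedged scalar sketch (not part of this file, helper name invented):

static float RefineRecip(float A, float Est) {
  // One Newton-Raphson step: y1 = y0 * (2 - a * y0)
  return Est * (2.0f - A * Est);
}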
21341
21342/// If we have at least two divisions that use the same divisor, convert to
21343/// multiplication by a reciprocal. This may need to be adjusted for a given
21344/// CPU if a division's cost is not at least twice the cost of a multiplication.
21345/// This is because we still need one division to calculate the reciprocal and
21346/// then we need two multiplies by that reciprocal as replacements for the
21347/// original divisions.
21348unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
21349 return 2;
21350}
21351
21352SDValue
21353X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
21354 SelectionDAG &DAG,
21355 SmallVectorImpl<SDNode *> &Created) const {
21356 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
21357 if (isIntDivCheap(N->getValueType(0), Attr))
21358 return SDValue(N,0); // Lower SDIV as SDIV
21359
21360  assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
21361         "Unexpected divisor!");
21362
21363 // Only perform this transform if CMOV is supported otherwise the select
21364 // below will become a branch.
21365 if (!Subtarget.hasCMov())
21366 return SDValue();
21367
21368 // fold (sdiv X, pow2)
21369 EVT VT = N->getValueType(0);
21370 // FIXME: Support i8.
21371 if (VT != MVT::i16 && VT != MVT::i32 &&
21372 !(Subtarget.is64Bit() && VT == MVT::i64))
21373 return SDValue();
21374
21375 unsigned Lg2 = Divisor.countTrailingZeros();
21376
21377 // If the divisor is 2 or -2, the default expansion is better.
21378 if (Lg2 == 1)
21379 return SDValue();
21380
21381 SDLoc DL(N);
21382 SDValue N0 = N->getOperand(0);
21383 SDValue Zero = DAG.getConstant(0, DL, VT);
21384 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
21385 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
21386
21387 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
21388 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
21389 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
21390 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
21391
21392 Created.push_back(Cmp.getNode());
21393 Created.push_back(Add.getNode());
21394 Created.push_back(CMov.getNode());
21395
21396 // Divide by pow2.
21397 SDValue SRA =
21398 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
21399
21400 // If we're dividing by a positive value, we're done. Otherwise, we must
21401 // negate the result.
21402 if (Divisor.isNonNegative())
21403 return SRA;
21404
21405 Created.push_back(SRA.getNode());
21406 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
21407}
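A hedged scalar sketch of the sequence built above for a 32-bit dividend (not part of this file): bias a negative dividend by (2^Lg2 - 1) via a select, arithmetic-shift right, and negate when the divisor is negative. The function name is invented and an arithmetic right shift for signed values is assumed:

#include <cstdint>

static int32_t SDivByPow2(int32_t N0, unsigned Lg2, bool DivisorIsNegative) {
  int32_t Pow2MinusOne = static_cast<int32_t>((1u << Lg2) - 1);
  int32_t Biased = (N0 < 0) ? N0 + Pow2MinusOne : N0; // SETLT + ADD + CMOV
  int32_t Shifted = Biased >> Lg2;                    // SRA by Lg2
  return DivisorIsNegative ? -Shifted : Shifted;      // SUB 0, SRA if needed
}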
21408
21409/// Result of 'and' is compared against zero. Change to a BT node if possible.
21410/// Returns the BT node and the condition code needed to use it.
21411static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
21412 const SDLoc &dl, SelectionDAG &DAG,
21413 SDValue &X86CC) {
21414  assert(And.getOpcode() == ISD::AND && "Expected AND node!");
21415 SDValue Op0 = And.getOperand(0);
21416 SDValue Op1 = And.getOperand(1);
21417 if (Op0.getOpcode() == ISD::TRUNCATE)
21418 Op0 = Op0.getOperand(0);
21419 if (Op1.getOpcode() == ISD::TRUNCATE)
21420 Op1 = Op1.getOperand(0);
21421
21422 SDValue Src, BitNo;
21423 if (Op1.getOpcode() == ISD::SHL)
21424 std::swap(Op0, Op1);
21425 if (Op0.getOpcode() == ISD::SHL) {
21426 if (isOneConstant(Op0.getOperand(0))) {
21427 // If we looked past a truncate, check that it's only truncating away
21428 // known zeros.
21429 unsigned BitWidth = Op0.getValueSizeInBits();
21430 unsigned AndBitWidth = And.getValueSizeInBits();
21431 if (BitWidth > AndBitWidth) {
21432 KnownBits Known = DAG.computeKnownBits(Op0);
21433 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
21434 return SDValue();
21435 }
21436 Src = Op1;
21437 BitNo = Op0.getOperand(1);
21438 }
21439 } else if (Op1.getOpcode() == ISD::Constant) {
21440 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
21441 uint64_t AndRHSVal = AndRHS->getZExtValue();
21442 SDValue AndLHS = Op0;
21443
21444 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
21445 Src = AndLHS.getOperand(0);
21446 BitNo = AndLHS.getOperand(1);
21447 } else {
21448 // Use BT if the immediate can't be encoded in a TEST instruction or we
21449      // are optimizing for size and the immediate won't fit in a byte.
21450 bool OptForSize = DAG.shouldOptForSize();
21451 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
21452 isPowerOf2_64(AndRHSVal)) {
21453 Src = AndLHS;
21454 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
21455 Src.getValueType());
21456 }
21457 }
21458 }
21459
21460 // No patterns found, give up.
21461 if (!Src.getNode())
21462 return SDValue();
21463
21464 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
21465 // instruction. Since the shift amount is in-range-or-undefined, we know
21466 // that doing a bittest on the i32 value is ok. We extend to i32 because
21467 // the encoding for the i16 version is larger than the i32 version.
21468  // Also promote i16 to i32 for performance / code size reasons.
21469 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
21470 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
21471
21472 // See if we can use the 32-bit instruction instead of the 64-bit one for a
21473 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
21474 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
21475 // known to be zero.
21476 if (Src.getValueType() == MVT::i64 &&
21477 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
21478 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
21479
21480 // If the operand types disagree, extend the shift amount to match. Since
21481 // BT ignores high bits (like shifts) we can use anyextend.
21482 if (Src.getValueType() != BitNo.getValueType())
21483 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
21484
21485 X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
21486 dl, MVT::i8);
21487 return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
21488}
21489
21490/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
21491/// CMPs.
21492static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
21493 SDValue &Op1, bool &IsAlwaysSignaling) {
21494 unsigned SSECC;
21495 bool Swap = false;
21496
21497 // SSE Condition code mapping:
21498 // 0 - EQ
21499 // 1 - LT
21500 // 2 - LE
21501 // 3 - UNORD
21502 // 4 - NEQ
21503 // 5 - NLT
21504 // 6 - NLE
21505 // 7 - ORD
21506 switch (SetCCOpcode) {
21507  default: llvm_unreachable("Unexpected SETCC condition");
21508 case ISD::SETOEQ:
21509 case ISD::SETEQ: SSECC = 0; break;
21510 case ISD::SETOGT:
21511  case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
21512 case ISD::SETLT:
21513 case ISD::SETOLT: SSECC = 1; break;
21514 case ISD::SETOGE:
21515  case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
21516 case ISD::SETLE:
21517 case ISD::SETOLE: SSECC = 2; break;
21518 case ISD::SETUO: SSECC = 3; break;
21519 case ISD::SETUNE:
21520 case ISD::SETNE: SSECC = 4; break;
21521  case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
21522  case ISD::SETUGE: SSECC = 5; break;
21523  case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
21524 case ISD::SETUGT: SSECC = 6; break;
21525 case ISD::SETO: SSECC = 7; break;
21526 case ISD::SETUEQ: SSECC = 8; break;
21527 case ISD::SETONE: SSECC = 12; break;
21528 }
21529 if (Swap)
21530 std::swap(Op0, Op1);
21531
21532 switch (SetCCOpcode) {
21533 default:
21534 IsAlwaysSignaling = true;
21535 break;
21536 case ISD::SETEQ:
21537 case ISD::SETOEQ:
21538 case ISD::SETUEQ:
21539 case ISD::SETNE:
21540 case ISD::SETONE:
21541 case ISD::SETUNE:
21542 case ISD::SETO:
21543 case ISD::SETUO:
21544 IsAlwaysSignaling = false;
21545 break;
21546 }
21547
21548 return SSECC;
21549}
21550
21551/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
21552/// concatenate the result back.
21553static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
21554 MVT VT = Op.getSimpleValueType();
21555
21556  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
21557         "Unsupported value type for operation");
21558
21559 unsigned NumElems = VT.getVectorNumElements();
21560 SDLoc dl(Op);
21561 SDValue CC = Op.getOperand(2);
21562
21563 // Extract the LHS vectors
21564 SDValue LHS = Op.getOperand(0);
21565 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
21566 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
21567
21568 // Extract the RHS vectors
21569 SDValue RHS = Op.getOperand(1);
21570 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
21571 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
21572
21573 // Issue the operation on the smaller types and concatenate the result back
21574 MVT EltVT = VT.getVectorElementType();
21575 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21576 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21577 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
21578 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
21579}
21580
21581static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
21582
21583 SDValue Op0 = Op.getOperand(0);
21584 SDValue Op1 = Op.getOperand(1);
21585 SDValue CC = Op.getOperand(2);
21586 MVT VT = Op.getSimpleValueType();
21587 SDLoc dl(Op);
21588
21589  assert(VT.getVectorElementType() == MVT::i1 &&
21590         "Cannot set masked compare for this operation");
21591
21592 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
21593
21594 // Prefer SETGT over SETLT.
21595 if (SetCCOpcode == ISD::SETLT) {
21596 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
21597 std::swap(Op0, Op1);
21598 }
21599
21600 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
21601}
21602
21603/// Given a buildvector constant, return a new vector constant with each element
21604/// incremented or decremented. If incrementing or decrementing would result in
21605/// unsigned overflow or underflow or this is not a simple vector constant,
21606/// return an empty value.
21607static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
21608 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
21609 if (!BV)
21610 return SDValue();
21611
21612 MVT VT = V.getSimpleValueType();
21613 MVT EltVT = VT.getVectorElementType();
21614 unsigned NumElts = VT.getVectorNumElements();
21615 SmallVector<SDValue, 8> NewVecC;
21616 SDLoc DL(V);
21617 for (unsigned i = 0; i < NumElts; ++i) {
21618 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
21619 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
21620 return SDValue();
21621
21622 // Avoid overflow/underflow.
21623 const APInt &EltC = Elt->getAPIntValue();
21624 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
21625 return SDValue();
21626
21627 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
21628 }
21629
21630 return DAG.getBuildVector(VT, DL, NewVecC);
21631}
21632
21633/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
21634/// Op0 u<= Op1:
21635/// t = psubus Op0, Op1
21636/// pcmpeq t, <0..0>
21637static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
21638 ISD::CondCode Cond, const SDLoc &dl,
21639 const X86Subtarget &Subtarget,
21640 SelectionDAG &DAG) {
21641 if (!Subtarget.hasSSE2())
21642 return SDValue();
21643
21644 MVT VET = VT.getVectorElementType();
21645 if (VET != MVT::i8 && VET != MVT::i16)
21646 return SDValue();
21647
21648 switch (Cond) {
21649 default:
21650 return SDValue();
21651 case ISD::SETULT: {
21652 // If the comparison is against a constant we can turn this into a
21653 // setule. With psubus, setule does not require a swap. This is
21654    // beneficial because the constant in the register is no longer
21655    // clobbered as the destination, so it can be hoisted out of a loop.
21656 // Only do this pre-AVX since vpcmp* is no longer destructive.
21657 if (Subtarget.hasAVX())
21658 return SDValue();
21659 SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
21660 if (!ULEOp1)
21661 return SDValue();
21662 Op1 = ULEOp1;
21663 break;
21664 }
21665 case ISD::SETUGT: {
21666 // If the comparison is against a constant, we can turn this into a setuge.
21667 // This is beneficial because materializing a constant 0 for the PCMPEQ is
21668 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
21669 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
21670 SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
21671 if (!UGEOp1)
21672 return SDValue();
21673 Op1 = Op0;
21674 Op0 = UGEOp1;
21675 break;
21676 }
21677 // Psubus is better than flip-sign because it requires no inversion.
21678 case ISD::SETUGE:
21679 std::swap(Op0, Op1);
21680 break;
21681 case ISD::SETULE:
21682 break;
21683 }
21684
21685 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
21686 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
21687 DAG.getConstant(0, dl, VT));
21688}
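A hedged SSE2 intrinsics sketch of the PSUBUS form for a v16i8 unsigned less-or-equal compare, as produced above (not part of this file); the function name is invented:

#include <emmintrin.h>

static __m128i CmpULE_epu8(__m128i A, __m128i B) {
  // A u<= B  <=>  (A saturating- B) == 0
  __m128i Sat = _mm_subs_epu8(A, B);                // ISD::USUBSAT (psubusb)
  return _mm_cmpeq_epi8(Sat, _mm_setzero_si128());  // PCMPEQ with zero
}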
21689
21690static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
21691 SelectionDAG &DAG) {
21692 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
21693 Op.getOpcode() == ISD::STRICT_FSETCCS;
21694 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
21695 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
21696 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
21697 MVT VT = Op->getSimpleValueType(0);
21698 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
21699 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
21700 SDLoc dl(Op);
21701
21702 if (isFP) {
21703#ifndef NDEBUG
21704 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
21705    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
21706#endif
21707
21708 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
21709 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21710
21711 // If we have a strict compare with a vXi1 result and the input is 128/256
21712    // bits, we can't use a masked compare unless we have VLX. If we use a wider
21713    // compare like we do for non-strict, we might trigger spurious exceptions
21714    // from the upper elements. Instead emit an AVX compare and convert to mask.
21715 unsigned Opc;
21716 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
21717 (!IsStrict || Subtarget.hasVLX() ||
21718 Op0.getSimpleValueType().is512BitVector())) {
21719      assert(VT.getVectorNumElements() <= 16);
21720 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
21721 } else {
21722 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
21723 // The SSE/AVX packed FP comparison nodes are defined with a
21724 // floating-point vector result that matches the operand type. This allows
21725 // them to work with an SSE1 target (integer vector types are not legal).
21726 VT = Op0.getSimpleValueType();
21727 }
21728
21729 SDValue Cmp;
21730 bool IsAlwaysSignaling;
21731 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
21732 if (!Subtarget.hasAVX()) {
21733 // TODO: We could use following steps to handle a quiet compare with
21734 // signaling encodings.
21735 // 1. Get ordered masks from a quiet ISD::SETO
21736 // 2. Use the masks to mask potential unordered elements in operand A, B
21737 // 3. Get the compare results of masked A, B
21738      // 4. Calculate the final result using the mask and the result from 3.
21739 // But currently, we just fall back to scalar operations.
21740 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
21741 return SDValue();
21742
21743 // Insert an extra signaling instruction to raise exception.
21744 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
21745 SDValue SignalCmp = DAG.getNode(
21746 Opc, dl, {VT, MVT::Other},
21747 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
21748 // FIXME: It seems we need to update the flags of all new strict nodes.
21749 // Otherwise, mayRaiseFPException in MI will return false due to
21750 // NoFPExcept = false by default. However, I didn't find it in other
21751 // patches.
21752 SignalCmp->setFlags(Op->getFlags());
21753 Chain = SignalCmp.getValue(1);
21754 }
21755
21756 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
21757 // emit two comparisons and a logic op to tie them together.
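      // For example (a sketch of the identities used, not target syntax):
      //   x ueq y  ==>  (x unord y) OR  (x oeq y)   i.e. CMPUNORD + CMPEQ  + OR
      //   x one y  ==>  (x ord   y) AND (x une y)   i.e. CMPORD   + CMPNEQ + AND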
21758 if (SSECC >= 8) {
21759 // LLVM predicate is SETUEQ or SETONE.
21760 unsigned CC0, CC1;
21761 unsigned CombineOpc;
21762 if (Cond == ISD::SETUEQ) {
21763 CC0 = 3; // UNORD
21764 CC1 = 0; // EQ
21765 CombineOpc = X86ISD::FOR;
21766 } else {
21767          assert(Cond == ISD::SETONE);
21768 CC0 = 7; // ORD
21769 CC1 = 4; // NEQ
21770 CombineOpc = X86ISD::FAND;
21771 }
21772
21773 SDValue Cmp0, Cmp1;
21774 if (IsStrict) {
21775 Cmp0 = DAG.getNode(
21776 Opc, dl, {VT, MVT::Other},
21777 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
21778 Cmp1 = DAG.getNode(
21779 Opc, dl, {VT, MVT::Other},
21780 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
21781 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
21782 Cmp1.getValue(1));
21783 } else {
21784 Cmp0 = DAG.getNode(
21785 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
21786 Cmp1 = DAG.getNode(
21787 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
21788 }
21789 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
21790 } else {
21791 if (IsStrict) {
21792 Cmp = DAG.getNode(
21793 Opc, dl, {VT, MVT::Other},
21794 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
21795 Chain = Cmp.getValue(1);
21796 } else
21797 Cmp = DAG.getNode(
21798 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
21799 }
21800 } else {
21801 // Handle all other FP comparisons here.
21802 if (IsStrict) {
21803 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
21804 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
21805 Cmp = DAG.getNode(
21806 Opc, dl, {VT, MVT::Other},
21807 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
21808 Chain = Cmp.getValue(1);
21809 } else
21810 Cmp = DAG.getNode(
21811 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
21812 }
21813
21814 if (VT.getSizeInBits() > Op.getSimpleValueType().getSizeInBits()) {
21815 // We emitted a compare with an XMM/YMM result. Finish converting to a
21816 // mask register using a vptestm.
21817 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
21818 Cmp = DAG.getBitcast(CastVT, Cmp);
21819 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
21820 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
21821 } else {
21822 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
21823 // the result type of SETCC. The bitcast is expected to be optimized
21824 // away during combining/isel.
21825 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
21826 }
21827
21828 if (IsStrict)
21829 return DAG.getMergeValues({Cmp, Chain}, dl);
21830
21831 return Cmp;
21832 }
21833
21834  assert(!IsStrict && "Strict SETCC only handles FP operands.");
21835
21836 MVT VTOp0 = Op0.getSimpleValueType();
21837 (void)VTOp0;
21838  assert(VTOp0 == Op1.getSimpleValueType() &&
21839         "Expected operands with same type!");
21840  assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
21841         "Invalid number of packed elements for source and destination!");
21842
21843 // The non-AVX512 code below works under the assumption that source and
21844 // destination types are the same.
21845  assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
21846         "Value types for source and destination must be the same!");
21847
21848 // The result is boolean, but operands are int/float
21849 if (VT.getVectorElementType() == MVT::i1) {
21850    // In the AVX-512 architecture setcc returns a mask with i1 elements,
21851    // but there is no compare instruction for i8 and i16 elements in KNL.
21852    assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
21853           "Unexpected operand type");
21854 return LowerIntVSETCC_AVX512(Op, DAG);
21855 }
21856
21857 // Lower using XOP integer comparisons.
21858 if (VT.is128BitVector() && Subtarget.hasXOP()) {
21859 // Translate compare code to XOP PCOM compare mode.
21860 unsigned CmpMode = 0;
21861 switch (Cond) {
21862    default: llvm_unreachable("Unexpected SETCC condition");
21863 case ISD::SETULT:
21864 case ISD::SETLT: CmpMode = 0x00; break;
21865 case ISD::SETULE:
21866 case ISD::SETLE: CmpMode = 0x01; break;
21867 case ISD::SETUGT:
21868 case ISD::SETGT: CmpMode = 0x02; break;
21869 case ISD::SETUGE:
21870 case ISD::SETGE: CmpMode = 0x03; break;
21871 case ISD::SETEQ: CmpMode = 0x04; break;
21872 case ISD::SETNE: CmpMode = 0x05; break;
21873 }
21874
21875 // Are we comparing unsigned or signed integers?
21876 unsigned Opc =
21877 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
21878
21879 return DAG.getNode(Opc, dl, VT, Op0, Op1,
21880 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
21881 }
21882
21883 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
21884 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
21885 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
21886 SDValue BC0 = peekThroughBitcasts(Op0);
21887 if (BC0.getOpcode() == ISD::AND) {
21888 APInt UndefElts;
21889 SmallVector<APInt, 64> EltBits;
21890 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
21891 VT.getScalarSizeInBits(), UndefElts,
21892 EltBits, false, false)) {
21893 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
21894 Cond = ISD::SETEQ;
21895 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
21896 }
21897 }
21898 }
21899 }
21900
21901 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
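  // Illustrative instance (made-up values): for v4i32 and C == 8 (bit 3 set),
  // ShiftAmt is 32 - 3 - 1 == 28, so the SHL moves bit 3 into the sign bit and
  // the SRA by 31 broadcasts it, yielding -1 in lanes where (X & 8) == 8 and 0
  // elsewhere.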
21902 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
21903 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
21904 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
21905 if (C1 && C1->getAPIntValue().isPowerOf2()) {
21906 unsigned BitWidth = VT.getScalarSizeInBits();
21907 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
21908
21909 SDValue Result = Op0.getOperand(0);
21910 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
21911 DAG.getConstant(ShiftAmt, dl, VT));
21912 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
21913 DAG.getConstant(BitWidth - 1, dl, VT));
21914 return Result;
21915 }
21916 }
21917
21918 // Break 256-bit integer vector compare into smaller ones.
21919 if (VT.is256BitVector() && !Subtarget.hasInt256())
21920 return Lower256IntVSETCC(Op, DAG);
21921
21922 // If this is a SETNE against the signed minimum value, change it to SETGT.
21923 // If this is a SETNE against the signed maximum value, change it to SETLT.
21924  // If this is a SETNE against the signed maximum value, change it to SETLT,
21925 // Otherwise we use PCMPEQ+invert.
21926 APInt ConstValue;
21927 if (Cond == ISD::SETNE &&
21928 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
21929 if (ConstValue.isMinSignedValue())
21930 Cond = ISD::SETGT;
21931 else if (ConstValue.isMaxSignedValue())
21932 Cond = ISD::SETLT;
21933 }
21934
21935 // If both operands are known non-negative, then an unsigned compare is the
21936 // same as a signed compare and there's no need to flip signbits.
21937 // TODO: We could check for more general simplifications here since we're
21938 // computing known bits.
21939 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
21940 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
21941
21942 // Special case: Use min/max operations for unsigned compares.
21943 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21944 if (ISD::isUnsignedIntSetCC(Cond) &&
21945 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
21946 TLI.isOperationLegal(ISD::UMIN, VT)) {
21947 // If we have a constant operand, increment/decrement it and change the
21948 // condition to avoid an invert.
21949 if (Cond == ISD::SETUGT) {
21950 // X > C --> X >= (C+1) --> X == umax(X, C+1)
21951 if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
21952 Op1 = UGTOp1;
21953 Cond = ISD::SETUGE;
21954 }
21955 }
21956 if (Cond == ISD::SETULT) {
21957 // X < C --> X <= (C-1) --> X == umin(X, C-1)
21958 if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
21959 Op1 = ULTOp1;
21960 Cond = ISD::SETULE;
21961 }
21962 }
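    // E.g. (illustrative): for v8i16, X u> <7,7,...> becomes X u>= <8,8,...>,
    // which the code below then matches as X == umax(X, <8,...>) with no
    // final invert.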
21963 bool Invert = false;
21964 unsigned Opc;
21965 switch (Cond) {
21966    default: llvm_unreachable("Unexpected condition code");
21967    case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
21968 case ISD::SETULE: Opc = ISD::UMIN; break;
21969    case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
21970 case ISD::SETUGE: Opc = ISD::UMAX; break;
21971 }
21972
21973 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
21974 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
21975
21976 // If the logical-not of the result is required, perform that now.
21977 if (Invert)
21978 Result = DAG.getNOT(dl, Result, VT);
21979
21980 return Result;
21981 }
21982
21983 // Try to use SUBUS and PCMPEQ.
21984 if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
21985 return V;
21986
21987 // We are handling one of the integer comparisons here. Since SSE only has
21988  // GT and EQ comparisons for integers, swapping operands and multiple
21989 // operations may be required for some comparisons.
21990 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
21991 : X86ISD::PCMPGT;
21992 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
21993 Cond == ISD::SETGE || Cond == ISD::SETUGE;
21994 bool Invert = Cond == ISD::SETNE ||
21995 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
21996
21997 if (Swap)
21998 std::swap(Op0, Op1);
21999
22000 // Check that the operation in question is available (most are plain SSE2,
22001 // but PCMPGTQ and PCMPEQQ have different requirements).
22002 if (VT == MVT::v2i64) {
22003 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
22004      assert(Subtarget.hasSSE2() && "Don't know how to lower!");
22005
22006 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
22007 // the odd elements over the even elements.
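      // Sketch of why this works: (0 s> X) per i64 lane is just the sign bit
      // of X, which lives in the high i32 of each lane, so the <1,1,3,3>
      // shuffle copies that high-half compare result over the low half too.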
22008 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
22009 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
22010 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
22011
22012 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
22013 static const int MaskHi[] = { 1, 1, 3, 3 };
22014 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
22015
22016 return DAG.getBitcast(VT, Result);
22017 }
22018
22019 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
22020 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
22021 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
22022
22023 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
22024 static const int MaskHi[] = { 1, 1, 3, 3 };
22025 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
22026
22027 return DAG.getBitcast(VT, Result);
22028 }
22029
22030 // Since SSE has no unsigned integer comparisons, we need to flip the sign
22031 // bits of the inputs before performing those operations. The lower
22032 // compare is always unsigned.
22033 SDValue SB;
22034 if (FlipSigns) {
22035 SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
22036 } else {
22037 SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
22038 }
22039 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
22040 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
22041
22042 // Cast everything to the right type.
22043 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
22044 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
22045
22046 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
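      // Illustrative example (made-up values): with FlipSigns == false,
      // comparing A = 0x00000001FFFFFFFF s> B = 0x0000000100000000 the high
      // halves are equal, so the result comes from the low halves, where the
      // 0x80000000 bias lets the signed PCMPGTD act as an unsigned compare
      // and correctly reports 0xFFFFFFFF u> 0x00000000 for this lane.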
22047 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
22048 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
22049
22050      // Create masks for only the low parts/high parts of the 64-bit integers.
22051 static const int MaskHi[] = { 1, 1, 3, 3 };
22052 static const int MaskLo[] = { 0, 0, 2, 2 };
22053 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
22054 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
22055 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
22056
22057 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
22058 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
22059
22060 if (Invert)
22061 Result = DAG.getNOT(dl, Result, MVT::v4i32);
22062
22063 return DAG.getBitcast(VT, Result);
22064 }
22065
22066 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
22067 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
22068 // pcmpeqd + pshufd + pand.
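      // Sketch: two i64 lanes are equal iff both of their i32 halves are
      // equal, so PCMPEQD compares the halves, the <1,0,3,2> shuffle swaps
      // the halves within each i64, and the AND keeps a lane all-ones only
      // if both halves matched.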
22069      assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
22070
22071 // First cast everything to the right type.
22072 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
22073 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
22074
22075 // Do the compare.
22076 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
22077
22078 // Make sure the lower and upper halves are both all-ones.
22079 static const int Mask[] = { 1, 0, 3, 2 };
22080 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
22081 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
22082
22083 if (Invert)
22084 Result = DAG.getNOT(dl, Result, MVT::v4i32);
22085
22086 return DAG.getBitcast(VT, Result);
22087 }
22088 }
22089
22090 // Since SSE has no unsigned integer comparisons, we need to flip the sign
22091 // bits of the inputs before performing those operations.
22092 if (FlipSigns) {
22093 MVT EltVT = VT.getVectorElementType();
22094 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
22095 VT);
22096 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
22097 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
22098 }
22099
22100 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
22101
22102 // If the logical-not of the result is required, perform that now.
22103 if (Invert)
22104 Result = DAG.getNOT(dl, Result, VT);
22105
22106 return Result;
22107}
22108
22109// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
22110static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
22111 const SDLoc &dl, SelectionDAG &DAG,
22112 const X86Subtarget &Subtarget,
22113 SDValue &X86CC) {
22114 // Only support equality comparisons.
22115 if (CC != ISD::SETEQ && CC != ISD::SETNE)
22116 return SDValue();
22117
22118 // Must be a bitcast from vXi1.
22119 if (Op0.getOpcode() != ISD::BITCAST)
22120 return SDValue();
22121
22122 Op0 = Op0.getOperand(0);
22123 MVT VT = Op0.getSimpleValueType();
22124 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
22125 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
22126 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
22127 return SDValue();
22128
22129 X86::CondCode X86Cond;
22130 if (isNullConstant(Op1)) {
22131 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22132 } else if (isAllOnesConstant(Op1)) {
22133 // C flag is set for all ones.
22134 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
22135 } else
22136 return SDValue();
22137
22138  // If the input is an AND, we can combine its operands into the KTEST.
22139 bool KTestable = false;
22140 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
22141 KTestable = true;
22142 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
22143 KTestable = true;
22144 if (!isNullConstant(Op1))
22145 KTestable = false;
22146 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
22147 SDValue LHS = Op0.getOperand(0);
22148 SDValue RHS = Op0.getOperand(1);
22149 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
22150 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
22151 }
22152
22153  // If the input is an OR, we can combine its operands into the KORTEST.
22154 SDValue LHS = Op0;
22155 SDValue RHS = Op0;
22156 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
22157 LHS = Op0.getOperand(0);
22158 RHS = Op0.getOperand(1);
22159 }
22160
22161 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
22162 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
22163}
22164
22165/// Emit flags for the given setcc condition and operands. Also returns the
22166/// corresponding X86 condition code constant in X86CC.
22167SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
22168 ISD::CondCode CC, const SDLoc &dl,
22169 SelectionDAG &DAG,
22170 SDValue &X86CC) const {
22171 // Optimize to BT if possible.
22172 // Lower (X & (1 << N)) == 0 to BT(X, N).
22173 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
22174 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
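  // E.g. (illustrative): (x & (1 << 5)) != 0 becomes BT(x, 5), with the tested
  // bit read back out of CF instead of materializing the mask and comparing.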
22175 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
22176 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
22177 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
22178 return BT;
22179 }
22180
22181  // Try to use PTEST for a tree of ORs equality-compared with 0.
22182 // TODO: We could do AND tree with all 1s as well by using the C flag.
22183 if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
22184 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
22185 if (SDValue PTEST = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG, X86CC))
22186 return PTEST;
22187 }
22188
22189 // Try to lower using KORTEST or KTEST.
22190 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
22191 return Test;
22192
22193 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
22194 // these.
22195 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
22196 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
22197 // If the input is a setcc, then reuse the input setcc or use a new one with
22198 // the inverted condition.
22199 if (Op0.getOpcode() == X86ISD::SETCC) {
22200 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
22201
22202 X86CC = Op0.getOperand(0);
22203 if (Invert) {
22204 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
22205 CCode = X86::GetOppositeBranchCondition(CCode);
22206 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
22207 }
22208
22209 return Op0.getOperand(1);
22210 }
22211 }
22212
22213  // Try to use the carry flag from the add in place of a separate CMP for:
22214 // (seteq (add X, -1), -1). Similar for setne.
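  // Sketch of why this works: the unsigned add X + (-1) produces a carry
  // exactly when X != 0, so X == 0 (i.e. the add equals -1) maps to COND_AE
  // and X != 0 maps to COND_B, as selected below.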
22215 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
22216 Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
22217 if (isProfitableToUseFlagOp(Op0)) {
22218 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
22219
22220 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
22221 Op0.getOperand(1));
22222 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
22223 X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
22224 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
22225 return SDValue(New.getNode(), 1);
22226 }
22227 }
22228
22229 X86::CondCode CondCode =
22230 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
22231  assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
22232
22233 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
22234 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
22235 return EFLAGS;
22236}
22237
22238SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
22239
22240 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
22241 Op.getOpcode() == ISD::STRICT_FSETCCS;
22242 MVT VT = Op->getSimpleValueType(0);
22243
22244 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
22245
22246  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
22247 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22248 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
22249 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
22250 SDLoc dl(Op);
22251 ISD::CondCode CC =
22252 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
22253
22254 // Handle f128 first, since one possible outcome is a normal integer
22255 // comparison which gets handled by emitFlagsForSetcc.
22256 if (Op0.getValueType() == MVT::f128) {
22257 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
22258 Op.getOpcode() == ISD::STRICT_FSETCCS);
22259
22260 // If softenSetCCOperands returned a scalar, use it.
22261 if (!Op1.getNode()) {
22262      assert(Op0.getValueType() == Op.getValueType() &&
22263             "Unexpected setcc expansion!");
22264 if (IsStrict)
22265 return DAG.getMergeValues({Op0, Chain}, dl);
22266 return Op0;
22267 }
22268 }
22269
22270 if (Op0.getSimpleValueType().isInteger()) {
22271 SDValue X86CC;
22272 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
22273 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
22274 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
22275 }
22276
22277 // Handle floating point.
22278 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
22279 if (CondCode == X86::COND_INVALID)
22280 return SDValue();
22281
22282 SDValue EFLAGS;
22283 if (IsStrict) {
22284 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
22285 EFLAGS =
22286 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
22287 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
22288 Chain = EFLAGS.getValue(1);
22289 } else {
22290 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
22291 }
22292
22293 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
22294 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
22295 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
22296}
22297
22298SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
22299 SDValue LHS = Op.getOperand(0);
22300 SDValue RHS = Op.getOperand(1);
22301 SDValue Carry = Op.getOperand(2);
22302 SDValue Cond = Op.getOperand(3);
22303 SDLoc DL(Op);
22304
22305  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
22306 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
22307
22308 // Recreate the carry if needed.
22309 EVT CarryVT = Carry.getValueType();
22310 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
22311 Carry, DAG.getAllOnesConstant(DL, CarryVT));
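  // Sketch (assuming the incoming carry is the usual 0/1 value): adding
  // all-ones wraps exactly when Carry was 1, so CF of this ADD reproduces the
  // original carry for the SBB below.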
22312
22313 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
22314 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
22315 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
22316}
22317
22318// This function returns three things: the arithmetic computation itself
22319// (Value), an EFLAGS result (Overflow), and a condition code (Cond, set via
22320// the reference parameter). The flag and the condition code define the case
22321// in which the arithmetic computation overflows.
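// For example (a sketch): uaddo(x, 1) overflows exactly when the result wraps
// to 0, which is why the UADDO case below uses COND_E when the RHS is 1 (an
// INC leaves CF untouched) and the carry flag (COND_B) otherwise.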
22322static std::pair<SDValue, SDValue>
22323getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
22324  assert(Op.getResNo() == 0 && "Unexpected result number!");
22325 SDValue Value, Overflow;
22326 SDValue LHS = Op.getOperand(0);
22327 SDValue RHS = Op.getOperand(1);
22328 unsigned BaseOp = 0;
22329 SDLoc DL(Op);
22330 switch (Op.getOpcode()) {
22331  default: llvm_unreachable("Unknown ovf instruction!");
22332 case ISD::SADDO:
22333 BaseOp = X86ISD::ADD;
22334 Cond = X86::COND_O;
22335 break;
22336 case ISD::UADDO:
22337 BaseOp = X86ISD::ADD;
22338 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
22339 break;
22340 case ISD::SSUBO:
22341 BaseOp = X86ISD::SUB;
22342 Cond = X86::COND_O;
22343 break;
22344 case ISD::USUBO:
22345 BaseOp = X86ISD::SUB;
22346 Cond = X86::COND_B;
22347 break;
22348 case ISD::SMULO:
22349 BaseOp = X86ISD::SMUL;
22350 Cond = X86::COND_O;
22351 break;
22352 case ISD::UMULO:
22353 BaseOp = X86ISD::UMUL;
22354 Cond = X86::COND_O;
22355 break;
22356 }
22357
22358 if (BaseOp) {
22359 // Also sets EFLAGS.
22360 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22361 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
22362 Overflow = Value.getValue(1);
22363 }
22364
22365 return std::make_pair(Value, Overflow);
22366}
22367
22368static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
22369  // Lower the "add/sub/mul with overflow" instruction into a regular
22370  // instruction plus a "setcc" instruction that checks the overflow flag. The
22371  // "brcond" lowering looks for this combo and may remove the "setcc"
22372  // instruction if the "setcc" has only one use.
22373 SDLoc DL(Op);
22374 X86::CondCode Cond;
22375 SDValue Value, Overflow;
22376 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
22377
22378 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
22379  assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
22380 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
22381}
22382
22383/// Return true if opcode is a X86 logical comparison.
22384static bool isX86LogicalCmp(SDValue Op) {
22385 unsigned Opc = Op.getOpcode();
22386 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
22387 Opc == X86ISD::FCMP)
22388 return true;
22389 if (Op.getResNo() == 1 &&
22390 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
22391 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
22392 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
22393 return true;
22394
22395 return false;
22396}
22397
22398static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
22399 if (V.getOpcode() != ISD::TRUNCATE)
22400 return false;
22401
22402 SDValue VOp0 = V.getOperand(0);
22403 unsigned InBits = VOp0.getValueSizeInBits();
22404 unsigned Bits = V.getValueSizeInBits();
22405 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
22406}
22407
22408SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
22409 bool AddTest = true;
22410 SDValue Cond = Op.getOperand(0);
22411 SDValue Op1 = Op.getOperand(1);
22412 SDValue Op2 = Op.getOperand(2);
22413 SDLoc DL(Op);
22414 MVT VT = Op1.getSimpleValueType();
22415 SDValue CC;
22416
22417 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
22418 // are available or VBLENDV if AVX is available.
22419 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
22420 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
22421 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
22422 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
22423 bool IsAlwaysSignaling;
22424 unsigned SSECC =
22425 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
22426 CondOp0, CondOp1, IsAlwaysSignaling);
22427
22428 if (Subtarget.hasAVX512()) {
22429 SDValue Cmp =
22430 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
22431 DAG.getTargetConstant(SSECC, DL, MVT::i8));
22432      assert(!VT.isVector() && "Not a scalar type?");
22433 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
22434 }
22435
22436 if (SSECC < 8 || Subtarget.hasAVX()) {
22437 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
22438 DAG.getTargetConstant(SSECC, DL, MVT::i8));
22439
22440 // If we have AVX, we can use a variable vector select (VBLENDV) instead
22441 // of 3 logic instructions for size savings and potentially speed.
22442 // Unfortunately, there is no scalar form of VBLENDV.
22443
22444 // If either operand is a +0.0 constant, don't try this. We can expect to
22445 // optimize away at least one of the logic instructions later in that
22446 // case, so that sequence would be faster than a variable blend.
22447
22448 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
22449 // uses XMM0 as the selection register. That may need just as many
22450 // instructions as the AND/ANDN/OR sequence due to register moves, so
22451 // don't bother.
22452 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
22453 !isNullFPConstant(Op2)) {
22454 // Convert to vectors, do a VSELECT, and convert back to scalar.
22455 // All of the conversions should be optimized away.
22456 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
22457 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
22458 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
22459 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
22460
22461 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
22462 VCmp = DAG.getBitcast(VCmpVT, VCmp);
22463
22464 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
22465
22466 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
22467 VSel, DAG.getIntPtrConstant(0, DL));
22468 }
22469 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
22470 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
22471 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
22472 }
22473 }
22474
22475 // AVX512 fallback is to lower selects of scalar floats to masked moves.
22476 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
22477 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
22478 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
22479 }
22480
22481 if (Cond.getOpcode() == ISD::SETCC) {
22482 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
22483 Cond = NewCond;
22484 // If the condition was updated, it's possible that the operands of the
22485 // select were also updated (for example, EmitTest has a RAUW). Refresh
22486 // the local references to the select operands in case they got stale.
22487 Op1 = Op.getOperand(1);
22488 Op2 = Op.getOperand(2);
22489 }
22490 }
22491
22492 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
22493 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
22494 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
22495 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
22496 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
22497 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
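  // Illustrative instance of the first pattern: for i32 x, the code below
  // computes x - 1, which borrows only when x == 0; SBB then materializes
  // -1 or 0 from that borrow, and OR-ing with y yields -1 when x == 0 and y
  // otherwise, exactly (select (x == 0), -1, y).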
22498 if (Cond.getOpcode() == X86ISD::SETCC &&
22499 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
22500 isNullConstant(Cond.getOperand(1).getOperand(1))) {
22501 SDValue Cmp = Cond.getOperand(1);
22502 unsigned CondCode = Cond.getConstantOperandVal(0);
22503
22504 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
22505 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
22506 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
22507 SDValue CmpOp0 = Cmp.getOperand(0);
22508
22509 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22510 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
22511
22512 // Apply further optimizations for special cases
22513 // (select (x != 0), -1, 0) -> neg & sbb
22514 // (select (x == 0), 0, -1) -> neg & sbb
22515 if (isNullConstant(Y) &&
22516 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
22517 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
22518 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
22519 Zero = DAG.getConstant(0, DL, Op.getValueType());
22520 return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
22521 }
22522
22523 Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
22524 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
22525
22526 SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
22527 SDValue Res = // Res = 0 or -1.
22528 DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));
22529
22530 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
22531 Res = DAG.getNOT(DL, Res, Res.getValueType());
22532
22533 return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
22534 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
22535 Cmp.getOperand(0).getOpcode() == ISD::AND &&
22536 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
22537 SDValue CmpOp0 = Cmp.getOperand(0);
22538 SDValue Src1, Src2;
22539      // True if Op2 is an XOR or OR operator and one of its operands is
22540      // equal to Op1:
22541      //   (a, a op b) or (b, a op b)
22542 auto isOrXorPattern = [&]() {
22543 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
22544 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
22545 Src1 =
22546 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
22547 Src2 = Op1;
22548 return true;
22549 }
22550 return false;
22551 };
22552
22553 if (isOrXorPattern()) {
22554 SDValue Neg;
22555 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
22556        // We need a mask of all zeros or all ones with the same size as the
22557        // other operands.
22558 if (CmpSz > VT.getSizeInBits())
22559 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
22560 else if (CmpSz < VT.getSizeInBits())
22561 Neg = DAG.getNode(ISD::AND, DL, VT,
22562 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
22563 DAG.getConstant(1, DL, VT));
22564 else
22565 Neg = CmpOp0;
22566 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
22567 Neg); // -(and (x, 0x1))
22568 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
22569 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
22570 }
22571 }
22572 }
22573
22574 // Look past (and (setcc_carry (cmp ...)), 1).
22575 if (Cond.getOpcode() == ISD::AND &&
22576 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
22577 isOneConstant(Cond.getOperand(1)))
22578 Cond = Cond.getOperand(0);
22579
22580 // If condition flag is set by a X86ISD::CMP, then use it as the condition
22581 // setting operand in place of the X86ISD::SETCC.
22582 unsigned CondOpcode = Cond.getOpcode();
22583 if (CondOpcode == X86ISD::SETCC ||
22584 CondOpcode == X86ISD::SETCC_CARRY) {
22585 CC = Cond.getOperand(0);
22586
22587 SDValue Cmp = Cond.getOperand(1);
22588 bool IllegalFPCMov = false;
22589 if (VT.isFloatingPoint() && !VT.isVector() &&
22590 !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack?
22591 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
22592
22593 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
22594 Cmp.getOpcode() == X86ISD::BT) { // FIXME
22595 Cond = Cmp;
22596 AddTest = false;
22597 }
22598 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
22599 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
22600 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
22601 SDValue Value;
22602 X86::CondCode X86Cond;
22603 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
22604
22605 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
22606 AddTest = false;
22607 }
22608
22609 if (AddTest) {
22610 // Look past the truncate if the high bits are known zero.
22611 if (isTruncWithZeroHighBitsInput(Cond, DAG))
22612 Cond = Cond.getOperand(0);
22613
22614 // We know the result of AND is compared against zero. Try to match
22615 // it to BT.
22616 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
22617 SDValue BTCC;
22618 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
22619 CC = BTCC;
22620 Cond = BT;
22621 AddTest = false;
22622 }
22623 }
22624 }
22625
22626 if (AddTest) {
22627 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
22628 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
22629 }
22630
22631 // a < b ? -1 : 0 -> RES = ~setcc_carry
22632 // a < b ? 0 : -1 -> RES = setcc_carry
22633 // a >= b ? -1 : 0 -> RES = setcc_carry
22634 // a >= b ? 0 : -1 -> RES = ~setcc_carry
22635 if (Cond.getOpcode() == X86ISD::SUB) {
22636 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
22637
22638 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
22639 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
22640 (isNullConstant(Op1) || isNullConstant(Op2))) {
22641 SDValue Res =
22642 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
22643 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
22644 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
22645 return DAG.getNOT(DL, Res, Res.getValueType());
22646 return Res;
22647 }
22648 }
22649
22650  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
22651 // widen the cmov and push the truncate through. This avoids introducing a new
22652 // branch during isel and doesn't add any extensions.
22653 if (Op.getValueType() == MVT::i8 &&
22654 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
22655 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
22656 if (T1.getValueType() == T2.getValueType() &&
22657 // Blacklist CopyFromReg to avoid partial register stalls.
22658 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
22659 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
22660 CC, Cond);
22661 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
22662 }
22663 }
22664
22665 // Or finally, promote i8 cmovs if we have CMOV,
22666 // or i16 cmovs if it won't prevent folding a load.
22667  // FIXME: we should not limit promotion of the i8 case to only when the CMOV
22668  // is legal, but EmitLoweredSelect() cannot deal with these extensions
22669 // being inserted between two CMOV's. (in i16 case too TBN)
22670 // https://bugs.llvm.org/show_bug.cgi?id=40974
22671 if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
22672 (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
22673 !MayFoldLoad(Op2))) {
22674 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
22675 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
22676 SDValue Ops[] = { Op2, Op1, CC, Cond };
22677 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
22678 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
22679 }
22680
22681 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
22682  // the condition is true.
22683 SDValue Ops[] = { Op2, Op1, CC, Cond };
22684 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
22685}
22686
22687static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
22688 const X86Subtarget &Subtarget,
22689 SelectionDAG &DAG) {
22690 MVT VT = Op->getSimpleValueType(0);
22691 SDValue In = Op->getOperand(0);
22692 MVT InVT = In.getSimpleValueType();
22693  assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
22694 MVT VTElt = VT.getVectorElementType();
22695 SDLoc dl(Op);
22696
22697 unsigned NumElts = VT.getVectorNumElements();
22698
22699 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
22700 MVT ExtVT = VT;
22701 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
22702 // If v16i32 is to be avoided, we'll need to split and concatenate.
22703 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
22704 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
22705
22706 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
22707 }
22708
22709 // Widen to 512-bits if VLX is not supported.
22710 MVT WideVT = ExtVT;
22711 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
22712 NumElts *= 512 / ExtVT.getSizeInBits();
22713 InVT = MVT::getVectorVT(MVT::i1, NumElts);
22714 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
22715 In, DAG.getIntPtrConstant(0, dl));
22716 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
22717 }
22718
22719 SDValue V;
22720 MVT WideEltVT = WideVT.getVectorElementType();
22721 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
22722 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
22723 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
22724 } else {
22725 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
22726 SDValue Zero = DAG.getConstant(0, dl, WideVT);
22727 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
22728 }
22729
22730 // Truncate if we had to extend i16/i8 above.
22731 if (VT != ExtVT) {
22732 WideVT = MVT::getVectorVT(VTElt, NumElts);
22733 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
22734 }
22735
22736 // Extract back to 128/256-bit if we widened.
22737 if (WideVT != VT)
22738 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
22739 DAG.getIntPtrConstant(0, dl));
22740
22741 return V;
22742}
22743
22744static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
22745 SelectionDAG &DAG) {
22746 SDValue In = Op->getOperand(0);
22747 MVT InVT = In.getSimpleValueType();
22748
22749 if (InVT.getVectorElementType() == MVT::i1)
22750 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
22751
22752  assert(Subtarget.hasAVX() && "Expected AVX support");
22753 return LowerAVXExtend(Op, DAG, Subtarget);
22754}
22755
22756// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
22757// For sign extend this needs to handle all vector sizes and SSE4.1 and
22758// non-SSE4.1 targets. For zero extend this should only handle inputs of
22759// MVT::v64i8 when BWI is not supported, but AVX512 is.
22760static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
22761 const X86Subtarget &Subtarget,
22762 SelectionDAG &DAG) {
22763 SDValue In = Op->getOperand(0);
22764 MVT VT = Op->getSimpleValueType(0);
22765 MVT InVT = In.getSimpleValueType();
22766
22767 MVT SVT = VT.getVectorElementType();
22768 MVT InSVT = InVT.getVectorElementType();
22769  assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
22770
22771 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
22772 return SDValue();
22773 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
22774 return SDValue();
22775 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
22776 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
22777 !(VT.is512BitVector() && Subtarget.hasAVX512()))
22778 return SDValue();
22779
22780 SDLoc dl(Op);
22781 unsigned Opc = Op.getOpcode();
22782 unsigned NumElts = VT.getVectorNumElements();
22783
22784 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
22785 // For 512-bit vectors, we need 128-bits or 256-bits.
22786 if (InVT.getSizeInBits() > 128) {
22787    // The input needs to have at least as many elements as the output, and be
22788    // at least 128 bits wide.
22789 int InSize = InSVT.getSizeInBits() * NumElts;
22790 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
22791 InVT = In.getSimpleValueType();
22792 }
22793
22794  // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
22795  // results, so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
22796  // instructions still need to be handled here for 256/512-bit results.
22797 if (Subtarget.hasInt256()) {
22798    assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
22799
22800 if (InVT.getVectorNumElements() != NumElts)
22801 return DAG.getNode(Op.getOpcode(), dl, VT, In);
22802
22803 // FIXME: Apparently we create inreg operations that could be regular
22804 // extends.
22805 unsigned ExtOpc =
22806 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
22807 : ISD::ZERO_EXTEND;
22808 return DAG.getNode(ExtOpc, dl, VT, In);
22809 }
22810
22811 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
22812 if (Subtarget.hasAVX()) {
22813    assert(VT.is256BitVector() && "256-bit vector expected");
22814 MVT HalfVT = VT.getHalfNumVectorElementsVT();
22815 int HalfNumElts = HalfVT.getVectorNumElements();
22816
22817 unsigned NumSrcElts = InVT.getVectorNumElements();
22818 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
22819 for (int i = 0; i != HalfNumElts; ++i)
22820 HiMask[i] = HalfNumElts + i;
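    // E.g. (sketch): extending v16i8 to v16i16 on AVX1, Lo extends bytes 0..7
    // in place, the shuffle below moves bytes 8..15 into the low half so Hi
    // can extend them, and the two v8i16 halves are concatenated.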
22821
22822 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
22823 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
22824 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
22825 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
22826 }
22827
22828 // We should only get here for sign extend.
22829  assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
22830  assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
22831
22832 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
22833 SDValue Curr = In;
22834 SDValue SignExt = Curr;
22835
22836 // As SRAI is only available on i16/i32 types, we expand only up to i32
22837 // and handle i64 separately.
22838 if (InVT != MVT::v4i32) {
22839 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
22840
22841 unsigned DestWidth = DestVT.getScalarSizeInBits();
22842 unsigned Scale = DestWidth / InSVT.getSizeInBits();
22843
22844 unsigned InNumElts = InVT.getVectorNumElements();
22845 unsigned DestElts = DestVT.getVectorNumElements();
22846
22847 // Build a shuffle mask that takes each input element and places it in the
22848 // MSBs of the new element size.
22849 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
22850 for (unsigned i = 0; i != DestElts; ++i)
22851 Mask[i * Scale + (Scale - 1)] = i;
22852
22853 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
22854 Curr = DAG.getBitcast(DestVT, Curr);
22855
22856 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
22857 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
22858 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
22859 }
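    // Worked example (illustrative sketch): sign-extending a v16i8 source into
    // a v2i64 result goes through DestVT = v4i32, so Scale = 32 / 8 = 4 and the
    // mask places input element i at byte i*4+3 (the MSB byte of each dword,
    // little-endian), i.e. {u,u,u,0, u,u,u,1, u,u,u,2, u,u,u,3, ...} with
    // u = SM_SentinelUndef. After the bitcast to v4i32, VSRAI by 32 - 8 = 24
    // replicates each lane's sign bit into the low bits; the v2i64 case is then
    // finished by the interleave with the sign words computed below.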
22860
22861 if (VT == MVT::v2i64) {
22862     assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
22863 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
22864 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
22865 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
22866 SignExt = DAG.getBitcast(VT, SignExt);
22867 }
22868
22869 return SignExt;
22870}
22871
22872static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
22873 SelectionDAG &DAG) {
22874 MVT VT = Op->getSimpleValueType(0);
22875 SDValue In = Op->getOperand(0);
22876 MVT InVT = In.getSimpleValueType();
22877 SDLoc dl(Op);
22878
22879 if (InVT.getVectorElementType() == MVT::i1)
22880 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
22881
22882   assert(VT.isVector() && InVT.isVector() && "Expected vector type");
22883   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22884          "Expected same number of elements");
22885   assert((VT.getVectorElementType() == MVT::i16 ||
22886           VT.getVectorElementType() == MVT::i32 ||
22887           VT.getVectorElementType() == MVT::i64) &&
22888          "Unexpected element type");
22889   assert((InVT.getVectorElementType() == MVT::i8 ||
22890           InVT.getVectorElementType() == MVT::i16 ||
22891           InVT.getVectorElementType() == MVT::i32) &&
22892          "Unexpected element type");
22893
22894 // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
22895 if (InVT == MVT::v8i8) {
22896 if (VT != MVT::v8i64)
22897 return SDValue();
22898
22899 In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
22900 MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
22901 return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In);
22902 }
22903
22904 if (Subtarget.hasInt256())
22905 return Op;
22906
22907   // Optimize vectors in AVX mode:
22908   // sign extend v8i16 to v8i32 and
22909   // v4i32 to v4i64.
22910   //
22911   // Divide the input vector into two parts;
22912   // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
22913   // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
22914   // then concat the vectors back to the original VT.
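  // For instance (illustrative sketch), v4i32 -> v4i64 without AVX2 becomes
  // roughly:
  //   OpLo = sign_extend_vector_inreg(In)                          ; vpmovsxdq
  //   OpHi = sign_extend_vector_inreg(shuffle(In, In, {2,3,-1,-1})) ; vpmovsxdq
  //   Res  = concat_vectors(OpLo, OpHi)                             ; vinsertf128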
22915 MVT HalfVT = VT.getHalfNumVectorElementsVT();
22916 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
22917
22918 unsigned NumElems = InVT.getVectorNumElements();
22919 SmallVector<int,8> ShufMask(NumElems, -1);
22920 for (unsigned i = 0; i != NumElems/2; ++i)
22921 ShufMask[i] = i + NumElems/2;
22922
22923 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
22924 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
22925
22926 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
22927}
22928
22929/// Change a vector store into a pair of half-size vector stores.
22930static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
22931 SDValue StoredVal = Store->getValue();
22932   assert((StoredVal.getValueType().is256BitVector() ||
22933           StoredVal.getValueType().is512BitVector()) &&
22934          "Expecting 256/512-bit op");
22935
22936 // Splitting volatile memory ops is not allowed unless the operation was not
22937 // legal to begin with. Assume the input store is legal (this transform is
22938 // only used for targets with AVX). Note: It is possible that we have an
22939 // illegal type like v2i128, and so we could allow splitting a volatile store
22940 // in that case if that is important.
22941 if (!Store->isSimple())
22942 return SDValue();
22943
22944 EVT StoreVT = StoredVal.getValueType();
22945 unsigned NumElems = StoreVT.getVectorNumElements();
22946 unsigned HalfSize = StoredVal.getValueSizeInBits() / 2;
22947 unsigned HalfAlign = (128 == HalfSize ? 16 : 32);
22948
22949 SDLoc DL(Store);
22950 SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize);
22951 SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize);
22952 SDValue Ptr0 = Store->getBasePtr();
22953 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfAlign, DL);
22954 unsigned Alignment = Store->getAlignment();
22955 SDValue Ch0 =
22956 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
22957 Alignment, Store->getMemOperand()->getFlags());
22958 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
22959 Store->getPointerInfo().getWithOffset(HalfAlign),
22960 MinAlign(Alignment, HalfAlign),
22961 Store->getMemOperand()->getFlags());
22962 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
22963}
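// Worked example (illustrative): splitting a 32-byte-aligned v8f32 store gives
// HalfSize = 128 and HalfAlign = 16, so elements 0..3 are stored to Ptr with the
// original alignment and elements 4..7 to Ptr+16 with MinAlign(32, 16) = 16; the
// two store chains are then joined with a TokenFactor.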
22964
22965/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
22966/// type.
22967static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
22968 SelectionDAG &DAG) {
22969 SDValue StoredVal = Store->getValue();
22970   assert(StoreVT.is128BitVector() &&
22971          StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
22972 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
22973
22974 // Splitting volatile memory ops is not allowed unless the operation was not
22975 // legal to begin with. We are assuming the input op is legal (this transform
22976 // is only used for targets with AVX).
22977 if (!Store->isSimple())
22978 return SDValue();
22979
22980 MVT StoreSVT = StoreVT.getScalarType();
22981 unsigned NumElems = StoreVT.getVectorNumElements();
22982 unsigned ScalarSize = StoreSVT.getStoreSize();
22983 unsigned Alignment = Store->getAlignment();
22984
22985 SDLoc DL(Store);
22986 SmallVector<SDValue, 4> Stores;
22987 for (unsigned i = 0; i != NumElems; ++i) {
22988 unsigned Offset = i * ScalarSize;
22989 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL);
22990 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
22991 DAG.getIntPtrConstant(i, DL));
22992 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
22993 Store->getPointerInfo().getWithOffset(Offset),
22994 MinAlign(Alignment, Offset),
22995 Store->getMemOperand()->getFlags());
22996 Stores.push_back(Ch);
22997 }
22998 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
22999}
23000
23001static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
23002 SelectionDAG &DAG) {
23003 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
23004 SDLoc dl(St);
23005 SDValue StoredVal = St->getValue();
23006
23007 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
23008 if (StoredVal.getValueType().isVector() &&
23009 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
23010     assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
23011            "Unexpected VT");
23012     assert(!St->isTruncatingStore() && "Expected non-truncating store");
23013     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
23014            "Expected AVX512F without AVX512DQI");
23015
23016 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
23017 DAG.getUNDEF(MVT::v16i1), StoredVal,
23018 DAG.getIntPtrConstant(0, dl));
23019 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
23020 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
23021
23022 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
23023 St->getPointerInfo(), St->getAlignment(),
23024 St->getMemOperand()->getFlags());
23025 }
23026
23027 if (St->isTruncatingStore())
23028 return SDValue();
23029
23030 // If this is a 256-bit store of concatenated ops, we are better off splitting
23031 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
23032 // and each half can execute independently. Some cores would split the op into
23033 // halves anyway, so the concat (vinsertf128) is purely an extra op.
23034 MVT StoreVT = StoredVal.getSimpleValueType();
23035 if (StoreVT.is256BitVector()) {
23036 SmallVector<SDValue, 4> CatOps;
23037 if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
23038 return splitVectorStore(St, DAG);
23039 return SDValue();
23040 }
23041
23042 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23043   assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
23044          "Unexpected VT");
23045   assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
23046          TargetLowering::TypeWidenVector && "Unexpected type action!");
23047
23048 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
23049 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
23050 DAG.getUNDEF(StoreVT));
23051
23052 if (Subtarget.hasSSE2()) {
23053 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
23054 // and store it.
23055 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
23056 MVT CastVT = MVT::getVectorVT(StVT, 2);
23057 StoredVal = DAG.getBitcast(CastVT, StoredVal);
23058 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
23059 DAG.getIntPtrConstant(0, dl));
23060
23061 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
23062 St->getPointerInfo(), St->getAlignment(),
23063 St->getMemOperand()->getFlags());
23064 }
23065   assert(Subtarget.hasSSE1() && "Expected SSE");
23066 SDVTList Tys = DAG.getVTList(MVT::Other);
23067 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
23068 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
23069 St->getMemOperand());
23070}
23071
23072// Lower vector extended loads using a shuffle. If SSSE3 is not available we
23073// may emit an illegal shuffle but the expansion is still better than scalar
23074// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
23075 // we'll emit a shuffle and an arithmetic shift.
23076// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
23077// TODO: It is possible to support ZExt by zeroing the undef values during
23078// the shuffle phase or after the shuffle.
23079static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
23080 SelectionDAG &DAG) {
23081 MVT RegVT = Op.getSimpleValueType();
23082   assert(RegVT.isVector() && "We only custom lower vector loads.");
23083   assert(RegVT.isInteger() &&
23084          "We only custom lower integer vector loads.");
23085
23086 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
23087 SDLoc dl(Ld);
23088
23089 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
23090 if (RegVT.getVectorElementType() == MVT::i1) {
23091     assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
23092     assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
23093     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
23094            "Expected AVX512F without AVX512DQI");
23095
23096 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
23097 Ld->getPointerInfo(), Ld->getAlignment(),
23098 Ld->getMemOperand()->getFlags());
23099
23100 // Replace chain users with the new chain.
23101     assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
23102
23103 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
23104 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
23105 DAG.getBitcast(MVT::v16i1, Val),
23106 DAG.getIntPtrConstant(0, dl));
23107 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
23108 }
23109
23110 return SDValue();
23111}
23112
23113/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
23114/// each of which has no other use apart from the AND / OR.
23115static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
23116 Opc = Op.getOpcode();
23117 if (Opc != ISD::OR && Opc != ISD::AND)
23118 return false;
23119 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
23120 Op.getOperand(0).hasOneUse() &&
23121 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
23122 Op.getOperand(1).hasOneUse());
23123}
23124
23125SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
23126 SDValue Chain = Op.getOperand(0);
23127 SDValue Cond = Op.getOperand(1);
23128 SDValue Dest = Op.getOperand(2);
23129 SDLoc dl(Op);
23130
23131 if (Cond.getOpcode() == ISD::SETCC &&
23132 Cond.getOperand(0).getValueType() != MVT::f128) {
23133 SDValue LHS = Cond.getOperand(0);
23134 SDValue RHS = Cond.getOperand(1);
23135 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23136
23137 // Special case for
23138 // setcc([su]{add,sub,mul}o == 0)
23139 // setcc([su]{add,sub,mul}o != 1)
23140 if (ISD::isOverflowIntrOpRes(LHS) &&
23141 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
23142 (isNullConstant(RHS) || isOneConstant(RHS))) {
23143 SDValue Value, Overflow;
23144 X86::CondCode X86Cond;
23145 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
23146
23147 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
23148 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
23149
23150 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23151 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
23152 Overflow);
23153 }
23154
23155 if (LHS.getSimpleValueType().isInteger()) {
23156 SDValue CCVal;
23157 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
23158 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
23159 EFLAGS);
23160 }
23161
23162 if (CC == ISD::SETOEQ) {
23163 // For FCMP_OEQ, we can emit
23164 // two branches instead of an explicit AND instruction with a
23165 // separate test. However, we only do this if this block doesn't
23166 // have a fall-through edge, because this requires an explicit
23167 // jmp when the condition is false.
23168 if (Op.getNode()->hasOneUse()) {
23169 SDNode *User = *Op.getNode()->use_begin();
23170 // Look for an unconditional branch following this conditional branch.
23171 // We need this because we need to reverse the successors in order
23172 // to implement FCMP_OEQ.
23173 if (User->getOpcode() == ISD::BR) {
23174 SDValue FalseBB = User->getOperand(1);
23175 SDNode *NewBR =
23176 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
23177           assert(NewBR == User);
23178 (void)NewBR;
23179 Dest = FalseBB;
23180
23181 SDValue Cmp =
23182 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
23183 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
23184 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
23185 CCVal, Cmp);
23186 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
23187 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
23188 Cmp);
23189 }
23190 }
23191 } else if (CC == ISD::SETUNE) {
23192 // For FCMP_UNE, we can emit
23193 // two branches instead of an explicit OR instruction with a
23194 // separate test.
23195 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
23196 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
23197 Chain =
23198 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
23199 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
23200 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
23201 Cmp);
23202 } else {
23203 X86::CondCode X86Cond =
23204 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
23205 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
23206 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23207 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
23208 Cmp);
23209 }
23210 }
23211
23212 if (ISD::isOverflowIntrOpRes(Cond)) {
23213 SDValue Value, Overflow;
23214 X86::CondCode X86Cond;
23215 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
23216
23217 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23218 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
23219 Overflow);
23220 }
23221
23222 // Look past the truncate if the high bits are known zero.
23223 if (isTruncWithZeroHighBitsInput(Cond, DAG))
23224 Cond = Cond.getOperand(0);
23225
23226 EVT CondVT = Cond.getValueType();
23227
23228 // Add an AND with 1 if we don't already have one.
23229 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
23230 Cond =
23231 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
23232
23233 SDValue LHS = Cond;
23234 SDValue RHS = DAG.getConstant(0, dl, CondVT);
23235
23236 SDValue CCVal;
23237 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
23238 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
23239 EFLAGS);
23240}
23241
23242// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
23243// Calls to _alloca are needed to probe the stack when allocating more than 4k
23244// bytes in one go. Touching the stack at 4K increments is necessary to ensure
23245// that the guard pages used by the OS virtual memory manager are allocated in
23246// correct sequence.
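// For example (illustrative numbers, assuming the usual 4K page size): a single
// 10000-byte allocation spans three pages, so the probe sequence touches the
// stack at 4096 and 8192 bytes below the old stack pointer before the final
// adjustment, letting the OS commit each guard page in order.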
23247SDValue
23248X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
23249 SelectionDAG &DAG) const {
23250 MachineFunction &MF = DAG.getMachineFunction();
23251 bool SplitStack = MF.shouldSplitStack();
23252 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
23253 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
23254 SplitStack || EmitStackProbeCall;
23255 SDLoc dl(Op);
23256
23257 // Get the inputs.
23258 SDNode *Node = Op.getNode();
23259 SDValue Chain = Op.getOperand(0);
23260 SDValue Size = Op.getOperand(1);
23261 MaybeAlign Alignment(Op.getConstantOperandVal(2));
23262 EVT VT = Node->getValueType(0);
23263
23264 // Chain the dynamic stack allocation so that it doesn't modify the stack
23265 // pointer when other instructions are using the stack.
23266 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
23267
23268 bool Is64Bit = Subtarget.is64Bit();
23269 MVT SPTy = getPointerTy(DAG.getDataLayout());
23270
23271 SDValue Result;
23272 if (!Lower) {
23273 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23274 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
23275     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
23276                     " not tell us which reg is the stack pointer!");
23277
23278 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
23279 const Align StackAlign(TFI.getStackAlignment());
23280 if (hasInlineStackProbe(MF)) {
23281 MachineRegisterInfo &MRI = MF.getRegInfo();
23282
23283 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
23284 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
23285 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
23286 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
23287 DAG.getRegister(Vreg, SPTy));
23288 } else {
23289 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
23290 Chain = SP.getValue(1);
23291 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
23292 }
23293 if (Alignment && Alignment > StackAlign)
23294 Result =
23295 DAG.getNode(ISD::AND, dl, VT, Result,
23296 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
23297 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
23298 } else if (SplitStack) {
23299 MachineRegisterInfo &MRI = MF.getRegInfo();
23300
23301 if (Is64Bit) {
23302       // The 64-bit implementation of segmented stacks needs to clobber both r10
23303       // and r11. This makes it impossible to use it along with nested parameters.
23304 const Function &F = MF.getFunction();
23305 for (const auto &A : F.args()) {
23306 if (A.hasNestAttr())
23307 report_fatal_error("Cannot use segmented stacks with functions that "
23308 "have nested arguments.");
23309 }
23310 }
23311
23312 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
23313 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
23314 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
23315 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
23316 DAG.getRegister(Vreg, SPTy));
23317 } else {
23318 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
23319 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
23320 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
23321
23322 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
23323 Register SPReg = RegInfo->getStackRegister();
23324 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
23325 Chain = SP.getValue(1);
23326
23327 if (Alignment) {
23328 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
23329 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
23330 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
23331 }
23332
23333 Result = SP;
23334 }
23335
23336 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
23337 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
23338
23339 SDValue Ops[2] = {Result, Chain};
23340 return DAG.getMergeValues(Ops, dl);
23341}
23342
23343SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
23344 MachineFunction &MF = DAG.getMachineFunction();
23345 auto PtrVT = getPointerTy(MF.getDataLayout());
23346 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
23347
23348 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
23349 SDLoc DL(Op);
23350
23351 if (!Subtarget.is64Bit() ||
23352 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
23353 // vastart just stores the address of the VarArgsFrameIndex slot into the
23354 // memory location argument.
23355 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
23356 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
23357 MachinePointerInfo(SV));
23358 }
23359
23360   // __va_list_tag:
23361   //   gp_offset         (0 - 6 * 8)
23362   //   fp_offset         (48 - 48 + 8 * 16)
23363   //   overflow_arg_area (points to parameters passed in memory).
23364   //   reg_save_area
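  // Equivalent C layout (sketch; the field offsets match the stores emitted
  // below: 0, 4, 8 and, on LP64, 16):
  //
  //   struct __va_list_tag {
  //     unsigned int gp_offset;    // 0 .. 48, stepped by 8 per GP register used
  //     unsigned int fp_offset;    // 48 .. 176, stepped by 16 per XMM register
  //     void *overflow_arg_area;   // parameters passed on the stack
  //     void *reg_save_area;       // spilled register arguments
  //   };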
23365 SmallVector<SDValue, 8> MemOps;
23366 SDValue FIN = Op.getOperand(1);
23367 // Store gp_offset
23368 SDValue Store = DAG.getStore(
23369 Op.getOperand(0), DL,
23370 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
23371 MachinePointerInfo(SV));
23372 MemOps.push_back(Store);
23373
23374 // Store fp_offset
23375 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
23376 Store = DAG.getStore(
23377 Op.getOperand(0), DL,
23378 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
23379 MachinePointerInfo(SV, 4));
23380 MemOps.push_back(Store);
23381
23382 // Store ptr to overflow_arg_area
23383 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
23384 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
23385 Store =
23386 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
23387 MemOps.push_back(Store);
23388
23389 // Store ptr to reg_save_area.
23390 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
23391 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
23392 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
23393 Store = DAG.getStore(
23394 Op.getOperand(0), DL, RSFIN, FIN,
23395 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
23396 MemOps.push_back(Store);
23397 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
23398}
23399
23400SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
23401   assert(Subtarget.is64Bit() &&
23402          "LowerVAARG only handles 64-bit va_arg!");
23403   assert(Op.getNumOperands() == 4);
23404
23405 MachineFunction &MF = DAG.getMachineFunction();
23406 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
23407 // The Win64 ABI uses char* instead of a structure.
23408 return DAG.expandVAArg(Op.getNode());
23409
23410 SDValue Chain = Op.getOperand(0);
23411 SDValue SrcPtr = Op.getOperand(1);
23412 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
23413 unsigned Align = Op.getConstantOperandVal(3);
23414 SDLoc dl(Op);
23415
23416 EVT ArgVT = Op.getNode()->getValueType(0);
23417 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
23418 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
23419 uint8_t ArgMode;
23420
23421 // Decide which area this value should be read from.
23422 // TODO: Implement the AMD64 ABI in its entirety. This simple
23423 // selection mechanism works only for the basic types.
23424 if (ArgVT == MVT::f80) {
23425     llvm_unreachable("va_arg for f80 not yet implemented");
23426 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
23427 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
23428 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
23429 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
23430 } else {
23431     llvm_unreachable("Unhandled argument type in LowerVAARG");
23432 }
23433
23434 if (ArgMode == 2) {
23435 // Sanity Check: Make sure using fp_offset makes sense.
23436     assert(!Subtarget.useSoftFloat() &&
23437            !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
23438            Subtarget.hasSSE1());
23439 }
23440
23441 // Insert VAARG_64 node into the DAG
23442 // VAARG_64 returns two values: Variable Argument Address, Chain
23443 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
23444 DAG.getConstant(ArgMode, dl, MVT::i8),
23445 DAG.getConstant(Align, dl, MVT::i32)};
23446 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
23447 SDValue VAARG = DAG.getMemIntrinsicNode(
23448 X86ISD::VAARG_64, dl,
23449 VTs, InstOps, MVT::i64,
23450 MachinePointerInfo(SV),
23451 /*Align=*/0,
23452 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
23453 Chain = VAARG.getValue(1);
23454
23455 // Load the next argument and return it
23456 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
23457}
23458
23459static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
23460 SelectionDAG &DAG) {
23461 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
23462 // where a va_list is still an i8*.
23463   assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
23464 if (Subtarget.isCallingConvWin64(
23465 DAG.getMachineFunction().getFunction().getCallingConv()))
23466 // Probably a Win64 va_copy.
23467 return DAG.expandVACopy(Op.getNode());
23468
23469 SDValue Chain = Op.getOperand(0);
23470 SDValue DstPtr = Op.getOperand(1);
23471 SDValue SrcPtr = Op.getOperand(2);
23472 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
23473 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
23474 SDLoc DL(Op);
23475
23476 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24, DL),
23477 Align(8), /*isVolatile*/ false, false, false,
23478 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
23479}
23480
23481// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
23482static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
23483 switch (Opc) {
23484 case ISD::SHL:
23485 case X86ISD::VSHL:
23486 case X86ISD::VSHLI:
23487 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
23488 case ISD::SRL:
23489 case X86ISD::VSRL:
23490 case X86ISD::VSRLI:
23491 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
23492 case ISD::SRA:
23493 case X86ISD::VSRA:
23494 case X86ISD::VSRAI:
23495 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
23496 }
23497   llvm_unreachable("Unknown target vector shift node");
23498}
23499
23500/// Handle vector element shifts where the shift amount is a constant.
23501/// Takes immediate version of shift as input.
23502static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
23503 SDValue SrcOp, uint64_t ShiftAmt,
23504 SelectionDAG &DAG) {
23505 MVT ElementType = VT.getVectorElementType();
23506
23507 // Bitcast the source vector to the output type; this is mainly necessary for
23508 // vXi8/vXi64 shifts.
23509 if (VT != SrcOp.getSimpleValueType())
23510 SrcOp = DAG.getBitcast(VT, SrcOp);
23511
23512 // Fold this packed shift into its first operand if ShiftAmt is 0.
23513 if (ShiftAmt == 0)
23514 return SrcOp;
23515
23516 // Check for ShiftAmt >= element width
23517 if (ShiftAmt >= ElementType.getSizeInBits()) {
23518 if (Opc == X86ISD::VSRAI)
23519 ShiftAmt = ElementType.getSizeInBits() - 1;
23520 else
23521 return DAG.getConstant(0, dl, VT);
23522 }
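  // E.g. (illustrative): for v4i32 a constant shift amount of 40 is clamped to
  // 31 for VSRAI (arithmetic shifts saturate at width - 1, preserving the sign),
  // while VSHLI/VSRLI simply fold to an all-zeros vector.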
23523
23524   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
23525          && "Unknown target vector shift-by-constant node");
23526
23527 // Fold this packed vector shift into a build vector if SrcOp is a
23528 // vector of Constants or UNDEFs.
23529 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
23530 SmallVector<SDValue, 8> Elts;
23531 unsigned NumElts = SrcOp->getNumOperands();
23532
23533 switch (Opc) {
23534     default: llvm_unreachable("Unknown opcode!");
23535 case X86ISD::VSHLI:
23536 for (unsigned i = 0; i != NumElts; ++i) {
23537 SDValue CurrentOp = SrcOp->getOperand(i);
23538 if (CurrentOp->isUndef()) {
23539 Elts.push_back(CurrentOp);
23540 continue;
23541 }
23542 auto *ND = cast<ConstantSDNode>(CurrentOp);
23543 const APInt &C = ND->getAPIntValue();
23544 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
23545 }
23546 break;
23547 case X86ISD::VSRLI:
23548 for (unsigned i = 0; i != NumElts; ++i) {
23549 SDValue CurrentOp = SrcOp->getOperand(i);
23550 if (CurrentOp->isUndef()) {
23551 Elts.push_back(CurrentOp);
23552 continue;
23553 }
23554 auto *ND = cast<ConstantSDNode>(CurrentOp);
23555 const APInt &C = ND->getAPIntValue();
23556 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
23557 }
23558 break;
23559 case X86ISD::VSRAI:
23560 for (unsigned i = 0; i != NumElts; ++i) {
23561 SDValue CurrentOp = SrcOp->getOperand(i);
23562 if (CurrentOp->isUndef()) {
23563 Elts.push_back(CurrentOp);
23564 continue;
23565 }
23566 auto *ND = cast<ConstantSDNode>(CurrentOp);
23567 const APInt &C = ND->getAPIntValue();
23568 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
23569 }
23570 break;
23571 }
23572
23573 return DAG.getBuildVector(VT, dl, Elts);
23574 }
23575
23576 return DAG.getNode(Opc, dl, VT, SrcOp,
23577 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
23578}
23579
23580/// Handle vector element shifts where the shift amount may or may not be a
23581/// constant. Takes immediate version of shift as input.
23582static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
23583 SDValue SrcOp, SDValue ShAmt,
23584 const X86Subtarget &Subtarget,
23585 SelectionDAG &DAG) {
23586 MVT SVT = ShAmt.getSimpleValueType();
23587   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
23588
23589 // Catch shift-by-constant.
23590 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
23591 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
23592 CShAmt->getZExtValue(), DAG);
23593
23594 // Change opcode to non-immediate version.
23595 Opc = getTargetVShiftUniformOpcode(Opc, true);
23596
23597 // Need to build a vector containing shift amount.
23598 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
23599 // +====================+============+=======================================+
23600 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
23601 // +====================+============+=======================================+
23602 // | i64 | Yes, No | Use ShAmt as lowest elt |
23603 // | i32 | Yes | zero-extend in-reg |
23604 // | (i32 zext(i16/i8)) | Yes | zero-extend in-reg |
23605 // | (i32 zext(i16/i8)) | No | byte-shift-in-reg |
23606 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
23607 // +====================+============+=======================================+
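  // E.g. (illustrative sketch): shifting v8i16 by a plain i32 amount without
  // SSE4.1 takes the last row, so the amount becomes
  //   build_vector <4 x i32> (ShAmt, 0, undef, undef)
  // which is bitcast to v8i16 before issuing the variable VSHL/VSRL/VSRA.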
23608
23609 if (SVT == MVT::i64)
23610 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
23611 else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
23612 ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
23613 (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
23614 ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
23615 ShAmt = ShAmt.getOperand(0);
23616 MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
23617 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
23618 if (Subtarget.hasSSE41())
23619 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
23620 MVT::v2i64, ShAmt);
23621 else {
23622 SDValue ByteShift = DAG.getTargetConstant(
23623 (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
23624 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
23625 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
23626 ByteShift);
23627 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
23628 ByteShift);
23629 }
23630 } else if (Subtarget.hasSSE41() &&
23631 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23632 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
23633 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
23634 MVT::v2i64, ShAmt);
23635 } else {
23636 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
23637 DAG.getUNDEF(SVT)};
23638 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
23639 }
23640
23641 // The return type has to be a 128-bit type with the same element
23642 // type as the input type.
23643 MVT EltVT = VT.getVectorElementType();
23644 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
23645
23646 ShAmt = DAG.getBitcast(ShVT, ShAmt);
23647 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
23648}
23649
23650/// Return Mask with the necessary casting or extending
23651/// for \p Mask according to \p MaskVT when lowering masking intrinsics
23652static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
23653 const X86Subtarget &Subtarget, SelectionDAG &DAG,
23654 const SDLoc &dl) {
23655
23656 if (isAllOnesConstant(Mask))
23657 return DAG.getConstant(1, dl, MaskVT);
23658 if (X86::isZeroNode(Mask))
23659 return DAG.getConstant(0, dl, MaskVT);
23660
23661   assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
23662
23663 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
23664     assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
23665     assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
23666     // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
23667 SDValue Lo, Hi;
23668 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
23669 DAG.getConstant(0, dl, MVT::i32));
23670 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
23671 DAG.getConstant(1, dl, MVT::i32));
23672
23673 Lo = DAG.getBitcast(MVT::v32i1, Lo);
23674 Hi = DAG.getBitcast(MVT::v32i1, Hi);
23675
23676 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
23677 } else {
23678 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
23679 Mask.getSimpleValueType().getSizeInBits());
23680     // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
23681     // are extracted by EXTRACT_SUBVECTOR.
23682 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
23683 DAG.getBitcast(BitcastVT, Mask),
23684 DAG.getIntPtrConstant(0, dl));
23685 }
23686}
23687
23688/// Return (and \p Op, \p Mask) for compare instructions or
23689/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
23690/// necessary casting or extending for \p Mask when lowering masking intrinsics
23691static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
23692 SDValue PreservedSrc,
23693 const X86Subtarget &Subtarget,
23694 SelectionDAG &DAG) {
23695 MVT VT = Op.getSimpleValueType();
23696 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
23697 unsigned OpcodeSelect = ISD::VSELECT;
23698 SDLoc dl(Op);
23699
23700 if (isAllOnesConstant(Mask))
23701 return Op;
23702
23703 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
23704
23705 if (PreservedSrc.isUndef())
23706 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
23707 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
23708}
23709
23710/// Creates an SDNode for a predicated scalar operation.
23711/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
23712 /// The mask comes in as MVT::i8 and should be transformed
23713 /// to MVT::v1i1 when lowering masking intrinsics.
23714/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
23715/// "X86select" instead of "vselect". We just can't create the "vselect" node
23716/// for a scalar instruction.
23717static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
23718 SDValue PreservedSrc,
23719 const X86Subtarget &Subtarget,
23720 SelectionDAG &DAG) {
23721
23722 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
23723 if (MaskConst->getZExtValue() & 0x1)
23724 return Op;
23725
23726 MVT VT = Op.getSimpleValueType();
23727 SDLoc dl(Op);
23728
23729   assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
23730 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
23731 DAG.getBitcast(MVT::v8i1, Mask),
23732 DAG.getIntPtrConstant(0, dl));
23733 if (Op.getOpcode() == X86ISD::FSETCCM ||
23734 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
23735 Op.getOpcode() == X86ISD::VFPCLASSS)
23736 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
23737
23738 if (PreservedSrc.isUndef())
23739 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
23740 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
23741}
23742
23743static int getSEHRegistrationNodeSize(const Function *Fn) {
23744 if (!Fn->hasPersonalityFn())
23745 report_fatal_error(
23746 "querying registration node size for function without personality");
23747 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
23748 // WinEHStatePass for the full struct definition.
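// (6 words * 4 bytes = 24 bytes for SEH and 4 words * 4 bytes = 16 bytes for
// C++ EH, matching the returns below.)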
23749 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
23750 case EHPersonality::MSVC_X86SEH: return 24;
23751 case EHPersonality::MSVC_CXX: return 16;
23752 default: break;
23753 }
23754 report_fatal_error(
23755 "can only recover FP for 32-bit MSVC EH personality functions");
23756}
23757
23758/// When the MSVC runtime transfers control to us, either to an outlined
23759/// function or when returning to a parent frame after catching an exception, we
23760/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
23761/// Here's the math:
23762/// RegNodeBase = EntryEBP - RegNodeSize
23763/// ParentFP = RegNodeBase - ParentFrameOffset
23764/// Subtracting RegNodeSize takes us to the offset of the registration node, and
23765/// subtracting the offset (negative on x86) takes us back to the parent FP.
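/// For example (numbers purely illustrative), with MSVC x86 SEH
/// (RegNodeSize = 24), EntryEBP = 0x1000 and ParentFrameOffset = -16:
/// RegNodeBase = 0x1000 - 24 = 0xFE8 and ParentFP = 0xFE8 - (-16) = 0xFF8.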
23766static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
23767 SDValue EntryEBP) {
23768 MachineFunction &MF = DAG.getMachineFunction();
23769 SDLoc dl;
23770
23771 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23772 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
23773
23774 // It's possible that the parent function no longer has a personality function
23775 // if the exceptional code was optimized away, in which case we just return
23776 // the incoming EBP.
23777 if (!Fn->hasPersonalityFn())
23778 return EntryEBP;
23779
23780 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
23781 // registration, or the .set_setframe offset.
23782 MCSymbol *OffsetSym =
23783 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
23784 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
23785 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
23786 SDValue ParentFrameOffset =
23787 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
23788
23789 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
23790 // prologue to RBP in the parent function.
23791 const X86Subtarget &Subtarget =
23792 static_cast<const X86Subtarget &>(DAG.getSubtarget());
23793 if (Subtarget.is64Bit())
23794 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
23795
23796 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
23797 // RegNodeBase = EntryEBP - RegNodeSize
23798 // ParentFP = RegNodeBase - ParentFrameOffset
23799 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
23800 DAG.getConstant(RegNodeSize, dl, PtrVT));
23801 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
23802}
23803
23804SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
23805 SelectionDAG &DAG) const {
23806 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
23807 auto isRoundModeCurDirection = [](SDValue Rnd) {
23808 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
23809 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
23810
23811 return false;
23812 };
23813 auto isRoundModeSAE = [](SDValue Rnd) {
23814 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
23815 unsigned RC = C->getZExtValue();
23816 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
23817 // Clear the NO_EXC bit and check remaining bits.
23818 RC ^= X86::STATIC_ROUNDING::NO_EXC;
23819 // As a convenience we allow either no other bits set or an explicit
23820 // current-direction encoding.
23821 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
23822 }
23823 }
23824
23825 return false;
23826 };
23827 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
23828 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
23829 RC = C->getZExtValue();
23830 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
23831 // Clear the NO_EXC bit and check remaining bits.
23832 RC ^= X86::STATIC_ROUNDING::NO_EXC;
23833 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
23834 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
23835 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
23836 RC == X86::STATIC_ROUNDING::TO_ZERO;
23837 }
23838 }
23839
23840 return false;
23841 };
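// In summary: isRoundModeCurDirection accepts only CUR_DIRECTION,
// isRoundModeSAE accepts NO_EXC on its own (optionally combined with
// CUR_DIRECTION), and isRoundModeSAEToX accepts NO_EXC combined with an
// explicit rounding direction, which it returns in RC.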
23842
23843 SDLoc dl(Op);
23844 unsigned IntNo = Op.getConstantOperandVal(0);
23845 MVT VT = Op.getSimpleValueType();
23846 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
23847
23848 if (IntrData) {
23849 switch(IntrData->Type) {
23850 case INTR_TYPE_1OP: {
23851 // We specify 2 possible opcodes for intrinsics with rounding modes.
23852 // First, we check if the intrinsic may have non-default rounding mode,
23853 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
23854 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
23855 if (IntrWithRoundingModeOpcode != 0) {
23856 SDValue Rnd = Op.getOperand(2);
23857 unsigned RC = 0;
23858 if (isRoundModeSAEToX(Rnd, RC))
23859 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
23860 Op.getOperand(1),
23861 DAG.getTargetConstant(RC, dl, MVT::i32));
23862 if (!isRoundModeCurDirection(Rnd))
23863 return SDValue();
23864 }
23865 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
23866 Op.getOperand(1));
23867 }
23868 case INTR_TYPE_1OP_SAE: {
23869 SDValue Sae = Op.getOperand(2);
23870
23871 unsigned Opc;
23872 if (isRoundModeCurDirection(Sae))
23873 Opc = IntrData->Opc0;
23874 else if (isRoundModeSAE(Sae))
23875 Opc = IntrData->Opc1;
23876 else
23877 return SDValue();
23878
23879 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
23880 }
23881 case INTR_TYPE_2OP: {
23882 SDValue Src2 = Op.getOperand(2);
23883
23884 // We specify 2 possible opcodes for intrinsics with rounding modes.
23885 // First, we check if the intrinsic may have non-default rounding mode,
23886 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
23887 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
23888 if (IntrWithRoundingModeOpcode != 0) {
23889 SDValue Rnd = Op.getOperand(3);
23890 unsigned RC = 0;
23891 if (isRoundModeSAEToX(Rnd, RC))
23892 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
23893 Op.getOperand(1), Src2,
23894 DAG.getTargetConstant(RC, dl, MVT::i32));
23895 if (!isRoundModeCurDirection(Rnd))
23896 return SDValue();
23897 }
23898
23899 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
23900 Op.getOperand(1), Src2);
23901 }
23902 case INTR_TYPE_2OP_SAE: {
23903 SDValue Sae = Op.getOperand(3);
23904
23905 unsigned Opc;
23906 if (isRoundModeCurDirection(Sae))
23907 Opc = IntrData->Opc0;
23908 else if (isRoundModeSAE(Sae))
23909 Opc = IntrData->Opc1;
23910 else
23911 return SDValue();
23912
23913 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
23914 Op.getOperand(2));
23915 }
23916 case INTR_TYPE_3OP:
23917 case INTR_TYPE_3OP_IMM8: {
23918 SDValue Src1 = Op.getOperand(1);
23919 SDValue Src2 = Op.getOperand(2);
23920 SDValue Src3 = Op.getOperand(3);
23921
23922 // We specify 2 possible opcodes for intrinsics with rounding modes.
23923 // First, we check if the intrinsic may have non-default rounding mode,
23924 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
23925 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
23926 if (IntrWithRoundingModeOpcode != 0) {
23927 SDValue Rnd = Op.getOperand(4);
23928 unsigned RC = 0;
23929 if (isRoundModeSAEToX(Rnd, RC))
23930 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
23931 Src1, Src2, Src3,
23932 DAG.getTargetConstant(RC, dl, MVT::i32));
23933 if (!isRoundModeCurDirection(Rnd))
23934 return SDValue();
23935 }
23936
23937 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
23938 {Src1, Src2, Src3});
23939 }
23940 case INTR_TYPE_4OP:
23941 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
23942 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
23943 case INTR_TYPE_1OP_MASK: {
23944 SDValue Src = Op.getOperand(1);
23945 SDValue PassThru = Op.getOperand(2);
23946 SDValue Mask = Op.getOperand(3);
23947 // We add rounding mode to the Node when
23948 // - RC Opcode is specified and
23949 // - RC is not "current direction".
23950 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
23951 if (IntrWithRoundingModeOpcode != 0) {
23952 SDValue Rnd = Op.getOperand(4);
23953 unsigned RC = 0;
23954 if (isRoundModeSAEToX(Rnd, RC))
23955 return getVectorMaskingNode(
23956 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
23957 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
23958 Mask, PassThru, Subtarget, DAG);
23959 if (!isRoundModeCurDirection(Rnd))
23960 return SDValue();
23961 }
23962 return getVectorMaskingNode(
23963 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
23964 Subtarget, DAG);
23965 }
23966 case INTR_TYPE_1OP_MASK_SAE: {
23967 SDValue Src = Op.getOperand(1);
23968 SDValue PassThru = Op.getOperand(2);
23969 SDValue Mask = Op.getOperand(3);
23970 SDValue Rnd = Op.getOperand(4);
23971
23972 unsigned Opc;
23973 if (isRoundModeCurDirection(Rnd))
23974 Opc = IntrData->Opc0;
23975 else if (isRoundModeSAE(Rnd))
23976 Opc = IntrData->Opc1;
23977 else
23978 return SDValue();
23979
23980 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
23981 Subtarget, DAG);
23982 }
23983 case INTR_TYPE_SCALAR_MASK: {
23984 SDValue Src1 = Op.getOperand(1);
23985 SDValue Src2 = Op.getOperand(2);
23986 SDValue passThru = Op.getOperand(3);
23987 SDValue Mask = Op.getOperand(4);
23988 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
23989 // There are 2 kinds of intrinsics in this group:
23990 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
23991 // (2) With rounding mode and sae - 7 operands.
23992 bool HasRounding = IntrWithRoundingModeOpcode != 0;
23993 if (Op.getNumOperands() == (5U + HasRounding)) {
23994 if (HasRounding) {
23995 SDValue Rnd = Op.getOperand(5);
23996 unsigned RC = 0;
23997 if (isRoundModeSAEToX(Rnd, RC))
23998 return getScalarMaskingNode(
23999 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
24000 DAG.getTargetConstant(RC, dl, MVT::i32)),
24001 Mask, passThru, Subtarget, DAG);
24002 if (!isRoundModeCurDirection(Rnd))
24003 return SDValue();
24004 }
24005 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
24006 Src2),
24007 Mask, passThru, Subtarget, DAG);
24008 }
24009
24010 assert(Op.getNumOperands() == (6U + HasRounding) &&
24011        "Unexpected intrinsic form");
24012 SDValue RoundingMode = Op.getOperand(5);
24013 unsigned Opc = IntrData->Opc0;
24014 if (HasRounding) {
24015 SDValue Sae = Op.getOperand(6);
24016 if (isRoundModeSAE(Sae))
24017 Opc = IntrWithRoundingModeOpcode;
24018 else if (!isRoundModeCurDirection(Sae))
24019 return SDValue();
24020 }
24021 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
24022 Src2, RoundingMode),
24023 Mask, passThru, Subtarget, DAG);
24024 }
24025 case INTR_TYPE_SCALAR_MASK_RND: {
24026 SDValue Src1 = Op.getOperand(1);
24027 SDValue Src2 = Op.getOperand(2);
24028 SDValue passThru = Op.getOperand(3);
24029 SDValue Mask = Op.getOperand(4);
24030 SDValue Rnd = Op.getOperand(5);
24031
24032 SDValue NewOp;
24033 unsigned RC = 0;
24034 if (isRoundModeCurDirection(Rnd))
24035 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
24036 else if (isRoundModeSAEToX(Rnd, RC))
24037 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
24038 DAG.getTargetConstant(RC, dl, MVT::i32));
24039 else
24040 return SDValue();
24041
24042 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
24043 }
24044 case INTR_TYPE_SCALAR_MASK_SAE: {
24045 SDValue Src1 = Op.getOperand(1);
24046 SDValue Src2 = Op.getOperand(2);
24047 SDValue passThru = Op.getOperand(3);
24048 SDValue Mask = Op.getOperand(4);
24049 SDValue Sae = Op.getOperand(5);
24050 unsigned Opc;
24051 if (isRoundModeCurDirection(Sae))
24052 Opc = IntrData->Opc0;
24053 else if (isRoundModeSAE(Sae))
24054 Opc = IntrData->Opc1;
24055 else
24056 return SDValue();
24057
24058 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
24059 Mask, passThru, Subtarget, DAG);
24060 }
24061 case INTR_TYPE_2OP_MASK: {
24062 SDValue Src1 = Op.getOperand(1);
24063 SDValue Src2 = Op.getOperand(2);
24064 SDValue PassThru = Op.getOperand(3);
24065 SDValue Mask = Op.getOperand(4);
24066 SDValue NewOp;
24067 if (IntrData->Opc1 != 0) {
24068 SDValue Rnd = Op.getOperand(5);
24069 unsigned RC = 0;
24070 if (isRoundModeSAEToX(Rnd, RC))
24071 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
24072 DAG.getTargetConstant(RC, dl, MVT::i32));
24073 else if (!isRoundModeCurDirection(Rnd))
24074 return SDValue();
24075 }
24076 if (!NewOp)
24077 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
24078 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
24079 }
24080 case INTR_TYPE_2OP_MASK_SAE: {
24081 SDValue Src1 = Op.getOperand(1);
24082 SDValue Src2 = Op.getOperand(2);
24083 SDValue PassThru = Op.getOperand(3);
24084 SDValue Mask = Op.getOperand(4);
24085
24086 unsigned Opc = IntrData->Opc0;
24087 if (IntrData->Opc1 != 0) {
24088 SDValue Sae = Op.getOperand(5);
24089 if (isRoundModeSAE(Sae))
24090 Opc = IntrData->Opc1;
24091 else if (!isRoundModeCurDirection(Sae))
24092 return SDValue();
24093 }
24094
24095 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
24096 Mask, PassThru, Subtarget, DAG);
24097 }
24098 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
24099 SDValue Src1 = Op.getOperand(1);
24100 SDValue Src2 = Op.getOperand(2);
24101 SDValue Src3 = Op.getOperand(3);
24102 SDValue PassThru = Op.getOperand(4);
24103 SDValue Mask = Op.getOperand(5);
24104 SDValue Sae = Op.getOperand(6);
24105 unsigned Opc;
24106 if (isRoundModeCurDirection(Sae))
24107 Opc = IntrData->Opc0;
24108 else if (isRoundModeSAE(Sae))
24109 Opc = IntrData->Opc1;
24110 else
24111 return SDValue();
24112
24113 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
24114 Mask, PassThru, Subtarget, DAG);
24115 }
24116 case INTR_TYPE_3OP_MASK_SAE: {
24117 SDValue Src1 = Op.getOperand(1);
24118 SDValue Src2 = Op.getOperand(2);
24119 SDValue Src3 = Op.getOperand(3);
24120 SDValue PassThru = Op.getOperand(4);
24121 SDValue Mask = Op.getOperand(5);
24122
24123 unsigned Opc = IntrData->Opc0;
24124 if (IntrData->Opc1 != 0) {
24125 SDValue Sae = Op.getOperand(6);
24126 if (isRoundModeSAE(Sae))
24127 Opc = IntrData->Opc1;
24128 else if (!isRoundModeCurDirection(Sae))
24129 return SDValue();
24130 }
24131 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
24132 Mask, PassThru, Subtarget, DAG);
24133 }
24134 case BLENDV: {
24135 SDValue Src1 = Op.getOperand(1);
24136 SDValue Src2 = Op.getOperand(2);
24137 SDValue Src3 = Op.getOperand(3);
24138
24139 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
24140 Src3 = DAG.getBitcast(MaskVT, Src3);
24141
24142 // Reverse the operands to match VSELECT order.
24143 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
24144 }
24145 case VPERM_2OP : {
24146 SDValue Src1 = Op.getOperand(1);
24147 SDValue Src2 = Op.getOperand(2);
24148
24149 // Swap Src1 and Src2 in the node creation
24150 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
24151 }
24152 case IFMA_OP:
24153 // NOTE: We need to swizzle the operands to pass the multiply operands
24154 // first.
24155 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
24156 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
24157 case FPCLASSS: {
24158 SDValue Src1 = Op.getOperand(1);
24159 SDValue Imm = Op.getOperand(2);
24160 SDValue Mask = Op.getOperand(3);
24161 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
24162 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
24163 Subtarget, DAG);
24164 // Need to fill with zeros to ensure the bitcast will produce zeroes
24165 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
24166 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
24167 DAG.getConstant(0, dl, MVT::v8i1),
24168 FPclassMask, DAG.getIntPtrConstant(0, dl));
24169 return DAG.getBitcast(MVT::i8, Ins);
24170 }
24171
24172 case CMP_MASK_CC: {
24173 MVT MaskVT = Op.getSimpleValueType();
24174 SDValue CC = Op.getOperand(3);
24175 // We specify 2 possible opcodes for intrinsics with rounding modes.
24176 // First, we check if the intrinsic may have non-default rounding mode,
24177 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
24178 if (IntrData->Opc1 != 0) {
24179 SDValue Sae = Op.getOperand(4);
24180 if (isRoundModeSAE(Sae))
24181 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
24182 Op.getOperand(2), CC, Sae);
24183 if (!isRoundModeCurDirection(Sae))
24184 return SDValue();
24185 }
24186 //default rounding mode
24187 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
24188 {Op.getOperand(1), Op.getOperand(2), CC});
24189 }
24190 case CMP_MASK_SCALAR_CC: {
24191 SDValue Src1 = Op.getOperand(1);
24192 SDValue Src2 = Op.getOperand(2);
24193 SDValue CC = Op.getOperand(3);
24194 SDValue Mask = Op.getOperand(4);
24195
24196 SDValue Cmp;
24197 if (IntrData->Opc1 != 0) {
24198 SDValue Sae = Op.getOperand(5);
24199 if (isRoundModeSAE(Sae))
24200 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
24201 else if (!isRoundModeCurDirection(Sae))
24202 return SDValue();
24203 }
24204 //default rounding mode
24205 if (!Cmp.getNode())
24206 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
24207
24208 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
24209 Subtarget, DAG);
24210 // Need to fill with zeros to ensure the bitcast will produce zeroes
24211 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
24212 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
24213 DAG.getConstant(0, dl, MVT::v8i1),
24214 CmpMask, DAG.getIntPtrConstant(0, dl));
24215 return DAG.getBitcast(MVT::i8, Ins);
24216 }
24217 case COMI: { // Comparison intrinsics
24218 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
24219 SDValue LHS = Op.getOperand(1);
24220 SDValue RHS = Op.getOperand(2);
24221 // Some conditions require the operands to be swapped.
24222 if (CC == ISD::SETLT || CC == ISD::SETLE)
24223 std::swap(LHS, RHS);
24224
24225 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
24226 SDValue SetCC;
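// (U)COMIS* sets ZF/PF/CF to 1/1/1 for unordered, 0/0/0 for greater, 0/0/1
// for less and 1/0/0 for equal, so SETEQ/SETNE also have to test PF to give
// NaN operands not-equal semantics.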
24227 switch (CC) {
24228 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
24229 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
24230 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
24231 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
24232 break;
24233 }
24234 case ISD::SETNE: { // (ZF = 1 or PF = 1)
24235 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
24236 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
24237 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
24238 break;
24239 }
24240 case ISD::SETGT: // (CF = 0 and ZF = 0)
24241 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
24242 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
24243 break;
24244 }
24245 case ISD::SETGE: // CF = 0
24246 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
24247 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
24248 break;
24249 default:
24250 llvm_unreachable("Unexpected illegal condition!");
24251 }
24252 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
24253 }
24254 case COMI_RM: { // Comparison intrinsics with Sae
24255 SDValue LHS = Op.getOperand(1);
24256 SDValue RHS = Op.getOperand(2);
24257 unsigned CondVal = Op.getConstantOperandVal(3);
24258 SDValue Sae = Op.getOperand(4);
24259
24260 SDValue FCmp;
24261 if (isRoundModeCurDirection(Sae))
24262 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
24263 DAG.getTargetConstant(CondVal, dl, MVT::i8));
24264 else if (isRoundModeSAE(Sae))
24265 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
24266 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
24267 else
24268 return SDValue();
24269 // Need to fill with zeros to ensure the bitcast will produce zeroes
24270 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
24271 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
24272 DAG.getConstant(0, dl, MVT::v16i1),
24273 FCmp, DAG.getIntPtrConstant(0, dl));
24274 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
24275 DAG.getBitcast(MVT::i16, Ins));
24276 }
24277 case VSHIFT:
24278 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
24279 Op.getOperand(1), Op.getOperand(2), Subtarget,
24280 DAG);
24281 case COMPRESS_EXPAND_IN_REG: {
24282 SDValue Mask = Op.getOperand(3);
24283 SDValue DataToCompress = Op.getOperand(1);
24284 SDValue PassThru = Op.getOperand(2);
24285 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
24286 return Op.getOperand(1);
24287
24288 // Avoid false dependency.
24289 if (PassThru.isUndef())
24290 PassThru = DAG.getConstant(0, dl, VT);
24291
24292 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
24293 Mask);
24294 }
24295 case FIXUPIMM:
24296 case FIXUPIMM_MASKZ: {
24297 SDValue Src1 = Op.getOperand(1);
24298 SDValue Src2 = Op.getOperand(2);
24299 SDValue Src3 = Op.getOperand(3);
24300 SDValue Imm = Op.getOperand(4);
24301 SDValue Mask = Op.getOperand(5);
24302 SDValue Passthru = (IntrData->Type == FIXUPIMM)
24303 ? Src1
24304 : getZeroVector(VT, Subtarget, DAG, dl);
24305
24306 unsigned Opc = IntrData->Opc0;
24307 if (IntrData->Opc1 != 0) {
24308 SDValue Sae = Op.getOperand(6);
24309 if (isRoundModeSAE(Sae))
24310 Opc = IntrData->Opc1;
24311 else if (!isRoundModeCurDirection(Sae))
24312 return SDValue();
24313 }
24314
24315 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
24316
24317 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
24318 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
24319
24320 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
24321 }
24322 case ROUNDP: {
24323 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
24324 // Clear the upper bits of the rounding immediate so that the legacy
24325 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
24326 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
24327 SDValue RoundingMode =
24328 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
24329 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
24330 Op.getOperand(1), RoundingMode);
24331 }
24332 case ROUNDS: {
24333 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
24334 // Clear the upper bits of the rounding immediate so that the legacy
24335 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
24336 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
24337 SDValue RoundingMode =
24338 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
24339 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
24340 Op.getOperand(1), Op.getOperand(2), RoundingMode);
24341 }
24342 case BEXTRI: {
24343 assert(IntrData->Opc0 == X86ISD::BEXTR && "Unexpected opcode");
24344
24345 // The control is a TargetConstant, but we need to convert it to a
24346 // ConstantSDNode.
24347 uint64_t Imm = Op.getConstantOperandVal(2);
24348 SDValue Control = DAG.getConstant(Imm, dl, Op.getValueType());
24349 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
24350 Op.getOperand(1), Control);
24351 }
24352 // ADC/ADCX/SBB
24353 case ADX: {
24354 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
24355 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
24356
24357 SDValue Res;
24358 // If the carry in is zero, then we should just use ADD/SUB instead of
24359 // ADC/SBB.
24360 if (isNullConstant(Op.getOperand(1))) {
24361 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
24362 Op.getOperand(3));
24363 } else {
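// Adding -1 (0xFF) to a nonzero i8 carry-in wraps around and sets CF,
// re-materializing the incoming carry flag for the ADC/SBB below.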
24364 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
24365 DAG.getConstant(-1, dl, MVT::i8));
24366 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
24367 Op.getOperand(3), GenCF.getValue(1));
24368 }
24369 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
24370 SDValue Results[] = { SetCC, Res };
24371 return DAG.getMergeValues(Results, dl);
24372 }
24373 case CVTPD2PS_MASK:
24374 case CVTPD2DQ_MASK:
24375 case CVTQQ2PS_MASK:
24376 case TRUNCATE_TO_REG: {
24377 SDValue Src = Op.getOperand(1);
24378 SDValue PassThru = Op.getOperand(2);
24379 SDValue Mask = Op.getOperand(3);
24380
24381 if (isAllOnesConstant(Mask))
24382 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
24383
24384 MVT SrcVT = Src.getSimpleValueType();
24385 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
24386 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
24387 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
24388 {Src, PassThru, Mask});
24389 }
24390 case CVTPS2PH_MASK: {
24391 SDValue Src = Op.getOperand(1);
24392 SDValue Rnd = Op.getOperand(2);
24393 SDValue PassThru = Op.getOperand(3);
24394 SDValue Mask = Op.getOperand(4);
24395
24396 if (isAllOnesConstant(Mask))
24397 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
24398
24399 MVT SrcVT = Src.getSimpleValueType();
24400 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
24401 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
24402 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
24403 PassThru, Mask);
24404
24405 }
24406 case CVTNEPS2BF16_MASK: {
24407 SDValue Src = Op.getOperand(1);
24408 SDValue PassThru = Op.getOperand(2);
24409 SDValue Mask = Op.getOperand(3);
24410
24411 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
24412 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
24413
24414 // Break false dependency.
24415 if (PassThru.isUndef())
24416 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
24417
24418 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
24419 Mask);
24420 }
24421 default:
24422 break;
24423 }
24424 }
24425
24426 switch (IntNo) {
24427 default: return SDValue(); // Don't custom lower most intrinsics.
24428
24429 // ptest and testp intrinsics. The intrinsics these come from are designed to
24430 // return an integer value, not just an instruction, so lower them to the ptest
24431 // or testp pattern and a setcc for the result.
24432 case Intrinsic::x86_avx512_ktestc_b:
24433 case Intrinsic::x86_avx512_ktestc_w:
24434 case Intrinsic::x86_avx512_ktestc_d:
24435 case Intrinsic::x86_avx512_ktestc_q:
24436 case Intrinsic::x86_avx512_ktestz_b:
24437 case Intrinsic::x86_avx512_ktestz_w:
24438 case Intrinsic::x86_avx512_ktestz_d:
24439 case Intrinsic::x86_avx512_ktestz_q:
24440 case Intrinsic::x86_sse41_ptestz:
24441 case Intrinsic::x86_sse41_ptestc:
24442 case Intrinsic::x86_sse41_ptestnzc:
24443 case Intrinsic::x86_avx_ptestz_256:
24444 case Intrinsic::x86_avx_ptestc_256:
24445 case Intrinsic::x86_avx_ptestnzc_256:
24446 case Intrinsic::x86_avx_vtestz_ps:
24447 case Intrinsic::x86_avx_vtestc_ps:
24448 case Intrinsic::x86_avx_vtestnzc_ps:
24449 case Intrinsic::x86_avx_vtestz_pd:
24450 case Intrinsic::x86_avx_vtestc_pd:
24451 case Intrinsic::x86_avx_vtestnzc_pd:
24452 case Intrinsic::x86_avx_vtestz_ps_256:
24453 case Intrinsic::x86_avx_vtestc_ps_256:
24454 case Intrinsic::x86_avx_vtestnzc_ps_256:
24455 case Intrinsic::x86_avx_vtestz_pd_256:
24456 case Intrinsic::x86_avx_vtestc_pd_256:
24457 case Intrinsic::x86_avx_vtestnzc_pd_256: {
24458 unsigned TestOpc = X86ISD::PTEST;
24459 X86::CondCode X86CC;
24460 switch (IntNo) {
24461 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
24462 case Intrinsic::x86_avx512_ktestc_b:
24463 case Intrinsic::x86_avx512_ktestc_w:
24464 case Intrinsic::x86_avx512_ktestc_d:
24465 case Intrinsic::x86_avx512_ktestc_q:
24466 // CF = 1
24467 TestOpc = X86ISD::KTEST;
24468 X86CC = X86::COND_B;
24469 break;
24470 case Intrinsic::x86_avx512_ktestz_b:
24471 case Intrinsic::x86_avx512_ktestz_w:
24472 case Intrinsic::x86_avx512_ktestz_d:
24473 case Intrinsic::x86_avx512_ktestz_q:
24474 TestOpc = X86ISD::KTEST;
24475 X86CC = X86::COND_E;
24476 break;
24477 case Intrinsic::x86_avx_vtestz_ps:
24478 case Intrinsic::x86_avx_vtestz_pd:
24479 case Intrinsic::x86_avx_vtestz_ps_256:
24480 case Intrinsic::x86_avx_vtestz_pd_256:
24481 TestOpc = X86ISD::TESTP;
24482 LLVM_FALLTHROUGH;
24483 case Intrinsic::x86_sse41_ptestz:
24484 case Intrinsic::x86_avx_ptestz_256:
24485 // ZF = 1
24486 X86CC = X86::COND_E;
24487 break;
24488 case Intrinsic::x86_avx_vtestc_ps:
24489 case Intrinsic::x86_avx_vtestc_pd:
24490 case Intrinsic::x86_avx_vtestc_ps_256:
24491 case Intrinsic::x86_avx_vtestc_pd_256:
24492 TestOpc = X86ISD::TESTP;
24493 LLVM_FALLTHROUGH;
24494 case Intrinsic::x86_sse41_ptestc:
24495 case Intrinsic::x86_avx_ptestc_256:
24496 // CF = 1
24497 X86CC = X86::COND_B;
24498 break;
24499 case Intrinsic::x86_avx_vtestnzc_ps:
24500 case Intrinsic::x86_avx_vtestnzc_pd:
24501 case Intrinsic::x86_avx_vtestnzc_ps_256:
24502 case Intrinsic::x86_avx_vtestnzc_pd_256:
24503 TestOpc = X86ISD::TESTP;
24504 LLVM_FALLTHROUGH;
24505 case Intrinsic::x86_sse41_ptestnzc:
24506 case Intrinsic::x86_avx_ptestnzc_256:
24507 // ZF and CF = 0
24508 X86CC = X86::COND_A;
24509 break;
24510 }
24511
24512 SDValue LHS = Op.getOperand(1);
24513 SDValue RHS = Op.getOperand(2);
24514 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
24515 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
24516 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
24517 }
24518
24519 case Intrinsic::x86_sse42_pcmpistria128:
24520 case Intrinsic::x86_sse42_pcmpestria128:
24521 case Intrinsic::x86_sse42_pcmpistric128:
24522 case Intrinsic::x86_sse42_pcmpestric128:
24523 case Intrinsic::x86_sse42_pcmpistrio128:
24524 case Intrinsic::x86_sse42_pcmpestrio128:
24525 case Intrinsic::x86_sse42_pcmpistris128:
24526 case Intrinsic::x86_sse42_pcmpestris128:
24527 case Intrinsic::x86_sse42_pcmpistriz128:
24528 case Intrinsic::x86_sse42_pcmpestriz128: {
24529 unsigned Opcode;
24530 X86::CondCode X86CC;
24531 switch (IntNo) {
24532 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
24533 case Intrinsic::x86_sse42_pcmpistria128:
24534 Opcode = X86ISD::PCMPISTR;
24535 X86CC = X86::COND_A;
24536 break;
24537 case Intrinsic::x86_sse42_pcmpestria128:
24538 Opcode = X86ISD::PCMPESTR;
24539 X86CC = X86::COND_A;
24540 break;
24541 case Intrinsic::x86_sse42_pcmpistric128:
24542 Opcode = X86ISD::PCMPISTR;
24543 X86CC = X86::COND_B;
24544 break;
24545 case Intrinsic::x86_sse42_pcmpestric128:
24546 Opcode = X86ISD::PCMPESTR;
24547 X86CC = X86::COND_B;
24548 break;
24549 case Intrinsic::x86_sse42_pcmpistrio128:
24550 Opcode = X86ISD::PCMPISTR;
24551 X86CC = X86::COND_O;
24552 break;
24553 case Intrinsic::x86_sse42_pcmpestrio128:
24554 Opcode = X86ISD::PCMPESTR;
24555 X86CC = X86::COND_O;
24556 break;
24557 case Intrinsic::x86_sse42_pcmpistris128:
24558 Opcode = X86ISD::PCMPISTR;
24559 X86CC = X86::COND_S;
24560 break;
24561 case Intrinsic::x86_sse42_pcmpestris128:
24562 Opcode = X86ISD::PCMPESTR;
24563 X86CC = X86::COND_S;
24564 break;
24565 case Intrinsic::x86_sse42_pcmpistriz128:
24566 Opcode = X86ISD::PCMPISTR;
24567 X86CC = X86::COND_E;
24568 break;
24569 case Intrinsic::x86_sse42_pcmpestriz128:
24570 Opcode = X86ISD::PCMPESTR;
24571 X86CC = X86::COND_E;
24572 break;
24573 }
24574 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
24575 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
24576 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
24577 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
24578 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
24579 }
24580
24581 case Intrinsic::x86_sse42_pcmpistri128:
24582 case Intrinsic::x86_sse42_pcmpestri128: {
24583 unsigned Opcode;
24584 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
24585 Opcode = X86ISD::PCMPISTR;
24586 else
24587 Opcode = X86ISD::PCMPESTR;
24588
24589 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
24590 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
24591 return DAG.getNode(Opcode, dl, VTs, NewOps);
24592 }
24593
24594 case Intrinsic::x86_sse42_pcmpistrm128:
24595 case Intrinsic::x86_sse42_pcmpestrm128: {
24596 unsigned Opcode;
24597 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
24598 Opcode = X86ISD::PCMPISTR;
24599 else
24600 Opcode = X86ISD::PCMPESTR;
24601
24602 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
24603 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
24604 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
24605 }
24606
24607 case Intrinsic::eh_sjlj_lsda: {
24608 MachineFunction &MF = DAG.getMachineFunction();
24609 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24610 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
24611 auto &Context = MF.getMMI().getContext();
24612 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
24613 Twine(MF.getFunctionNumber()));
24614 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
24615 DAG.getMCSymbol(S, PtrVT));
24616 }
24617
24618 case Intrinsic::x86_seh_lsda: {
24619 // Compute the symbol for the LSDA. We know it'll get emitted later.
24620 MachineFunction &MF = DAG.getMachineFunction();
24621 SDValue Op1 = Op.getOperand(1);
24622 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
24623 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
24624 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
24625
24626 // Generate a simple absolute symbol reference. This intrinsic is only
24627 // supported on 32-bit Windows, which isn't PIC.
24628 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
24629 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
24630 }
24631
24632 case Intrinsic::eh_recoverfp: {
24633 SDValue FnOp = Op.getOperand(1);
24634 SDValue IncomingFPOp = Op.getOperand(2);
24635 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
24636 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
24637 if (!Fn)
24638 report_fatal_error(
24639 "llvm.eh.recoverfp must take a function as the first argument");
24640 return recoverFramePointer(DAG, Fn, IncomingFPOp);
24641 }
24642
24643 case Intrinsic::localaddress: {
24644 // Returns one of the stack, base, or frame pointer registers, depending on
24645 // which is used to reference local variables.
24646 MachineFunction &MF = DAG.getMachineFunction();
24647 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24648 unsigned Reg;
24649 if (RegInfo->hasBasePointer(MF))
24650 Reg = RegInfo->getBaseRegister();
24651 else { // Handles the SP or FP case.
24652 bool CantUseFP = RegInfo->needsStackRealignment(MF);
24653 if (CantUseFP)
24654 Reg = RegInfo->getPtrSizedStackRegister(MF);
24655 else
24656 Reg = RegInfo->getPtrSizedFrameRegister(MF);
24657 }
24658 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
24659 }
24660
24661 case Intrinsic::x86_avx512_vp2intersect_q_512:
24662 case Intrinsic::x86_avx512_vp2intersect_q_256:
24663 case Intrinsic::x86_avx512_vp2intersect_q_128:
24664 case Intrinsic::x86_avx512_vp2intersect_d_512:
24665 case Intrinsic::x86_avx512_vp2intersect_d_256:
24666 case Intrinsic::x86_avx512_vp2intersect_d_128: {
24667 MVT MaskVT = Op.getSimpleValueType();
24668
24669 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
24670 SDLoc DL(Op);
24671
24672 SDValue Operation =
24673 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
24674 Op->getOperand(1), Op->getOperand(2));
24675
24676 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
24677 MaskVT, Operation);
24678 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
24679 MaskVT, Operation);
24680 return DAG.getMergeValues({Result0, Result1}, DL);
24681 }
24682 case Intrinsic::x86_mmx_pslli_w:
24683 case Intrinsic::x86_mmx_pslli_d:
24684 case Intrinsic::x86_mmx_pslli_q:
24685 case Intrinsic::x86_mmx_psrli_w:
24686 case Intrinsic::x86_mmx_psrli_d:
24687 case Intrinsic::x86_mmx_psrli_q:
24688 case Intrinsic::x86_mmx_psrai_w:
24689 case Intrinsic::x86_mmx_psrai_d: {
24690 SDLoc DL(Op);
24691 SDValue ShAmt = Op.getOperand(2);
24692 // If the argument is a constant, convert it to a target constant.
24693 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
24694 // Clamp out of bounds shift amounts since they will otherwise be masked
24695 // to 8-bits which may make it no longer out of bounds.
24696 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
24697 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
24698 Op.getOperand(0), Op.getOperand(1),
24699 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
24700 }
24701
24702 unsigned NewIntrinsic;
24703 switch (IntNo) {
24704 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
24705 case Intrinsic::x86_mmx_pslli_w:
24706 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
24707 break;
24708 case Intrinsic::x86_mmx_pslli_d:
24709 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
24710 break;
24711 case Intrinsic::x86_mmx_pslli_q:
24712 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
24713 break;
24714 case Intrinsic::x86_mmx_psrli_w:
24715 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
24716 break;
24717 case Intrinsic::x86_mmx_psrli_d:
24718 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
24719 break;
24720 case Intrinsic::x86_mmx_psrli_q:
24721 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
24722 break;
24723 case Intrinsic::x86_mmx_psrai_w:
24724 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
24725 break;
24726 case Intrinsic::x86_mmx_psrai_d:
24727 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
24728 break;
24729 }
24730
24731 // The vector shift intrinsics with scalar shift amounts use 32-bit values,
24732 // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
24733 // MMX register.
24734 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
24735 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
24736 DAG.getConstant(NewIntrinsic, DL, MVT::i32),
24737 Op.getOperand(1), ShAmt);
24738
24739 }
24740 }
24741}
24742
24743static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
24744 SDValue Src, SDValue Mask, SDValue Base,
24745 SDValue Index, SDValue ScaleOp, SDValue Chain,
24746 const X86Subtarget &Subtarget) {
24747 SDLoc dl(Op);
24748 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
24749 // Scale must be constant.
24750 if (!C)
24751 return SDValue();
24752 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24753 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
24754 TLI.getPointerTy(DAG.getDataLayout()));
24755 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
24756 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
24757 // If source is undef or we know it won't be used, use a zero vector
24758 // to break register dependency.
24759 // TODO: use undef instead and let BreakFalseDeps deal with it?
24760 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
24761 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
24762
24763 // Cast mask to an integer type.
24764 Mask = DAG.getBitcast(MaskVT, Mask);
24765
24766 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
24767
24768 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
24769 SDValue Res =
24770 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
24771 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
24772 return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
24773}
24774
24775static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
24776 SDValue Src, SDValue Mask, SDValue Base,
24777 SDValue Index, SDValue ScaleOp, SDValue Chain,
24778 const X86Subtarget &Subtarget) {
24779 MVT VT = Op.getSimpleValueType();
24780 SDLoc dl(Op);
24781 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
24782 // Scale must be constant.
24783 if (!C)
24784 return SDValue();
24785 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24786 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
24787 TLI.getPointerTy(DAG.getDataLayout()));
24788 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
24789 VT.getVectorNumElements());
24790 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
24791
24792 // We support two versions of the gather intrinsics. One with scalar mask and
24793 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
24794 if (Mask.getValueType() != MaskVT)
24795 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
24796
24797 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
24798 // If source is undef or we know it won't be used, use a zero vector
24799 // to break register dependency.
24800 // TODO: use undef instead and let BreakFalseDeps deal with it?
24801 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
24802 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
24803
24804 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
24805
24806 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
24807 SDValue Res =
24808 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
24809 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
24810 return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
24811}
24812
24813static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
24814 SDValue Src, SDValue Mask, SDValue Base,
24815 SDValue Index, SDValue ScaleOp, SDValue Chain,
24816 const X86Subtarget &Subtarget) {
24817 SDLoc dl(Op);
24818 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
24819 // Scale must be constant.
24820 if (!C)
24821 return SDValue();
24822 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24823 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
24824 TLI.getPointerTy(DAG.getDataLayout()));
24825 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
24826 Src.getSimpleValueType().getVectorNumElements());
24827 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
24828
24829 // We support two versions of the scatter intrinsics. One with scalar mask and
24830 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
24831 if (Mask.getValueType() != MaskVT)
24832 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
24833
24834 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
24835
24836 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
24837 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
24838 SDValue Res =
24839 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
24840 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
24841 return Res.getValue(1);
24842}
24843
24844static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
24845 SDValue Mask, SDValue Base, SDValue Index,
24846 SDValue ScaleOp, SDValue Chain,
24847 const X86Subtarget &Subtarget) {
24848 SDLoc dl(Op);
24849 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
24850 // Scale must be constant.
24851 if (!C)
24852 return SDValue();
24853 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24854 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
24855 TLI.getPointerTy(DAG.getDataLayout()));
24856 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
24857 SDValue Segment = DAG.getRegister(0, MVT::i32);
24858 MVT MaskVT =
24859 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
24860 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
24861 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
24862 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
24863 return SDValue(Res, 0);
24864}
24865
24866/// Handles the lowering of builtin intrinsics with a chain that return their
24867/// value into registers EDX:EAX.
24868/// If operand SrcReg is a valid register identifier, then operand 2 of N is
24869/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
24870/// TargetOpcode.
24871/// Returns a Glue value which can be used to add an extra copy-from-reg if the
24872/// expanded intrinsic implicitly defines extra registers (i.e. not just
24873/// EDX:EAX).
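/// For example, getReadTimeStampCounter below passes SrcReg = 0 (no implicit
/// input register) when expanding RDTSC/RDTSCP.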
24874static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
24875 SelectionDAG &DAG,
24876 unsigned TargetOpcode,
24877 unsigned SrcReg,
24878 const X86Subtarget &Subtarget,
24879 SmallVectorImpl<SDValue> &Results) {
24880 SDValue Chain = N->getOperand(0);
24881 SDValue Glue;
24882
24883 if (SrcReg) {
24884 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
24885 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
24886 Glue = Chain.getValue(1);
24887 }
24888
24889 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24890 SDValue N1Ops[] = {Chain, Glue};
24891 SDNode *N1 = DAG.getMachineNode(
24892 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
24893 Chain = SDValue(N1, 0);
24894
24895 // The expanded instruction returns its 64-bit result in registers EDX:EAX.
24896 SDValue LO, HI;
24897 if (Subtarget.is64Bit()) {
24898 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
24899 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
24900 LO.getValue(2));
24901 } else {
24902 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
24903 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
24904 LO.getValue(2));
24905 }
24906 Chain = HI.getValue(1);
24907 Glue = HI.getValue(2);
24908
24909 if (Subtarget.is64Bit()) {
24910 // Merge the two 32-bit values into a 64-bit one.
24911 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
24912 DAG.getConstant(32, DL, MVT::i8));
24913 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
24914 Results.push_back(Chain);
24915 return Glue;
24916 }
24917
24918 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
24919 SDValue Ops[] = { LO, HI };
24920 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
24921 Results.push_back(Pair);
24922 Results.push_back(Chain);
24923 return Glue;
24924}
24925
24926/// Handles the lowering of builtin intrinsics that read the time stamp counter
24927/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
24928/// READCYCLECOUNTER nodes.
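/// For example, LowerREADCYCLECOUNTER below expands to X86::RDTSC through this
/// helper.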
24929static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
24930 SelectionDAG &DAG,
24931 const X86Subtarget &Subtarget,
24932 SmallVectorImpl<SDValue> &Results) {
24933 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
24934 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
24935 // and the EAX register is loaded with the low-order 32 bits.
24936 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
24937 /* NoRegister */0, Subtarget,
24938 Results);
24939 if (Opcode != X86::RDTSCP)
24940 return;
24941
24942 SDValue Chain = Results[1];
24943 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
24944 // the ECX register. Add 'ecx' explicitly to the chain.
24945 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
24946 Results[1] = ecx;
24947 Results.push_back(ecx.getValue(1));
24948}
24949
24950static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
24951 SelectionDAG &DAG) {
24952 SmallVector<SDValue, 3> Results;
24953 SDLoc DL(Op);
24954 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
24955 Results);
24956 return DAG.getMergeValues(Results, DL);
24957}
24958
24959static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
24960 MachineFunction &MF = DAG.getMachineFunction();
24961 SDValue Chain = Op.getOperand(0);
24962 SDValue RegNode = Op.getOperand(2);
24963 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
24964 if (!EHInfo)
24965 report_fatal_error("EH registrations only live in functions using WinEH");
24966
24967 // Cast the operand to an alloca, and remember the frame index.
24968 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
24969 if (!FINode)
24970 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
24971 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
24972
24973 // Return the chain operand without making any DAG nodes.
24974 return Chain;
24975}
24976
24977static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
24978 MachineFunction &MF = DAG.getMachineFunction();
24979 SDValue Chain = Op.getOperand(0);
24980 SDValue EHGuard = Op.getOperand(2);
24981 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
24982 if (!EHInfo)
24983 report_fatal_error("EHGuard only live in functions using WinEH");
24984
24985 // Cast the operand to an alloca, and remember the frame index.
24986 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
24987 if (!FINode)
24988 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
24989 EHInfo->EHGuardFrameIndex = FINode->getIndex();
24990
24991 // Return the chain operand without making any DAG nodes.
24992 return Chain;
24993}
24994
24995/// Emit Truncating Store with signed or unsigned saturation.
24996static SDValue
24997EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
24998 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
24999 SelectionDAG &DAG) {
25000 SDVTList VTs = DAG.getVTList(MVT::Other);
25001 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
25002 SDValue Ops[] = { Chain, Val, Ptr, Undef };
25003 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
25004 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
25005}
25006
25007/// Emit Masked Truncating Store with signed or unsigned saturation.
25008static SDValue
25009EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
25010 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
25011 MachineMemOperand *MMO, SelectionDAG &DAG) {
25012 SDVTList VTs = DAG.getVTList(MVT::Other);
25013 SDValue Ops[] = { Chain, Val, Ptr, Mask };
25014 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
25015 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
25016}
25017
25018static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
25019 SelectionDAG &DAG) {
25020 unsigned IntNo = Op.getConstantOperandVal(1);
25021 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
25022 if (!IntrData) {
25023 switch (IntNo) {
25024 case llvm::Intrinsic::x86_seh_ehregnode:
25025 return MarkEHRegistrationNode(Op, DAG);
25026 case llvm::Intrinsic::x86_seh_ehguard:
25027 return MarkEHGuard(Op, DAG);
25028 case llvm::Intrinsic::x86_rdpkru: {
25029 SDLoc dl(Op);
25030 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
25031 // Create a RDPKRU node and pass 0 to the ECX parameter.
25032 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
25033 DAG.getConstant(0, dl, MVT::i32));
25034 }
25035 case llvm::Intrinsic::x86_wrpkru: {
25036 SDLoc dl(Op);
25037 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
25038 // to the EDX and ECX parameters.
25039 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
25040 Op.getOperand(0), Op.getOperand(2),
25041 DAG.getConstant(0, dl, MVT::i32),
25042 DAG.getConstant(0, dl, MVT::i32));
25043 }
25044 case llvm::Intrinsic::x86_flags_read_u32:
25045 case llvm::Intrinsic::x86_flags_read_u64:
25046 case llvm::Intrinsic::x86_flags_write_u32:
25047 case llvm::Intrinsic::x86_flags_write_u64: {
25048 // We need a frame pointer because this will get lowered to a PUSH/POP
25049 // sequence.
25050 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
25051 MFI.setHasCopyImplyingStackAdjustment(true);
25052 // Don't do anything here, we will expand these intrinsics out later
25053 // during FinalizeISel in EmitInstrWithCustomInserter.
25054 return Op;
25055 }
25056 case Intrinsic::x86_lwpins32:
25057 case Intrinsic::x86_lwpins64:
25058 case Intrinsic::x86_umwait:
25059 case Intrinsic::x86_tpause: {
25060 SDLoc dl(Op);
25061 SDValue Chain = Op->getOperand(0);
25062 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
25063 unsigned Opcode;
25064
25065 switch (IntNo) {
25066 default: llvm_unreachable("Impossible intrinsic");
25067 case Intrinsic::x86_umwait:
25068 Opcode = X86ISD::UMWAIT;
25069 break;
25070 case Intrinsic::x86_tpause:
25071 Opcode = X86ISD::TPAUSE;
25072 break;
25073 case Intrinsic::x86_lwpins32:
25074 case Intrinsic::x86_lwpins64:
25075 Opcode = X86ISD::LWPINS;
25076 break;
25077 }
25078
25079 SDValue Operation =
25080 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
25081 Op->getOperand(3), Op->getOperand(4));
25082 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
25083 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
25084 Operation.getValue(1));
25085 }
25086 case Intrinsic::x86_enqcmd:
25087 case Intrinsic::x86_enqcmds: {
25088 SDLoc dl(Op);
25089 SDValue Chain = Op.getOperand(0);
25090 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
25091 unsigned Opcode;
25092 switch (IntNo) {
25093 default: llvm_unreachable("Impossible intrinsic!");
25094 case Intrinsic::x86_enqcmd:
25095 Opcode = X86ISD::ENQCMD;
25096 break;
25097 case Intrinsic::x86_enqcmds:
25098 Opcode = X86ISD::ENQCMDS;
25099 break;
25100 }
25101 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
25102 Op.getOperand(3));
25103 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
25104 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
25105 Operation.getValue(1));
25106 }
25107 }
25108 return SDValue();
25109 }
25110
25111 SDLoc dl(Op);
25112 switch(IntrData->Type) {
25113 default: llvm_unreachable("Unknown Intrinsic Type")::llvm::llvm_unreachable_internal("Unknown Intrinsic Type", "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25113)
;
25114 case RDSEED:
25115 case RDRAND: {
25116 // Emit the node with the right value type.
25117 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
25118 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
25119
25120 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
25121 // Otherwise return the value from Rand, which is always 0, casted to i32.
25122 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
25123 DAG.getConstant(1, dl, Op->getValueType(1)),
25124 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
25125 SDValue(Result.getNode(), 1)};
25126 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
25127
25128 // Return { result, isValid, chain }.
25129 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
25130 SDValue(Result.getNode(), 2));
25131 }
25132 case GATHER_AVX2: {
25133 SDValue Chain = Op.getOperand(0);
25134 SDValue Src = Op.getOperand(2);
25135 SDValue Base = Op.getOperand(3);
25136 SDValue Index = Op.getOperand(4);
25137 SDValue Mask = Op.getOperand(5);
25138 SDValue Scale = Op.getOperand(6);
25139 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
25140 Scale, Chain, Subtarget);
25141 }
25142 case GATHER: {
25143 //gather(v1, mask, index, base, scale);
25144 SDValue Chain = Op.getOperand(0);
25145 SDValue Src = Op.getOperand(2);
25146 SDValue Base = Op.getOperand(3);
25147 SDValue Index = Op.getOperand(4);
25148 SDValue Mask = Op.getOperand(5);
25149 SDValue Scale = Op.getOperand(6);
25150 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
25151 Chain, Subtarget);
25152 }
25153 case SCATTER: {
25154 //scatter(base, mask, index, v1, scale);
25155 SDValue Chain = Op.getOperand(0);
25156 SDValue Base = Op.getOperand(2);
25157 SDValue Mask = Op.getOperand(3);
25158 SDValue Index = Op.getOperand(4);
25159 SDValue Src = Op.getOperand(5);
25160 SDValue Scale = Op.getOperand(6);
25161 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
25162 Scale, Chain, Subtarget);
25163 }
25164 case PREFETCH: {
25165 const APInt &HintVal = Op.getConstantOperandAPInt(6);
25166 assert((HintVal == 2 || HintVal == 3) &&(((HintVal == 2 || HintVal == 3) && "Wrong prefetch hint in intrinsic: should be 2 or 3"
) ? static_cast<void> (0) : __assert_fail ("(HintVal == 2 || HintVal == 3) && \"Wrong prefetch hint in intrinsic: should be 2 or 3\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25167, __PRETTY_FUNCTION__))
25167 "Wrong prefetch hint in intrinsic: should be 2 or 3")(((HintVal == 2 || HintVal == 3) && "Wrong prefetch hint in intrinsic: should be 2 or 3"
) ? static_cast<void> (0) : __assert_fail ("(HintVal == 2 || HintVal == 3) && \"Wrong prefetch hint in intrinsic: should be 2 or 3\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25167, __PRETTY_FUNCTION__))
;
25168 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
25169 SDValue Chain = Op.getOperand(0);
25170 SDValue Mask = Op.getOperand(2);
25171 SDValue Index = Op.getOperand(3);
25172 SDValue Base = Op.getOperand(4);
25173 SDValue Scale = Op.getOperand(5);
25174 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
25175 Subtarget);
25176 }
25177 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
25178 case RDTSC: {
25179 SmallVector<SDValue, 2> Results;
25180 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
25181 Results);
25182 return DAG.getMergeValues(Results, dl);
25183 }
25184 // Read Performance Monitoring Counters.
25185 case RDPMC:
25186 // GetExtended Control Register.
25187 case XGETBV: {
25188 SmallVector<SDValue, 2> Results;
25189
25190 // RDPMC uses ECX to select the index of the performance counter to read.
25191 // XGETBV uses ECX to select the index of the XCR register to return.
25192 // The result is stored into registers EDX:EAX.
25193 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
25194 Subtarget, Results);
25195 return DAG.getMergeValues(Results, dl);
25196 }
25197 // XTEST intrinsics.
25198 case XTEST: {
25199 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
25200 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
25201
25202 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
25203 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
25204 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
25205 Ret, SDValue(InTrans.getNode(), 1));
25206 }
25207 case TRUNCATE_TO_MEM_VI8:
25208 case TRUNCATE_TO_MEM_VI16:
25209 case TRUNCATE_TO_MEM_VI32: {
25210 SDValue Mask = Op.getOperand(4);
25211 SDValue DataToTruncate = Op.getOperand(3);
25212 SDValue Addr = Op.getOperand(2);
25213 SDValue Chain = Op.getOperand(0);
25214
25215 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
25216 assert(MemIntr && "Expected MemIntrinsicSDNode!")((MemIntr && "Expected MemIntrinsicSDNode!") ? static_cast
<void> (0) : __assert_fail ("MemIntr && \"Expected MemIntrinsicSDNode!\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25216, __PRETTY_FUNCTION__))
;
25217
25218 EVT MemVT = MemIntr->getMemoryVT();
25219
25220 uint16_t TruncationOp = IntrData->Opc0;
25221 switch (TruncationOp) {
25222 case X86ISD::VTRUNC: {
25223 if (isAllOnesConstant(Mask)) // return just a truncate store
25224 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
25225 MemIntr->getMemOperand());
25226
25227 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
25228 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25229 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
25230
25231 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
25232 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
25233 true /* truncating */);
25234 }
25235 case X86ISD::VTRUNCUS:
25236 case X86ISD::VTRUNCS: {
25237 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
25238 if (isAllOnesConstant(Mask))
25239 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
25240 MemIntr->getMemOperand(), DAG);
25241
25242 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
25243 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25244
25245 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
25246 VMask, MemVT, MemIntr->getMemOperand(), DAG);
25247 }
25248 default:
25249 llvm_unreachable("Unsupported truncstore intrinsic")::llvm::llvm_unreachable_internal("Unsupported truncstore intrinsic"
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25249)
;
25250 }
25251 }
25252 }
25253}
25254
25255SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
25256 SelectionDAG &DAG) const {
25257 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
25258 MFI.setReturnAddressIsTaken(true);
25259
25260 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
25261 return SDValue();
25262
25263 unsigned Depth = Op.getConstantOperandVal(0);
25264 SDLoc dl(Op);
25265 EVT PtrVT = getPointerTy(DAG.getDataLayout());
25266
25267 if (Depth > 0) {
25268 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
25269 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25270 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
25271 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
25272 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
25273 MachinePointerInfo());
25274 }
25275
25276 // Just load the return address.
25277 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
25278 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
25279 MachinePointerInfo());
25280}
25281
25282SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
25283 SelectionDAG &DAG) const {
25284 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
25285 return getReturnAddressFrameIndex(DAG);
25286}
25287
25288SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
25289 MachineFunction &MF = DAG.getMachineFunction();
25290 MachineFrameInfo &MFI = MF.getFrameInfo();
25291 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25292 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25293 EVT VT = Op.getValueType();
25294
25295 MFI.setFrameAddressIsTaken(true);
25296
25297 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
25298 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
25299 // is not possible to crawl up the stack without looking at the unwind codes
25300 // simultaneously.
25301 int FrameAddrIndex = FuncInfo->getFAIndex();
25302 if (!FrameAddrIndex) {
25303 // Set up a frame object for the return address.
25304 unsigned SlotSize = RegInfo->getSlotSize();
25305 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
25306 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
25307 FuncInfo->setFAIndex(FrameAddrIndex);
25308 }
25309 return DAG.getFrameIndex(FrameAddrIndex, VT);
25310 }
25311
25312 unsigned FrameReg =
25313 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
25314 SDLoc dl(Op); // FIXME probably not meaningful
25315 unsigned Depth = Op.getConstantOperandVal(0);
25316 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||((((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg
== X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"
) ? static_cast<void> (0) : __assert_fail ("((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && \"Invalid Frame Register!\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25318, __PRETTY_FUNCTION__))
25317 (FrameReg == X86::EBP && VT == MVT::i32)) &&((((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg
== X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"
) ? static_cast<void> (0) : __assert_fail ("((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && \"Invalid Frame Register!\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25318, __PRETTY_FUNCTION__))
25318 "Invalid Frame Register!")((((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg
== X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"
) ? static_cast<void> (0) : __assert_fail ("((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && \"Invalid Frame Register!\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25318, __PRETTY_FUNCTION__))
;
25319 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
25320 while (Depth--)
25321 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
25322 MachinePointerInfo());
25323 return FrameAddr;
25324}
25325
25326// FIXME? Maybe this could be a TableGen attribute on some registers and
25327// this table could be generated automatically from RegInfo.
25328Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
25329 const MachineFunction &MF) const {
25330 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25331
25332 Register Reg = StringSwitch<unsigned>(RegName)
25333 .Case("esp", X86::ESP)
25334 .Case("rsp", X86::RSP)
25335 .Case("ebp", X86::EBP)
25336 .Case("rbp", X86::RBP)
25337 .Default(0);
25338
25339 if (Reg == X86::EBP || Reg == X86::RBP) {
25340 if (!TFI.hasFP(MF))
25341 report_fatal_error("register " + StringRef(RegName) +
25342 " is allocatable: function has no frame pointer");
25343#ifndef NDEBUG
25344 else {
25345 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25346 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
25347 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&(((FrameReg == X86::EBP || FrameReg == X86::RBP) && "Invalid Frame Register!"
) ? static_cast<void> (0) : __assert_fail ("(FrameReg == X86::EBP || FrameReg == X86::RBP) && \"Invalid Frame Register!\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25348, __PRETTY_FUNCTION__))
25348 "Invalid Frame Register!")(((FrameReg == X86::EBP || FrameReg == X86::RBP) && "Invalid Frame Register!"
) ? static_cast<void> (0) : __assert_fail ("(FrameReg == X86::EBP || FrameReg == X86::RBP) && \"Invalid Frame Register!\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25348, __PRETTY_FUNCTION__))
;
25349 }
25350#endif
25351 }
25352
25353 if (Reg)
25354 return Reg;
25355
25356 report_fatal_error("Invalid register name global variable");
25357}
25358
25359SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
25360 SelectionDAG &DAG) const {
25361 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25362 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
25363}
25364
25365unsigned X86TargetLowering::getExceptionPointerRegister(
25366 const Constant *PersonalityFn) const {
25367 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
25368 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
25369
25370 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
25371}
25372
25373unsigned X86TargetLowering::getExceptionSelectorRegister(
25374 const Constant *PersonalityFn) const {
25375 // Funclet personalities don't use selectors (the runtime does the selection).
25376 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))((!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn
))) ? static_cast<void> (0) : __assert_fail ("!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))"
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25376, __PRETTY_FUNCTION__))
;
25377 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
25378}
25379
25380bool X86TargetLowering::needsFixedCatchObjects() const {
25381 return Subtarget.isTargetWin64();
25382}
25383
25384SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
25385 SDValue Chain = Op.getOperand(0);
25386 SDValue Offset = Op.getOperand(1);
25387 SDValue Handler = Op.getOperand(2);
25388 SDLoc dl (Op);
25389
25390 EVT PtrVT = getPointerTy(DAG.getDataLayout());
25391 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25392 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
25393 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||((((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg
== X86::EBP && PtrVT == MVT::i32)) && "Invalid Frame Register!"
) ? static_cast<void> (0) : __assert_fail ("((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && \"Invalid Frame Register!\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25395, __PRETTY_FUNCTION__))
25394 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&((((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg
== X86::EBP && PtrVT == MVT::i32)) && "Invalid Frame Register!"
) ? static_cast<void> (0) : __assert_fail ("((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && \"Invalid Frame Register!\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25395, __PRETTY_FUNCTION__))
25395 "Invalid Frame Register!")((((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg
== X86::EBP && PtrVT == MVT::i32)) && "Invalid Frame Register!"
) ? static_cast<void> (0) : __assert_fail ("((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && \"Invalid Frame Register!\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25395, __PRETTY_FUNCTION__))
;
25396 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
25397 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
25398
25399 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
25400 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
25401 dl));
25402 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
25403 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
25404 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
25405
25406 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
25407 DAG.getRegister(StoreAddrReg, PtrVT));
25408}
25409
25410SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
25411 SelectionDAG &DAG) const {
25412 SDLoc DL(Op);
25413 // If the subtarget is not 64bit, we may need the global base reg
25414 // after isel expand pseudo, i.e., after CGBR pass ran.
25415 // Therefore, ask for the GlobalBaseReg now, so that the pass
25416 // inserts the code for us in case we need it.
25417 // Otherwise, we will end up in a situation where we will
25418 // reference a virtual register that is not defined!
25419 if (!Subtarget.is64Bit()) {
25420 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25421 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
25422 }
25423 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
25424 DAG.getVTList(MVT::i32, MVT::Other),
25425 Op.getOperand(0), Op.getOperand(1));
25426}
25427
25428SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
25429 SelectionDAG &DAG) const {
25430 SDLoc DL(Op);
25431 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
25432 Op.getOperand(0), Op.getOperand(1));
25433}
25434
25435SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
25436 SelectionDAG &DAG) const {
25437 SDLoc DL(Op);
25438 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
25439 Op.getOperand(0));
25440}
25441
25442static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
25443 return Op.getOperand(0);
25444}
25445
25446SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
25447 SelectionDAG &DAG) const {
25448 SDValue Root = Op.getOperand(0);
25449 SDValue Trmp = Op.getOperand(1); // trampoline
25450 SDValue FPtr = Op.getOperand(2); // nested function
25451 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
25452 SDLoc dl (Op);
25453
25454 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25455 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
25456
25457 if (Subtarget.is64Bit()) {
25458 SDValue OutChains[6];
25459
25460 // Large code-model.
25461 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
25462 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
25463
25464 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
25465 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
25466
25467 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
25468
25469 // Load the pointer to the nested function into R11.
25470 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
25471 SDValue Addr = Trmp;
25472 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
25473 Addr, MachinePointerInfo(TrmpAddr));
25474
25475 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
25476 DAG.getConstant(2, dl, MVT::i64));
25477 OutChains[1] =
25478 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
25479 /* Alignment = */ 2);
25480
25481 // Load the 'nest' parameter value into R10.
25482 // R10 is specified in X86CallingConv.td
25483 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
25484 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
25485 DAG.getConstant(10, dl, MVT::i64));
25486 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
25487 Addr, MachinePointerInfo(TrmpAddr, 10));
25488
25489 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
25490 DAG.getConstant(12, dl, MVT::i64));
25491 OutChains[3] =
25492 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
25493 /* Alignment = */ 2);
25494
25495 // Jump to the nested function.
25496 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
25497 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
25498 DAG.getConstant(20, dl, MVT::i64));
25499 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
25500 Addr, MachinePointerInfo(TrmpAddr, 20));
25501
25502 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
25503 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
25504 DAG.getConstant(22, dl, MVT::i64));
25505 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
25506 Addr, MachinePointerInfo(TrmpAddr, 22));
25507
25508 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
25509 } else {
25510 const Function *Func =
25511 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
25512 CallingConv::ID CC = Func->getCallingConv();
25513 unsigned NestReg;
25514
25515 switch (CC) {
25516 default:
25517 llvm_unreachable("Unsupported calling convention")::llvm::llvm_unreachable_internal("Unsupported calling convention"
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25517)
;
25518 case CallingConv::C:
25519 case CallingConv::X86_StdCall: {
25520 // Pass 'nest' parameter in ECX.
25521 // Must be kept in sync with X86CallingConv.td
25522 NestReg = X86::ECX;
25523
25524 // Check that ECX wasn't needed by an 'inreg' parameter.
25525 FunctionType *FTy = Func->getFunctionType();
25526 const AttributeList &Attrs = Func->getAttributes();
25527
25528 if (!Attrs.isEmpty() && !Func->isVarArg()) {
25529 unsigned InRegCount = 0;
25530 unsigned Idx = 1;
25531
25532 for (FunctionType::param_iterator I = FTy->param_begin(),
25533 E = FTy->param_end(); I != E; ++I, ++Idx)
25534 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
25535 auto &DL = DAG.getDataLayout();
25536 // FIXME: should only count parameters that are lowered to integers.
25537 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
25538 }
25539
25540 if (InRegCount > 2) {
25541 report_fatal_error("Nest register in use - reduce number of inreg"
25542 " parameters!");
25543 }
25544 }
25545 break;
25546 }
25547 case CallingConv::X86_FastCall:
25548 case CallingConv::X86_ThisCall:
25549 case CallingConv::Fast:
25550 case CallingConv::Tail:
25551 // Pass 'nest' parameter in EAX.
25552 // Must be kept in sync with X86CallingConv.td
25553 NestReg = X86::EAX;
25554 break;
25555 }
25556
25557 SDValue OutChains[4];
25558 SDValue Addr, Disp;
25559
25560 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
25561 DAG.getConstant(10, dl, MVT::i32));
25562 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
25563
25564 // This is storing the opcode for MOV32ri.
25565 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
25566 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
25567 OutChains[0] =
25568 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
25569 Trmp, MachinePointerInfo(TrmpAddr));
25570
25571 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
25572 DAG.getConstant(1, dl, MVT::i32));
25573 OutChains[1] =
25574 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
25575 /* Alignment = */ 1);
25576
25577 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
25578 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
25579 DAG.getConstant(5, dl, MVT::i32));
25580 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
25581 Addr, MachinePointerInfo(TrmpAddr, 5),
25582 /* Alignment = */ 1);
25583
25584 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
25585 DAG.getConstant(6, dl, MVT::i32));
25586 OutChains[3] =
25587 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
25588 /* Alignment = */ 1);
25589
25590 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
25591 }
25592}
25593
25594SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
25595 SelectionDAG &DAG) const {
25596 /*
25597 The rounding mode is in bits 11:10 of FPSR, and has the following
25598 settings:
25599 00 Round to nearest
25600 01 Round to -inf
25601 10 Round to +inf
25602 11 Round to 0
25603
25604 FLT_ROUNDS, on the other hand, expects the following:
25605 -1 Undefined
25606 0 Round to 0
25607 1 Round to nearest
25608 2 Round to +inf
25609 3 Round to -inf
25610
25611 To perform the conversion, we use a packed lookup table of the four 2-bit
25612 values that we can index by FPSP[11:10]
25613 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
25614
25615 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
25616 */
25617
25618 MachineFunction &MF = DAG.getMachineFunction();
25619 MVT VT = Op.getSimpleValueType();
25620 SDLoc DL(Op);
25621
25622 // Save FP Control Word to stack slot
25623 int SSFI =
25624 MF.getFrameInfo().CreateStackObject(2, 2, false);
25625 SDValue StackSlot =
25626 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
25627
25628 MachineMemOperand *MMO =
25629 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
25630 MachineMemOperand::MOStore, 2, 2);
25631
25632 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
25633 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
25634 DAG.getVTList(MVT::Other),
25635 Ops, MVT::i16, MMO);
25636
25637 // Load FP Control Word from stack slot
25638 SDValue CWD =
25639 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
25640
25641 // Mask and turn the control bits into a shift for the lookup table.
25642 SDValue Shift =
25643 DAG.getNode(ISD::SRL, DL, MVT::i16,
25644 DAG.getNode(ISD::AND, DL, MVT::i16,
25645 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
25646 DAG.getConstant(9, DL, MVT::i8));
25647 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
25648
25649 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
25650 SDValue RetVal =
25651 DAG.getNode(ISD::AND, DL, MVT::i32,
25652 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
25653 DAG.getConstant(3, DL, MVT::i32));
25654
25655 return DAG.getZExtOrTrunc(RetVal, DL, VT);
25656}
25657
25658// Split an unary integer op into 2 half sized ops.
25659static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
25660 MVT VT = Op.getSimpleValueType();
25661 unsigned NumElems = VT.getVectorNumElements();
25662 unsigned SizeInBits = VT.getSizeInBits();
25663 MVT EltVT = VT.getVectorElementType();
25664 SDValue Src = Op.getOperand(0);
25665 assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&((EltVT == Src.getSimpleValueType().getVectorElementType() &&
"Src and Op should have the same element type!") ? static_cast
<void> (0) : __assert_fail ("EltVT == Src.getSimpleValueType().getVectorElementType() && \"Src and Op should have the same element type!\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25666, __PRETTY_FUNCTION__))
25666 "Src and Op should have the same element type!")((EltVT == Src.getSimpleValueType().getVectorElementType() &&
"Src and Op should have the same element type!") ? static_cast
<void> (0) : __assert_fail ("EltVT == Src.getSimpleValueType().getVectorElementType() && \"Src and Op should have the same element type!\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25666, __PRETTY_FUNCTION__))
;
25667
25668 // Extract the Lo/Hi vectors
25669 SDLoc dl(Op);
25670 SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
25671 SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
25672
25673 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
25674 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
25675 DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
25676 DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
25677}
25678
25679// Decompose 256-bit ops into smaller 128-bit ops.
25680static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
25681 assert(Op.getSimpleValueType().is256BitVector() &&((Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType
().isInteger() && "Only handle AVX 256-bit vector integer operation"
) ? static_cast<void> (0) : __assert_fail ("Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && \"Only handle AVX 256-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25683, __PRETTY_FUNCTION__))
25682 Op.getSimpleValueType().isInteger() &&((Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType
().isInteger() && "Only handle AVX 256-bit vector integer operation"
) ? static_cast<void> (0) : __assert_fail ("Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && \"Only handle AVX 256-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25683, __PRETTY_FUNCTION__))
25683 "Only handle AVX 256-bit vector integer operation")((Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType
().isInteger() && "Only handle AVX 256-bit vector integer operation"
) ? static_cast<void> (0) : __assert_fail ("Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && \"Only handle AVX 256-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25683, __PRETTY_FUNCTION__))
;
25684 return LowerVectorIntUnary(Op, DAG);
25685}
25686
25687// Decompose 512-bit ops into smaller 256-bit ops.
25688static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
25689 assert(Op.getSimpleValueType().is512BitVector() &&((Op.getSimpleValueType().is512BitVector() && Op.getSimpleValueType
().isInteger() && "Only handle AVX 512-bit vector integer operation"
) ? static_cast<void> (0) : __assert_fail ("Op.getSimpleValueType().is512BitVector() && Op.getSimpleValueType().isInteger() && \"Only handle AVX 512-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25691, __PRETTY_FUNCTION__))
25690 Op.getSimpleValueType().isInteger() &&((Op.getSimpleValueType().is512BitVector() && Op.getSimpleValueType
().isInteger() && "Only handle AVX 512-bit vector integer operation"
) ? static_cast<void> (0) : __assert_fail ("Op.getSimpleValueType().is512BitVector() && Op.getSimpleValueType().isInteger() && \"Only handle AVX 512-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25691, __PRETTY_FUNCTION__))
25691 "Only handle AVX 512-bit vector integer operation")((Op.getSimpleValueType().is512BitVector() && Op.getSimpleValueType
().isInteger() && "Only handle AVX 512-bit vector integer operation"
) ? static_cast<void> (0) : __assert_fail ("Op.getSimpleValueType().is512BitVector() && Op.getSimpleValueType().isInteger() && \"Only handle AVX 512-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25691, __PRETTY_FUNCTION__))
;
25692 return LowerVectorIntUnary(Op, DAG);
25693}
25694
25695/// Lower a vector CTLZ using native supported vector CTLZ instruction.
25696//
25697// i8/i16 vector implemented using dword LZCNT vector instruction
25698// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
25699// split the vector, perform operation on it's Lo a Hi part and
25700// concatenate the results.
25701static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
25702 const X86Subtarget &Subtarget) {
25703 assert(Op.getOpcode() == ISD::CTLZ)((Op.getOpcode() == ISD::CTLZ) ? static_cast<void> (0) :
__assert_fail ("Op.getOpcode() == ISD::CTLZ", "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25703, __PRETTY_FUNCTION__))
;
25704 SDLoc dl(Op);
25705 MVT VT = Op.getSimpleValueType();
25706 MVT EltVT = VT.getVectorElementType();
25707 unsigned NumElems = VT.getVectorNumElements();
25708
25709 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&(((EltVT == MVT::i8 || EltVT == MVT::i16) && "Unsupported element type"
) ? static_cast<void> (0) : __assert_fail ("(EltVT == MVT::i8 || EltVT == MVT::i16) && \"Unsupported element type\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25710, __PRETTY_FUNCTION__))
25710 "Unsupported element type")(((EltVT == MVT::i8 || EltVT == MVT::i16) && "Unsupported element type"
) ? static_cast<void> (0) : __assert_fail ("(EltVT == MVT::i8 || EltVT == MVT::i16) && \"Unsupported element type\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25710, __PRETTY_FUNCTION__))
;
25711
25712 // Split vector, it's Lo and Hi parts will be handled in next iteration.
25713 if (NumElems > 16 ||
25714 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
25715 return LowerVectorIntUnary(Op, DAG);
25716
25717 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
25718 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&(((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
"Unsupported value type for operation") ? static_cast<void
> (0) : __assert_fail ("(NewVT.is256BitVector() || NewVT.is512BitVector()) && \"Unsupported value type for operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25719, __PRETTY_FUNCTION__))
25719 "Unsupported value type for operation")(((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
"Unsupported value type for operation") ? static_cast<void
> (0) : __assert_fail ("(NewVT.is256BitVector() || NewVT.is512BitVector()) && \"Unsupported value type for operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25719, __PRETTY_FUNCTION__))
;
25720
25721 // Use native supported vector instruction vplzcntd.
25722 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
25723 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
25724 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
25725 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
25726
25727 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
25728}
25729
25730// Lower CTLZ using a PSHUFB lookup table implementation.
25731static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
25732 const X86Subtarget &Subtarget,
25733 SelectionDAG &DAG) {
25734 MVT VT = Op.getSimpleValueType();
25735 int NumElts = VT.getVectorNumElements();
25736 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
25737 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
25738
25739 // Per-nibble leading zero PSHUFB lookup table.
25740 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
25741 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
25742 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
25743 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
25744
25745 SmallVector<SDValue, 64> LUTVec;
25746 for (int i = 0; i < NumBytes; ++i)
25747 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
25748 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
25749
25750 // Begin by bitcasting the input to byte vector, then split those bytes
25751 // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.
25752 // If the hi input nibble is zero then we add both results together, otherwise
25753 // we just take the hi result (by masking the lo result to zero before the
25754 // add).
25755 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
25756 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
25757
25758 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
25759 SDValue Lo = Op0;
25760 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
25761 SDValue HiZ;
25762 if (CurrVT.is512BitVector()) {
25763 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
25764 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
25765 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
25766 } else {
25767 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
25768 }
25769
25770 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
25771 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
25772 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
25773 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
25774
25775 // Merge result back from vXi8 back to VT, working on the lo/hi halves
25776 // of the current vector width in the same way we did for the nibbles.
25777 // If the upper half of the input element is zero then add the halves'
25778 // leading zero counts together, otherwise just use the upper half's.
25779 // Double the width of the result until we are at target width.
25780 while (CurrVT != VT) {
25781 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
25782 int CurrNumElts = CurrVT.getVectorNumElements();
25783 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
25784 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
25785 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
25786
25787 // Check if the upper half of the input element is zero.
25788 if (CurrVT.is512BitVector()) {
25789 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
25790 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
25791 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
25792 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
25793 } else {
25794 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
25795 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
25796 }
25797 HiZ = DAG.getBitcast(NextVT, HiZ);
25798
25799 // Move the upper/lower halves to the lower bits as we'll be extending to
25800 // NextVT. Mask the lower result to zero if HiZ is true and add the results
25801 // together.
25802 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
25803 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
25804 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
25805 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
25806 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
25807 CurrVT = NextVT;
25808 }
25809
25810 return Res;
25811}
25812
25813static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
25814 const X86Subtarget &Subtarget,
25815 SelectionDAG &DAG) {
25816 MVT VT = Op.getSimpleValueType();
25817
25818 if (Subtarget.hasCDI() &&
25819 // vXi8 vectors need to be promoted to 512-bits for vXi32.
25820 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
25821 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
25822
25823 // Decompose 256-bit ops into smaller 128-bit ops.
25824 if (VT.is256BitVector() && !Subtarget.hasInt256())
25825 return Lower256IntUnary(Op, DAG);
25826
25827 // Decompose 512-bit ops into smaller 256-bit ops.
25828 if (VT.is512BitVector() && !Subtarget.hasBWI())
25829 return Lower512IntUnary(Op, DAG);
25830
25831 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB")((Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasSSSE3() && \"Expected SSSE3 support for PSHUFB\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25831, __PRETTY_FUNCTION__))
;
25832 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
25833}
25834
25835static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
25836 SelectionDAG &DAG) {
25837 MVT VT = Op.getSimpleValueType();
25838 MVT OpVT = VT;
25839 unsigned NumBits = VT.getSizeInBits();
25840 SDLoc dl(Op);
25841 unsigned Opc = Op.getOpcode();
25842
25843 if (VT.isVector())
25844 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
25845
25846 Op = Op.getOperand(0);
25847 if (VT == MVT::i8) {
25848 // Zero extend to i32 since there is not an i8 bsr.
25849 OpVT = MVT::i32;
25850 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
25851 }
25852
25853 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
25854 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
25855 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
25856
25857 if (Opc == ISD::CTLZ) {
25858 // If src is zero (i.e. bsr sets ZF), returns NumBits.
25859 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
25860 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
25861 Op.getValue(1)};
25862 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
25863 }
25864
25865 // Finally xor with NumBits-1.
25866 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
25867 DAG.getConstant(NumBits - 1, dl, OpVT));
25868
25869 if (VT == MVT::i8)
25870 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
25871 return Op;
25872}
25873
25874static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
25875 SelectionDAG &DAG) {
25876 MVT VT = Op.getSimpleValueType();
25877 unsigned NumBits = VT.getScalarSizeInBits();
25878 SDValue N0 = Op.getOperand(0);
25879 SDLoc dl(Op);
25880
25881 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&((!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
"Only scalar CTTZ requires custom lowering") ? static_cast<
void> (0) : __assert_fail ("!VT.isVector() && Op.getOpcode() == ISD::CTTZ && \"Only scalar CTTZ requires custom lowering\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25882, __PRETTY_FUNCTION__))
25882 "Only scalar CTTZ requires custom lowering")((!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
"Only scalar CTTZ requires custom lowering") ? static_cast<
void> (0) : __assert_fail ("!VT.isVector() && Op.getOpcode() == ISD::CTTZ && \"Only scalar CTTZ requires custom lowering\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25882, __PRETTY_FUNCTION__))
;
25883
25884 // Issue a bsf (scan bits forward) which also sets EFLAGS.
25885 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
25886 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
25887
25888 // If src is zero (i.e. bsf sets ZF), returns NumBits.
25889 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
25890 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
25891 Op.getValue(1)};
25892 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
25893}
25894
25895/// Break a 256-bit integer operation into two new 128-bit ones and then
25896/// concatenate the result back.
25897static SDValue split256IntArith(SDValue Op, SelectionDAG &DAG) {
25898 MVT VT = Op.getSimpleValueType();
25899
25900 assert(VT.is256BitVector() && VT.isInteger() &&((VT.is256BitVector() && VT.isInteger() && "Unsupported value type for operation"
) ? static_cast<void> (0) : __assert_fail ("VT.is256BitVector() && VT.isInteger() && \"Unsupported value type for operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25901, __PRETTY_FUNCTION__))
25901 "Unsupported value type for operation")((VT.is256BitVector() && VT.isInteger() && "Unsupported value type for operation"
) ? static_cast<void> (0) : __assert_fail ("VT.is256BitVector() && VT.isInteger() && \"Unsupported value type for operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25901, __PRETTY_FUNCTION__))
;
25902
25903 unsigned NumElems = VT.getVectorNumElements();
25904 SDLoc dl(Op);
25905
25906 // Extract the LHS vectors
25907 SDValue LHS = Op.getOperand(0);
25908 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
25909 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
25910
25911 // Extract the RHS vectors
25912 SDValue RHS = Op.getOperand(1);
25913 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
25914 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
25915
25916 MVT EltVT = VT.getVectorElementType();
25917 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
25918
25919 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
25920 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
25921 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
25922}
25923
25924/// Break a 512-bit integer operation into two new 256-bit ones and then
25925/// concatenate the result back.
25926static SDValue split512IntArith(SDValue Op, SelectionDAG &DAG) {
25927 MVT VT = Op.getSimpleValueType();
25928
25929 assert(VT.is512BitVector() && VT.isInteger() &&((VT.is512BitVector() && VT.isInteger() && "Unsupported value type for operation"
) ? static_cast<void> (0) : __assert_fail ("VT.is512BitVector() && VT.isInteger() && \"Unsupported value type for operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25930, __PRETTY_FUNCTION__))
25930 "Unsupported value type for operation")((VT.is512BitVector() && VT.isInteger() && "Unsupported value type for operation"
) ? static_cast<void> (0) : __assert_fail ("VT.is512BitVector() && VT.isInteger() && \"Unsupported value type for operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25930, __PRETTY_FUNCTION__))
;
25931
25932 unsigned NumElems = VT.getVectorNumElements();
25933 SDLoc dl(Op);
25934
25935 // Extract the LHS vectors
25936 SDValue LHS = Op.getOperand(0);
25937 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
25938 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
25939
25940 // Extract the RHS vectors
25941 SDValue RHS = Op.getOperand(1);
25942 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
25943 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
25944
25945 MVT EltVT = VT.getVectorElementType();
25946 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
25947
25948 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
25949 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
25950 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
25951}
25952
25953static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
25954 const X86Subtarget &Subtarget) {
25955 MVT VT = Op.getSimpleValueType();
25956 if (VT == MVT::i16 || VT == MVT::i32)
25957 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
25958
25959 if (VT.getScalarType() == MVT::i1)
25960 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
25961 Op.getOperand(0), Op.getOperand(1));
25962
25963 assert(Op.getSimpleValueType().is256BitVector() &&((Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType
().isInteger() && "Only handle AVX 256-bit vector integer operation"
) ? static_cast<void> (0) : __assert_fail ("Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && \"Only handle AVX 256-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25965, __PRETTY_FUNCTION__))
25964 Op.getSimpleValueType().isInteger() &&((Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType
().isInteger() && "Only handle AVX 256-bit vector integer operation"
) ? static_cast<void> (0) : __assert_fail ("Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && \"Only handle AVX 256-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25965, __PRETTY_FUNCTION__))
25965 "Only handle AVX 256-bit vector integer operation")((Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType
().isInteger() && "Only handle AVX 256-bit vector integer operation"
) ? static_cast<void> (0) : __assert_fail ("Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && \"Only handle AVX 256-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25965, __PRETTY_FUNCTION__))
;
25966 return split256IntArith(Op, DAG);
25967}
25968
25969static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
25970 const X86Subtarget &Subtarget) {
25971 MVT VT = Op.getSimpleValueType();
25972 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
25973 unsigned Opcode = Op.getOpcode();
25974 if (VT.getScalarType() == MVT::i1) {
25975 SDLoc dl(Op);
25976 switch (Opcode) {
25977 default: llvm_unreachable("Expected saturated arithmetic opcode")::llvm::llvm_unreachable_internal("Expected saturated arithmetic opcode"
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25977)
;
25978 case ISD::UADDSAT:
25979 case ISD::SADDSAT:
25980 // *addsat i1 X, Y --> X | Y
25981 return DAG.getNode(ISD::OR, dl, VT, X, Y);
25982 case ISD::USUBSAT:
25983 case ISD::SSUBSAT:
25984 // *subsat i1 X, Y --> X & ~Y
25985 return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT));
25986 }
25987 }
25988
25989 if (VT.is128BitVector()) {
25990 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
25991 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25992 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
25993 *DAG.getContext(), VT);
25994 SDLoc DL(Op);
25995 if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) {
25996 // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
25997 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y);
25998 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT);
25999 return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add);
26000 }
26001 if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
26002 // usubsat X, Y --> (X >u Y) ? X - Y : 0
26003 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
26004 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
26005 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
26006 }
26007 // Use default expansion.
26008 return SDValue();
26009 }
26010
26011 assert(Op.getSimpleValueType().is256BitVector() &&((Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType
().isInteger() && "Only handle AVX 256-bit vector integer operation"
) ? static_cast<void> (0) : __assert_fail ("Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && \"Only handle AVX 256-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 26013, __PRETTY_FUNCTION__))
26012 Op.getSimpleValueType().isInteger() &&((Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType
().isInteger() && "Only handle AVX 256-bit vector integer operation"
) ? static_cast<void> (0) : __assert_fail ("Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && \"Only handle AVX 256-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 26013, __PRETTY_FUNCTION__))
26013 "Only handle AVX 256-bit vector integer operation")((Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType
().isInteger() && "Only handle AVX 256-bit vector integer operation"
) ? static_cast<void> (0) : __assert_fail ("Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && \"Only handle AVX 256-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 26013, __PRETTY_FUNCTION__))
;
26014 return split256IntArith(Op, DAG);
26015}
26016
26017static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
26018 SelectionDAG &DAG) {
26019 MVT VT = Op.getSimpleValueType();
26020 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
26021 // Since X86 does not have CMOV for 8-bit integer, we don't convert
26022 // 8-bit integer abs to NEG and CMOV.
26023 SDLoc DL(Op);
26024 SDValue N0 = Op.getOperand(0);
26025 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
26026 DAG.getConstant(0, DL, VT), N0);
26027 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
26028 SDValue(Neg.getNode(), 1)};
26029 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
26030 }
26031
26032 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
26033 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
26034 SDLoc DL(Op);
26035 SDValue Src = Op.getOperand(0);
26036 SDValue Sub =
26037 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
26038 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
26039 }
26040
26041 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
26042 assert(VT.isInteger() &&((VT.isInteger() && "Only handle AVX 256-bit vector integer operation"
) ? static_cast<void> (0) : __assert_fail ("VT.isInteger() && \"Only handle AVX 256-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 26043, __PRETTY_FUNCTION__))
26043 "Only handle AVX 256-bit vector integer operation")((VT.isInteger() && "Only handle AVX 256-bit vector integer operation"
) ? static_cast<void> (0) : __assert_fail ("VT.isInteger() && \"Only handle AVX 256-bit vector integer operation\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 26043, __PRETTY_FUNCTION__))
;
26044 return Lower256IntUnary(Op, DAG);
26045 }
26046
26047 // Default to expand.
26048 return SDValue();
26049}
26050
26051static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
26052 MVT VT = Op.getSimpleValueType();
26053
26054 // For AVX1 cases, split to use legal ops (everything but v4i64).
26055 if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
26056 return split256IntArith(Op, DAG);
26057
26058 SDLoc DL(Op);
26059 unsigned Opcode = Op.getOpcode();
26060 SDValue N0 = Op.getOperand(0);
26061 SDValue N1 = Op.getOperand(1);
26062
26063 // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
26064 // using the SMIN/SMAX instructions and flipping the signbit back.
26065 if (VT == MVT::v8i16) {
26066 assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&(((Opcode == ISD::UMIN || Opcode == ISD::UMAX) && "Unexpected MIN/MAX opcode"
) ? static_cast<void> (0) : __assert_fail ("(Opcode == ISD::UMIN || Opcode == ISD::UMAX) && \"Unexpected MIN/MAX opcode\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 26067, __PRETTY_FUNCTION__))
26067 "Unexpected MIN/MAX opcode")(((Opcode == ISD::UMIN || Opcode == ISD::UMAX) && "Unexpected MIN/MAX opcode"
) ? static_cast<void> (0) : __assert_fail ("(Opcode == ISD::UMIN || Opcode == ISD::UMAX) && \"Unexpected MIN/MAX opcode\""
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 26067, __PRETTY_FUNCTION__))
;
26068 SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
26069 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
26070 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
26071 Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
26072 SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
26073 return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
26074 }
26075
26076 // Else, expand to a compare/select.
26077 ISD::CondCode CC;
26078 switch (Opcode) {
26079 case ISD::SMIN: CC = ISD::CondCode::SETLT; break;
26080 case ISD::SMAX: CC = ISD::CondCode::SETGT; break;
26081 case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
26082 case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
26083 default: llvm_unreachable("Unknown MINMAX opcode")::llvm::llvm_unreachable_internal("Unknown MINMAX opcode", "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 26083)
;
26084 }
26085
26086 SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
26087 return DAG.getSelect(DL, VT, Cond, N0, N1);
26088}
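Editor's note: the pre-SSE41 UMIN/UMAX path above relies on the identity umin(a, b) == smin(a ^ 0x8000, b ^ 0x8000) ^ 0x8000 for 16-bit lanes (and likewise for UMAX/SMAX). The following standalone C++ sketch is not part of the LLVM source; it only checks the scalar form of that identity, and the function name is illustrative.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Scalar model of the vector trick: flip the sign bit, take the signed min,
// then flip the sign bit back. The result is the unsigned min.
static uint16_t umin_via_smin(uint16_t a, uint16_t b) {
  int16_t sa = static_cast<int16_t>(a ^ 0x8000u);
  int16_t sb = static_cast<int16_t>(b ^ 0x8000u);
  return static_cast<uint16_t>(std::min(sa, sb)) ^ 0x8000u;
}

int main() {
  // Spot-check values on both sides of the sign bit.
  const uint16_t vals[] = {0u, 1u, 0x7fffu, 0x8000u, 0xfffeu, 0xffffu};
  for (uint16_t a : vals)
    for (uint16_t b : vals)
      assert(umin_via_smin(a, b) == std::min(a, b));
  return 0;
}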
26089
26090static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
26091 SelectionDAG &DAG) {
26092 SDLoc dl(Op);
26093 MVT VT = Op.getSimpleValueType();
26094
26095 if (VT.getScalarType() == MVT::i1)
26096 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
26097
26098 // Decompose 256-bit ops into 128-bit ops.
26099 if (VT.is256BitVector() && !Subtarget.hasInt256())
26100 return split256IntArith(Op, DAG);
26101
26102 SDValue A = Op.getOperand(0);
26103 SDValue B = Op.getOperand(1);
26104
26105 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
26106 // vector pairs, multiply and truncate.
26107 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
26108 unsigned NumElts = VT.getVectorNumElements();
26109
26110 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
26111 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
26112 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
26113 return DAG.getNode(
26114 ISD::TRUNCATE, dl, VT,
26115 DAG.getNode(ISD::MUL, dl, ExVT,
26116 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
26117 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
26118 }
26119
26120 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
26121
26122    // Extract the lo/hi parts and any-extend to i16.
26123    // We're going to mask each result element of the pmullw down to its low
26124    // byte, so it doesn't matter what's in the high byte of each 16-bit
26125    // element.
26126 SDValue Undef = DAG.getUNDEF(VT);
26127 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
26128 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
26129
26130 SDValue BLo, BHi;
26131 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
26132      // If the RHS is a constant, manually unpackl/unpackh.
26133 SmallVector<SDValue, 16> LoOps, HiOps;
26134 for (unsigned i = 0; i != NumElts; i += 16) {
26135 for (unsigned j = 0; j != 8; ++j) {
26136 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
26137 MVT::i16));
26138 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
26139 MVT::i16));
26140 }
26141 }
26142
26143 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
26144 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
26145 } else {
26146 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
26147 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
26148 }
26149
26150 // Multiply, mask the lower 8bits of the lo/hi results and pack.
26151 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
26152 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
26153 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
26154 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
26155 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
26156 }
26157
26158 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
26159 if (VT == MVT::v4i32) {
26160    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
26161           "Should not custom lower when pmulld is available!");
26162
26163 // Extract the odd parts.
26164 static const int UnpackMask[] = { 1, -1, 3, -1 };
26165 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
26166 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
26167
26168 // Multiply the even parts.
26169 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
26170 DAG.getBitcast(MVT::v2i64, A),
26171 DAG.getBitcast(MVT::v2i64, B));
26172 // Now multiply odd parts.
26173 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
26174 DAG.getBitcast(MVT::v2i64, Aodds),
26175 DAG.getBitcast(MVT::v2i64, Bodds));
26176
26177 Evens = DAG.getBitcast(VT, Evens);
26178 Odds = DAG.getBitcast(VT, Odds);
26179
26180 // Merge the two vectors back together with a shuffle. This expands into 2
26181 // shuffles.
26182 static const int ShufMask[] = { 0, 4, 2, 6 };
26183 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
26184 }
26185
26186  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
26187         "Only know how to lower V2I64/V4I64/V8I64 multiply");
26188  assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
26189
26190 // Ahi = psrlqi(a, 32);
26191 // Bhi = psrlqi(b, 32);
26192 //
26193 // AloBlo = pmuludq(a, b);
26194 // AloBhi = pmuludq(a, Bhi);
26195 // AhiBlo = pmuludq(Ahi, b);
26196 //
26197 // Hi = psllqi(AloBhi + AhiBlo, 32);
26198 // return AloBlo + Hi;
26199 KnownBits AKnown = DAG.computeKnownBits(A);
26200 KnownBits BKnown = DAG.computeKnownBits(B);
26201
26202 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
26203 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
26204 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
26205
26206 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
26207 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
26208 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
26209
26210 SDValue Zero = DAG.getConstant(0, dl, VT);
26211
26212 // Only multiply lo/hi halves that aren't known to be zero.
26213 SDValue AloBlo = Zero;
26214 if (!ALoIsZero && !BLoIsZero)
26215 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
26216
26217 SDValue AloBhi = Zero;
26218 if (!ALoIsZero && !BHiIsZero) {
26219 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
26220 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
26221 }
26222
26223 SDValue AhiBlo = Zero;
26224 if (!AHiIsZero && !BLoIsZero) {
26225 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
26226 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
26227 }
26228
26229 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
26230 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
26231
26232 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
26233}
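Editor's note: the v2i64/v4i64/v8i64 path above assembles a 64-bit multiply from 32x32->64 PMULUDQ partial products, as sketched in the pseudo-code comment (AloBlo + ((AloBhi + AhiBlo) << 32)). The scalar C++ sketch below is not part of the LLVM source; it just verifies that decomposition, and the names are illustrative.

#include <cassert>
#include <cstdint>

// Scalar model of the PMULUDQ-based lowering: the low 64 bits of a*b are
// alo*blo + ((alo*bhi + ahi*blo) << 32); the ahi*bhi term only contributes
// to bits >= 64 and is dropped.
static uint64_t mul64_from_32bit_halves(uint64_t a, uint64_t b) {
  uint64_t alo = a & 0xffffffffu, ahi = a >> 32;
  uint64_t blo = b & 0xffffffffu, bhi = b >> 32;
  uint64_t alo_blo = alo * blo;   // pmuludq(a, b)
  uint64_t alo_bhi = alo * bhi;   // pmuludq(a, b >> 32)
  uint64_t ahi_blo = ahi * blo;   // pmuludq(a >> 32, b)
  return alo_blo + ((alo_bhi + ahi_blo) << 32);
}

int main() {
  const uint64_t vals[] = {0, 1, 0xdeadbeefULL, 0xffffffffULL,
                           0x123456789abcdef0ULL, ~0ULL};
  for (uint64_t a : vals)
    for (uint64_t b : vals)
      assert(mul64_from_32bit_halves(a, b) == a * b);
  return 0;
}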
26234
26235static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
26236 SelectionDAG &DAG) {
26237 SDLoc dl(Op);
26238 MVT VT = Op.getSimpleValueType();
26239 bool IsSigned = Op->getOpcode() == ISD::MULHS;
26240 unsigned NumElts = VT.getVectorNumElements();
26241 SDValue A = Op.getOperand(0);
26242 SDValue B = Op.getOperand(1);
26243
26244 // Decompose 256-bit ops into 128-bit ops.
26245 if (VT.is256BitVector() && !Subtarget.hasInt256())
26246 return split256IntArith(Op, DAG);
26247
26248 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
26249    assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
26250           (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
26251           (VT == MVT::v16i32 && Subtarget.hasAVX512()));
26252
26253 // PMULxD operations multiply each even value (starting at 0) of LHS with
26254    // the related value of RHS and produce a widened result.
26255 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
26256 // => <2 x i64> <ae|cg>
26257 //
26258    // In other words, to have all the results, we need to perform two PMULxD:
26259 // 1. one with the even values.
26260 // 2. one with the odd values.
26261    // To achieve #2, we need to place the odd values at an even position.
26262 //
26263 // Place the odd value at an even position (basically, shift all values 1
26264 // step to the left):
26265 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
26266 9, -1, 11, -1, 13, -1, 15, -1};
26267 // <a|b|c|d> => <b|undef|d|undef>
26268 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
26269 makeArrayRef(&Mask[0], NumElts));
26270 // <e|f|g|h> => <f|undef|h|undef>
26271 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
26272 makeArrayRef(&Mask[0], NumElts));
26273
26274 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
26275 // ints.
26276 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
26277 unsigned Opcode =
26278 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
26279 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
26280 // => <2 x i64> <ae|cg>
26281 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
26282 DAG.getBitcast(MulVT, A),
26283 DAG.getBitcast(MulVT, B)));
26284 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
26285 // => <2 x i64> <bf|dh>
26286 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
26287 DAG.getBitcast(MulVT, Odd0),
26288 DAG.getBitcast(MulVT, Odd1)));
26289
26290 // Shuffle it back into the right order.
26291 SmallVector<int, 16> ShufMask(NumElts);
26292 for (int i = 0; i != (int)NumElts; ++i)
26293 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
26294
26295 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
26296
26297    // If we have a signed multiply but no PMULDQ, fix up the result of an
26298 // unsigned multiply.
26299 if (IsSigned && !Subtarget.hasSSE41()) {
26300 SDValue Zero = DAG.getConstant(0, dl, VT);
26301 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
26302 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
26303 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
26304 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
26305
26306 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
26307 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
26308 }
26309
26310 return Res;
26311 }
26312
26313 // Only i8 vectors should need custom lowering after this.
26314  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
26315          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
26316         "Unsupported vector type");
26317
26318 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
26319 // logical shift down the upper half and pack back to i8.
26320
26321 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
26322 // and then ashr/lshr the upper bits down to the lower bits before multiply.
26323 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
26324
26325 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
26326 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
26327 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
26328 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
26329 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
26330 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
26331 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
26332 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
26333 }
26334
26335 // For signed 512-bit vectors, split into 256-bit vectors to allow the
26336 // sign-extension to occur.
26337 if (VT == MVT::v64i8 && IsSigned)
26338 return split512IntArith(Op, DAG);
26339
26340 // Signed AVX2 implementation - extend xmm subvectors to ymm.
26341 if (VT == MVT::v32i8 && IsSigned) {
26342 MVT ExVT = MVT::v16i16;
26343 SDValue ALo = extract128BitVector(A, 0, DAG, dl);
26344 SDValue BLo = extract128BitVector(B, 0, DAG, dl);
26345 SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl);
26346 SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl);
26347 ALo = DAG.getNode(ExAVX, dl, ExVT, ALo);
26348 BLo = DAG.getNode(ExAVX, dl, ExVT, BLo);
26349 AHi = DAG.getNode(ExAVX, dl, ExVT, AHi);
26350 BHi = DAG.getNode(ExAVX, dl, ExVT, BHi);
26351 SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
26352 SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
26353 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
26354 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);
26355
26356 // Bitcast back to VT and then pack all the even elements from Lo and Hi.
26357 // Shuffle lowering should turn this into PACKUS+PERMQ
26358 Lo = DAG.getBitcast(VT, Lo);
26359 Hi = DAG.getBitcast(VT, Hi);
26360 return DAG.getVectorShuffle(VT, dl, Lo, Hi,
26361 { 0, 2, 4, 6, 8, 10, 12, 14,
26362 16, 18, 20, 22, 24, 26, 28, 30,
26363 32, 34, 36, 38, 40, 42, 44, 46,
26364 48, 50, 52, 54, 56, 58, 60, 62});
26365 }
26366
26367 // For signed v16i8 and all unsigned vXi8 we will unpack the low and high
26368 // half of each 128 bit lane to widen to a vXi16 type. Do the multiplies,
26369 // shift the results and pack the half lane results back together.
26370
26371 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
26372
26373 static const int PSHUFDMask[] = { 8, 9, 10, 11, 12, 13, 14, 15,
26374 -1, -1, -1, -1, -1, -1, -1, -1};
26375
26376 // Extract the lo parts and zero/sign extend to i16.
26377 // Only use SSE4.1 instructions for signed v16i8 where using unpack requires
26378 // shifts to sign extend. Using unpack for unsigned only requires an xor to
26379  // create zeros and a copy due to tied register constraints pre-AVX. But using
26380 // zero_extend_vector_inreg would require an additional pshufd for the high
26381 // part.
26382
26383 SDValue ALo, AHi;
26384 if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
26385 ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A);
26386
26387 AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask);
26388 AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi);
26389 } else if (IsSigned) {
26390 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A));
26391 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A));
26392
26393 ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG);
26394 AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG);
26395 } else {
26396 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A,
26397 DAG.getConstant(0, dl, VT)));
26398 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A,
26399 DAG.getConstant(0, dl, VT)));
26400 }
26401
26402 SDValue BLo, BHi;
26403 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
26404    // If the RHS is a constant, manually unpackl/unpackh and extend.
26405 SmallVector<SDValue, 16> LoOps, HiOps;
26406 for (unsigned i = 0; i != NumElts; i += 16) {
26407 for (unsigned j = 0; j != 8; ++j) {
26408 SDValue LoOp = B.getOperand(i + j);
26409 SDValue HiOp = B.getOperand(i + j + 8);
26410
26411 if (IsSigned) {
26412 LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16);
26413 HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16);
26414 } else {
26415 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
26416 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
26417 }
26418
26419 LoOps.push_back(LoOp);
26420 HiOps.push_back(HiOp);
26421 }
26422 }
26423
26424 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
26425 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
26426 } else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
26427 BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B);
26428
26429 BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask);
26430 BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi);
26431 } else if (IsSigned) {
26432 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B));
26433 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B));
26434
26435 BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG);
26436 BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG);
26437 } else {
26438 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B,
26439 DAG.getConstant(0, dl, VT)));
26440 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B,
26441 DAG.getConstant(0, dl, VT)));
26442 }
26443
26444 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
26445 // pack back to vXi8.
26446 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
26447 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
26448 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
26449 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
26450
26451 // Bitcast back to VT and then pack all the even elements from Lo and Hi.
26452 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
26453}
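Editor's note: the IsSigned && !hasSSE41() fixup above uses the identity that the signed high half equals the unsigned high half minus b when a is negative and minus a when b is negative (all modulo 2^32). The standalone sketch below is not part of the LLVM source; it checks the scalar 32-bit form of that correction, and the names are illustrative.

#include <cassert>
#include <cstdint>

// Scalar model of the PMULDQ-less fixup: compute the unsigned high half,
// then subtract b if a < 0 and a if b < 0 (wrapping mod 2^32).
static uint32_t mulhs_via_mulhu(int32_t a, int32_t b) {
  uint32_t ua = static_cast<uint32_t>(a), ub = static_cast<uint32_t>(b);
  uint32_t hu = static_cast<uint32_t>((static_cast<uint64_t>(ua) * ub) >> 32);
  uint32_t fixup = (a < 0 ? ub : 0u) + (b < 0 ? ua : 0u);
  return hu - fixup;
}

int main() {
  const int32_t vals[] = {0, 1, -1, 123456789, -123456789,
                          INT32_MAX, INT32_MIN};
  for (int32_t a : vals)
    for (int32_t b : vals) {
      // Reference: bits 63..32 of the two's-complement product.
      uint64_t prod = static_cast<uint64_t>(static_cast<int64_t>(a) * b);
      assert(mulhs_via_mulhu(a, b) == static_cast<uint32_t>(prod >> 32));
    }
  return 0;
}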
26454
26455SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
26456  assert(Subtarget.isTargetWin64() && "Unexpected target");
26457 EVT VT = Op.getValueType();
26458  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
26459         "Unexpected return type for lowering");
26460
26461 RTLIB::Libcall LC;
26462 bool isSigned;
26463 switch (Op->getOpcode()) {
26464  default: llvm_unreachable("Unexpected request for libcall!");
26465 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
26466 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
26467 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
26468 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
26469 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
26470 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
26471 }
26472
26473 SDLoc dl(Op);
26474 SDValue InChain = DAG.getEntryNode();
26475
26476 TargetLowering::ArgListTy Args;
26477 TargetLowering::ArgListEntry Entry;
26478 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
26479 EVT ArgVT = Op->getOperand(i).getValueType();
26480    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
26481           "Unexpected argument type for lowering");
26482 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
26483 Entry.Node = StackPtr;
26484 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
26485 MachinePointerInfo(), /* Alignment = */ 16);
26486 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
26487 Entry.Ty = PointerType::get(ArgTy,0);
26488 Entry.IsSExt = false;
26489 Entry.IsZExt = false;
26490 Args.push_back(Entry);
26491 }
26492
26493 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
26494 getPointerTy(DAG.getDataLayout()));
26495
26496 TargetLowering::CallLoweringInfo CLI(DAG);
26497 CLI.setDebugLoc(dl)
26498 .setChain(InChain)
26499 .setLibCallee(
26500 getLibcallCallingConv(LC),
26501 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
26502 std::move(Args))
26503 .setInRegister()
26504 .setSExtResult(isSigned)
26505 .setZExtResult(!isSigned);
26506
26507 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
26508 return DAG.getBitcast(VT, CallInfo.first);
26509}
26510
26511// Return true if the required (according to Opcode) shift-imm form is natively
26512// supported by the Subtarget
26513static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
26514 unsigned Opcode) {
26515 if (VT.getScalarSizeInBits() < 16)
26516 return false;
26517
26518 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
26519 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
26520 return true;
26521
26522 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
26523 (VT.is256BitVector() && Subtarget.hasInt256());
26524
26525 bool AShift = LShift && (Subtarget.hasAVX512() ||
26526 (VT != MVT::v2i64 && VT != MVT::v4i64));
26527 return (Opcode == ISD::SRA) ? AShift : LShift;
26528}
26529
26530// The shift amount is a variable, but it is the same for all vector lanes.
26531// These instructions are defined together with shift-immediate.
26532static
26533bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
26534 unsigned Opcode) {
26535 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
26536}
26537
26538// Return true if the required (according to Opcode) variable-shift form is
26539// natively supported by the Subtarget
26540static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
26541 unsigned Opcode) {
26542
26543 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
26544 return false;
26545
26546 // vXi16 supported only on AVX-512, BWI
26547 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
26548 return false;
26549
26550 if (Subtarget.hasAVX512())
26551 return true;
26552
26553 bool LShift = VT.is128BitVector() || VT.is256BitVector();
26554 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
26555 return (Opcode == ISD::SRA) ? AShift : LShift;
26556}
26557
26558static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
26559 const X86Subtarget &Subtarget) {
26560 MVT VT = Op.getSimpleValueType();
26561 SDLoc dl(Op);
26562 SDValue R = Op.getOperand(0);
26563 SDValue Amt = Op.getOperand(1);
26564 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
26565
26566 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
26567    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
26568 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
26569 SDValue Ex = DAG.getBitcast(ExVT, R);
26570
26571 // ashr(R, 63) === cmp_slt(R, 0)
26572 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
26573      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
26574             "Unsupported PCMPGT op");
26575 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
26576 }
26577
26578 if (ShiftAmt >= 32) {
26579 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
26580 SDValue Upper =
26581 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
26582 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
26583 ShiftAmt - 32, DAG);
26584 if (VT == MVT::v2i64)
26585 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
26586 if (VT == MVT::v4i64)
26587 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
26588 {9, 1, 11, 3, 13, 5, 15, 7});
26589 } else {
26590 // SRA upper i32, SRL whole i64 and select lower i32.
26591 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
26592 ShiftAmt, DAG);
26593 SDValue Lower =
26594 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
26595 Lower = DAG.getBitcast(ExVT, Lower);
26596 if (VT == MVT::v2i64)
26597 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
26598 if (VT == MVT::v4i64)
26599 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
26600 {8, 1, 10, 3, 12, 5, 14, 7});
26601 }
26602 return DAG.getBitcast(VT, Ex);
26603 };
26604
26605 // Optimize shl/srl/sra with constant shift amount.
26606 APInt APIntShiftAmt;
26607 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
26608 return SDValue();
26609
26610 // If the shift amount is out of range, return undef.
26611 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
26612 return DAG.getUNDEF(VT);
26613
26614 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
26615
26616 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
26617 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
26618
26619 // i64 SRA needs to be performed as partial shifts.
26620 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
26621 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
26622 Op.getOpcode() == ISD::SRA)
26623 return ArithmeticShiftRight64(ShiftAmt);
26624
26625 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
26626 VT == MVT::v64i8) {
26627 unsigned NumElts = VT.getVectorNumElements();
26628 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
26629
26630 // Simple i8 add case
26631 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
26632 return DAG.getNode(ISD::ADD, dl, VT, R, R);
26633
26634 // ashr(R, 7) === cmp_slt(R, 0)
26635 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
26636 SDValue Zeros = DAG.getConstant(0, dl, VT);
26637 if (VT.is512BitVector()) {
26638        assert(VT == MVT::v64i8 && "Unexpected element type!");
26639 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
26640 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
26641 }
26642 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
26643 }
26644
26645 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
26646 if (VT == MVT::v16i8 && Subtarget.hasXOP())
26647 return SDValue();
26648
26649 if (Op.getOpcode() == ISD::SHL) {
26650 // Make a large shift.
26651 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
26652 ShiftAmt, DAG);
26653 SHL = DAG.getBitcast(VT, SHL);
26654 // Zero out the rightmost bits.
26655 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
26656 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
26657 }
26658 if (Op.getOpcode() == ISD::SRL) {
26659 // Make a large shift.
26660 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
26661 ShiftAmt, DAG);
26662 SRL = DAG.getBitcast(VT, SRL);
26663 // Zero out the leftmost bits.
26664 return DAG.getNode(ISD::AND, dl, VT, SRL,
26665 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
26666 }
26667 if (Op.getOpcode() == ISD::SRA) {
26668 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
26669 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
26670
26671 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
26672 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
26673 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
26674 return Res;
26675 }
26676    llvm_unreachable("Unknown shift opcode.");
26677 }
26678
26679 return SDValue();
26680}
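Editor's note: the ISD::SRA branch above (ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) with Mask = 128 >> Amt) is the standard sign-extension-after-logical-shift trick. The byte-sized scalar sketch below is not part of the LLVM source; the helper name is illustrative.

#include <cassert>
#include <cstdint>

// Scalar model of the vXi8 arithmetic-shift emulation: do a logical shift,
// then xor/sub with the shifted sign-bit mask to sign-extend the result.
static uint8_t ashr8_via_lshr(uint8_t x, unsigned amt) {
  uint8_t lshr = static_cast<uint8_t>(x >> amt);
  uint8_t mask = static_cast<uint8_t>(0x80u >> amt);
  return static_cast<uint8_t>((lshr ^ mask) - mask);
}

int main() {
  for (unsigned amt = 0; amt != 8; ++amt)
    for (unsigned v = 0; v != 256; ++v) {
      // Reference arithmetic shift on the signed interpretation of the byte.
      int8_t expect = static_cast<int8_t>(static_cast<int8_t>(v) >> amt);
      assert(ashr8_via_lshr(static_cast<uint8_t>(v), amt) ==
             static_cast<uint8_t>(expect));
    }
  return 0;
}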
26681
26682static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
26683 const X86Subtarget &Subtarget) {
26684 MVT VT = Op.getSimpleValueType();
26685 SDLoc dl(Op);
26686 SDValue R = Op.getOperand(0);
26687 SDValue Amt = Op.getOperand(1);
26688 unsigned Opcode = Op.getOpcode();
26689 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
26690 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
26691
26692 if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
26693 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
26694 MVT EltVT = VT.getVectorElementType();
26695      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
26696 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
26697 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
26698 else if (EltVT.bitsLT(MVT::i32))
26699 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
26700
26701 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
26702 }
26703
26704 // vXi8 shifts - shift as v8i16 + mask result.
26705 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
26706 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
26707 VT == MVT::v64i8) &&
26708 !Subtarget.hasXOP()) {
26709 unsigned NumElts = VT.getVectorNumElements();
26710 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
26711 if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
26712 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
26713 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
26714 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
26715
26716 // Create the mask using vXi16 shifts. For shift-rights we need to move
26717 // the upper byte down before splatting the vXi8 mask.
26718 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
26719 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
26720 BaseShAmt, Subtarget, DAG);
26721 if (Opcode != ISD::SHL)
26722 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
26723 8, DAG);
26724 BitMask = DAG.getBitcast(VT, BitMask);
26725 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
26726 SmallVector<int, 64>(NumElts, 0));
26727
26728 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
26729 DAG.getBitcast(ExtVT, R), BaseShAmt,
26730 Subtarget, DAG);
26731 Res = DAG.getBitcast(VT, Res);
26732 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
26733
26734 if (Opcode == ISD::SRA) {
26735 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
26736 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
26737 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
26738 SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
26739 BaseShAmt, Subtarget, DAG);
26740 SignMask = DAG.getBitcast(VT, SignMask);
26741 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
26742 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
26743 }
26744 return Res;
26745 }
26746 }
26747 }
26748
26749 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
26750 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
26751 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
26752 Amt = Amt.getOperand(0);
26753 unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
26754 std::vector<SDValue> Vals(Ratio);
26755 for (unsigned i = 0; i != Ratio; ++i)
26756 Vals[i] = Amt.getOperand(i);
26757 for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
26758 for (unsigned j = 0; j != Ratio; ++j)
26759 if (Vals[j] != Amt.getOperand(i + j))
26760 return SDValue();
26761 }
26762
26763 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
26764 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
26765 }
26766 return SDValue();
26767}
26768
26769// Convert a shift/rotate left amount to a multiplication scale factor.
26770static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
26771 const X86Subtarget &Subtarget,
26772 SelectionDAG &DAG) {
26773 MVT VT = Amt.getSimpleValueType();
26774 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
26775 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
26776 (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
26777 return SDValue();
26778
26779 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
26780 SmallVector<SDValue, 8> Elts;
26781 MVT SVT = VT.getVectorElementType();
26782 unsigned SVTBits = SVT.getSizeInBits();
26783 APInt One(SVTBits, 1);
26784 unsigned NumElems = VT.getVectorNumElements();
26785
26786 for (unsigned i = 0; i != NumElems; ++i) {
26787 SDValue Op = Amt->getOperand(i);
26788 if (Op->isUndef()) {
26789 Elts.push_back(Op);
26790 continue;
26791 }
26792
26793 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
26794 APInt C(SVTBits, ND->getZExtValue());
26795 uint64_t ShAmt = C.getZExtValue();
26796 if (ShAmt >= SVTBits) {
26797 Elts.push_back(DAG.getUNDEF(SVT));
26798 continue;
26799 }
26800 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
26801 }
26802 return DAG.getBuildVector(VT, dl, Elts);
26803 }
26804
26805 // If the target doesn't support variable shifts, use either FP conversion
26806 // or integer multiplication to avoid shifting each element individually.
26807 if (VT == MVT::v4i32) {
26808 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
26809 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
26810 DAG.getConstant(0x3f800000U, dl, VT));
26811 Amt = DAG.getBitcast(MVT::v4f32, Amt);
26812 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
26813 }
26814
26815 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
26816 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
26817 SDValue Z = DAG.getConstant(0, dl, VT);
26818 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
26819 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
26820 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
26821 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
26822 if (Subtarget.hasSSE41())
26823 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
26824
26825 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
26826 DAG.getBitcast(VT, Hi),
26827 {0, 2, 4, 6, 8, 10, 12, 14});
26828 }
26829
26830 return SDValue();
26831}
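Editor's note: the v4i32 path in convertShiftLeftToScale above builds 2^Amt per lane by forming an IEEE-754 single with exponent 127 + Amt, i.e. bit pattern (Amt << 23) + 0x3f800000, and converting it back to integer. The standalone sketch below is not part of the LLVM source; it checks the scalar form of that construction for shift amounts 0..30 (for Amt == 31 the signed conversion overflows in portable C++, while x86's cvttps2dq overflow result 0x80000000 happens to coincide with 1u << 31).

#include <cassert>
#include <cstdint>
#include <cstring>

// Scalar model of the float-exponent trick: (s << 23) + 0x3f800000 is the
// bit pattern of the float 2^s, so converting it back to int yields 1 << s.
static uint32_t pow2_via_float_bits(uint32_t s) {
  uint32_t bits = (s << 23) + 0x3f800000u;
  float f;
  std::memcpy(&f, &bits, sizeof(f));                      // bitcast
  return static_cast<uint32_t>(static_cast<int32_t>(f));  // FP_TO_SINT
}

int main() {
  for (uint32_t s = 0; s != 31; ++s)  // s == 31 overflows the signed convert
    assert(pow2_via_float_bits(s) == (1u << s));
  return 0;
}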
26832
26833static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
26834 SelectionDAG &DAG) {
26835 MVT VT = Op.getSimpleValueType();
26836 SDLoc dl(Op);
26837 SDValue R = Op.getOperand(0);
26838 SDValue Amt = Op.getOperand(1);
26839 unsigned EltSizeInBits = VT.getScalarSizeInBits();
26840 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
26841
26842 unsigned Opc = Op.getOpcode();
26843 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
26844 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
26845
26846  assert(VT.isVector() && "Custom lowering only for vector shifts!");
26847  assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
26848
26849 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
26850 return V;
26851
26852 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
26853 return V;
26854
26855 if (SupportedVectorVarShift(VT, Subtarget, Opc))
26856 return Op;
26857
26858 // XOP has 128-bit variable logical/arithmetic shifts.
26859 // +ve/-ve Amt = shift left/right.
26860 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
26861 VT == MVT::v8i16 || VT == MVT::v16i8)) {
26862 if (Opc == ISD::SRL || Opc == ISD::SRA) {
26863 SDValue Zero = DAG.getConstant(0, dl, VT);
26864 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
26865 }
26866 if (Opc == ISD::SHL || Opc == ISD::SRL)
26867 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
26868 if (Opc == ISD::SRA)
26869 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
26870 }
26871
26872 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
26873 // shifts per-lane and then shuffle the partial results back together.
26874 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
26875 // Splat the shift amounts so the scalar shifts above will catch it.
26876 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
26877 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
26878 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
26879 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
26880 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
26881 }
26882
26883 // i64 vector arithmetic shift can be emulated with the transform:
26884 // M = lshr(SIGN_MASK, Amt)
26885 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
26886 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
26887 Opc == ISD::SRA) {
26888 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
26889 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
26890 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
26891 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
26892 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
26893 return R;
26894 }
26895
26896 // If possible, lower this shift as a sequence of two shifts by
26897 // constant plus a BLENDing shuffle instead of scalarizing it.
26898 // Example:
26899 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
26900 //
26901 // Could be rewritten as:
26902 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
26903 //
26904 // The advantage is that the two shifts from the example would be
26905 // lowered as X86ISD::VSRLI nodes in parallel before blending.
26906 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
26907 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
26908 SDValue Amt1, Amt2;
26909 unsigned NumElts = VT.getVectorNumElements();
26910 SmallVector<int, 8> ShuffleMask;
26911 for (unsigned i = 0; i != NumElts; ++i) {
26912 SDValue A = Amt->getOperand(i);
26913 if (A.isUndef()) {
26914 ShuffleMask.push_back(SM_SentinelUndef);
26915 continue;
26916 }
26917 if (!Amt1 || Amt1 == A) {
26918 ShuffleMask.push_back(i);
26919 Amt1 = A;
26920 continue;
26921 }
26922 if (!Amt2 || Amt2 == A) {
26923 ShuffleMask.push_back(i + NumElts);
26924 Amt2 = A;
26925 continue;
26926 }
26927 break;
26928 }
26929
26930 // Only perform this blend if we can perform it without loading a mask.
26931 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
26932 (VT != MVT::v16i16 ||
26933 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
26934 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
26935 canWidenShuffleElements(ShuffleMask))) {
26936 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
26937 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
26938 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
26939 Cst2->getAPIntValue().ult(EltSizeInBits)) {
26940 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
26941 Cst1->getZExtValue(), DAG);
26942 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
26943 Cst2->getZExtValue(), DAG);
26944 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
26945 }
26946 }
26947 }
26948
26949 // If possible, lower this packed shift into a vector multiply instead of
26950 // expanding it into a sequence of scalar shifts.
26951 if (Opc == ISD::SHL)
26952 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
26953 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
26954
26955 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
26956  // can replace with ISD::MULHU, creating a scale factor from (NumEltBits - Amt).
26957 if (Opc == ISD::SRL && ConstantAmt &&
26958 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
26959 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
26960 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
26961 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
26962 SDValue Zero = DAG.getConstant(0, dl, VT);
26963 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
26964 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
26965 return DAG.getSelect(dl, VT, ZAmt, R, Res);
26966 }
26967 }
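Editor's note: the block above replaces a constant vXi16 SRL with an unsigned high multiply: for 1 <= Amt < 16, x >> Amt equals the high 16 bits of x * 2^(16 - Amt), and Amt == 0 lanes are patched back in with the SETEQ/select. The scalar sketch below is not part of the LLVM source and only verifies that identity; the name is illustrative.

#include <cassert>
#include <cstdint>

// Scalar model: the high 16 bits of x * 2^(16 - amt) are exactly x >> amt.
static uint16_t srl16_via_mulhu(uint16_t x, unsigned amt) {
  if (amt == 0)  // the DAG code selects the original value for zero amounts
    return x;
  uint32_t scale = 1u << (16 - amt);
  return static_cast<uint16_t>((static_cast<uint32_t>(x) * scale) >> 16);
}

int main() {
  for (unsigned amt = 0; amt != 16; ++amt)
    for (uint32_t v = 0; v != 0x10000; ++v)
      assert(srl16_via_mulhu(static_cast<uint16_t>(v), amt) ==
             static_cast<uint16_t>(v >> amt));
  return 0;
}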
26968
26969 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
26970  // can replace with ISD::MULHS, creating a scale factor from (NumEltBits - Amt).
26971 // TODO: Special case handling for shift by 0/1, really we can afford either
26972 // of these cases in pre-SSE41/XOP/AVX512 but not both.
26973 if (Opc == ISD::SRA && ConstantAmt &&
26974 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
26975 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
26976 !Subtarget.hasAVX512()) ||
26977 DAG.isKnownNeverZero(Amt))) {
26978 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
26979 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
26980 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
26981 SDValue Amt0 =
26982 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
26983 SDValue Amt1 =
26984 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
26985 SDValue Sra1 =
26986 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
26987 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
26988 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
26989 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
26990 }
26991 }
26992
26993 // v4i32 Non Uniform Shifts.
26994 // If the shift amount is constant we can shift each lane using the SSE2
26995 // immediate shifts, else we need to zero-extend each lane to the lower i64
26996 // and shift using the SSE2 variable shifts.
26997 // The separate results can then be blended together.
26998 if (VT == MVT::v4i32) {
26999 SDValue Amt0, Amt1, Amt2, Amt3;
27000 if (ConstantAmt) {
27001 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
27002 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
27003 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
27004 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
27005 } else {
27006 // The SSE2 shifts use the lower i64 as the same shift amount for
27007 // all lanes and the upper i64 is ignored. On AVX we're better off
27008 // just zero-extending, but for SSE just duplicating the top 16-bits is
27009 // cheaper and has the same effect for out of range values.
27010 if (Subtarget.hasAVX()) {
27011 SDValue Z = DAG.getConstant(0, dl, VT);
27012 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
27013 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
27014 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
27015 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
27016 } else {
27017 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
27018 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
27019 {4, 5, 6, 7, -1, -1, -1, -1});
27020 Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
27021 {0, 1, 1, 1, -1, -1, -1, -1});
27022 Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
27023 {2, 3, 3, 3, -1, -1, -1, -1});
27024 Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
27025 {0, 1, 1, 1, -1, -1, -1, -1});
27026 Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
27027 {2, 3, 3, 3, -1, -1, -1, -1});
27028 }
27029 }
27030
27031 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
27032 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
27033 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
27034 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
27035 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
27036
27037 // Merge the shifted lane results optimally with/without PBLENDW.
27038 // TODO - ideally shuffle combining would handle this.
27039 if (Subtarget.hasSSE41()) {
27040 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
27041 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
27042 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
27043 }
27044 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
27045 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
27046 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
27047 }
27048
27049 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
27050 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
27051 // make the existing SSE solution better.
27052  // NOTE: We honor preferred vector width before promoting to 512-bits.
27053 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
27054 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
27055 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
27056 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
27057 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
27058    assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
27059           "Unexpected vector type");
27060 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
27061 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
27062 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27063 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
27064 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
27065 return DAG.getNode(ISD::TRUNCATE, dl, VT,
27066 DAG.getNode(Opc, dl, ExtVT, R, Amt));
27067 }
27068
27069 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
27070 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
27071 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
27072 (VT == MVT::v16i8 || VT == MVT::v64i8 ||
27073 (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
27074 !Subtarget.hasXOP()) {
27075 int NumElts = VT.getVectorNumElements();
27076 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
27077
27078 // Extend constant shift amount to vXi16 (it doesn't matter if the type
27079 // isn't legal).
27080 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
27081 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
27082 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
27083 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
27084    assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
27085           "Constant build vector expected");
27086
27087 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
27088 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
27089 : DAG.getZExtOrTrunc(R, dl, ExVT);
27090 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
27091 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
27092 return DAG.getZExtOrTrunc(R, dl, VT);
27093 }
27094
27095 SmallVector<SDValue, 16> LoAmt, HiAmt;
27096 for (int i = 0; i != NumElts; i += 16) {
27097 for (int j = 0; j != 8; ++j) {
27098 LoAmt.push_back(Amt.getOperand(i + j));
27099 HiAmt.push_back(Amt.getOperand(i + j + 8));
27100 }
27101 }
27102
27103 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
27104 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
27105 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
27106
27107 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
27108 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
27109 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
27110 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
27111 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
27112 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
27113 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
27114 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
27115 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
27116 }
27117
27118 if (VT == MVT::v16i8 ||
27119 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
27120 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
27121 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
27122
27123 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
27124 if (VT.is512BitVector()) {
27125 // On AVX512BW targets we make use of the fact that VSELECT lowers
27126 // to a masked blend which selects bytes based just on the sign bit
27127 // extracted to a mask.
27128 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
27129 V0 = DAG.getBitcast(VT, V0);
27130 V1 = DAG.getBitcast(VT, V1);
27131 Sel = DAG.getBitcast(VT, Sel);
27132 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
27133 ISD::SETGT);
27134 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
27135 } else if (Subtarget.hasSSE41()) {
27136 // On SSE41 targets we make use of the fact that VSELECT lowers
27137 // to PBLENDVB which selects bytes based just on the sign bit.
27138 V0 = DAG.getBitcast(VT, V0);
27139 V1 = DAG.getBitcast(VT, V1);
27140 Sel = DAG.getBitcast(VT, Sel);
27141 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
27142 }
27143 // On pre-SSE41 targets we test for the sign bit by comparing to
27144 // zero - a negative value will set all bits of the lanes to true
27145 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
27146 SDValue Z = DAG.getConstant(0, dl, SelVT);
27147 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
27148 return DAG.getSelect(dl, SelVT, C, V0, V1);
27149 };
27150
27151 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
27152 // We can safely do this using i16 shifts as we're only interested in
27153 // the 3 lower bits of each byte.
27154 Amt = DAG.getBitcast(ExtVT, Amt);
27155 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
27156 Amt = DAG.getBitcast(VT, Amt);
27157
27158 if (Opc == ISD::SHL || Opc == ISD::SRL) {
27159 // r = VSELECT(r, shift(r, 4), a);
27160 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
27161 R = SignBitSelect(VT, Amt, M, R);
27162
27163 // a += a
27164 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
27165
27166 // r = VSELECT(r, shift(r, 2), a);
27167 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
27168 R = SignBitSelect(VT, Amt, M, R);
27169
27170 // a += a
27171 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
27172
27173 // return VSELECT(r, shift(r, 1), a);
27174 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
27175 R = SignBitSelect(VT, Amt, M, R);
27176 return R;
27177 }
27178
27179 if (Opc == ISD::SRA) {
27180 // For SRA we need to unpack each byte to the higher byte of a i16 vector
27181 // so we can correctly sign extend. We don't care what happens to the
27182 // lower byte.
27183 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
27184 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
27185 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
27186 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
27187 ALo = DAG.getBitcast(ExtVT, ALo);
27188 AHi = DAG.getBitcast(ExtVT, AHi);
27189 RLo = DAG.getBitcast(ExtVT, RLo);
27190 RHi = DAG.getBitcast(ExtVT, RHi);
27191
27192 // r = VSELECT(r, shift(r, 4), a);
27193 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
27194 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
27195 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
27196 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
27197
27198 // a += a
27199 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
27200 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
27201
27202 // r = VSELECT(r, shift(r, 2), a);
27203 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
27204 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
27205 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
27206 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
27207
27208 // a += a
27209 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
27210 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
27211
27212 // r = VSELECT(r, shift(r, 1), a);
27213 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
27214 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
27215 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
27216 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
27217
27218 // Logical shift the result back to the lower byte, leaving a zero upper
27219 // byte meaning that we can safely pack with PACKUSWB.
27220 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
27221 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
27222 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
27223 }
27224 }
27225
27226 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
27227 MVT ExtVT = MVT::v8i32;
27228 SDValue Z = DAG.getConstant(0, dl, VT);
27229 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
27230 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
27231 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
27232 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
27233 ALo = DAG.getBitcast(ExtVT, ALo);
27234 AHi = DAG.getBitcast(ExtVT, AHi);
27235 RLo = DAG.getBitcast(ExtVT, RLo);
27236 RHi = DAG.getBitcast(ExtVT, RHi);
27237 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
27238 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
27239 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
27240 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
27241 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
27242 }
27243
27244 if (VT == MVT::v8i16) {
27245 // If we have a constant shift amount, the non-SSE41 path is best as
27246 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
27247 bool UseSSE41 = Subtarget.hasSSE41() &&
27248 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
27249
27250 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
27251 // On SSE41 targets we make use of the fact that VSELECT lowers
27252 // to PBLENDVB which selects bytes based just on the sign bit.
27253 if (UseSSE41) {
27254 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
27255 V0 = DAG.getBitcast(ExtVT, V0);
27256 V1 = DAG.getBitcast(ExtVT, V1);
27257 Sel = DAG.getBitcast(ExtVT, Sel);
27258 return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
27259 }
27260 // On pre-SSE41 targets we splat the sign bit - a negative value will
27261 // set all bits of the lanes to true and VSELECT uses that in
27262 // its OR(AND(V0,C),AND(V1,~C)) lowering.
27263 SDValue C =
27264 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
27265 return DAG.getSelect(dl, VT, C, V0, V1);
27266 };
27267
27268 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
27269 if (UseSSE41) {
27270 // On SSE41 targets we need to replicate the shift mask in both
27271 // bytes for PBLENDVB.
27272 Amt = DAG.getNode(
27273 ISD::OR, dl, VT,
27274 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
27275 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
27276 } else {
27277 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
27278 }
27279
27280 // r = VSELECT(r, shift(r, 8), a);
27281 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
27282 R = SignBitSelect(Amt, M, R);
27283
27284 // a += a
27285 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
27286
27287 // r = VSELECT(r, shift(r, 4), a);
27288 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
27289 R = SignBitSelect(Amt, M, R);
27290
27291 // a += a
27292 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
27293
27294 // r = VSELECT(r, shift(r, 2), a);
27295 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
27296 R = SignBitSelect(Amt, M, R);
27297
27298 // a += a
27299 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
27300
27301 // return VSELECT(r, shift(r, 1), a);
27302 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
27303 R = SignBitSelect(Amt, M, R);
27304 return R;
27305 }
27306
27307 // Decompose 256-bit shifts into 128-bit shifts.
27308 if (VT.is256BitVector())
27309 return split256IntArith(Op, DAG);
27310
27311 return SDValue();
27312}
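// A minimal standalone sketch of the staged variable byte-shift used above,
// assuming plain <cstdint> scalars: only the low 3 bits of the amount matter,
// and each bit selects a constant shift of 4, 2 or 1, mirroring the
// VSELECT/PBLENDVB ladder rather than reproducing the DAG code.
#include <cstdint>
static inline uint8_t ShiftLeftByteLane(uint8_t R, uint8_t Amt) {
  if (Amt & 4) R = (uint8_t)(R << 4); // amount bit 2 selects shift-by-4
  if (Amt & 2) R = (uint8_t)(R << 2); // amount bit 1 selects shift-by-2
  if (Amt & 1) R = (uint8_t)(R << 1); // amount bit 0 selects shift-by-1
  return R;
}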
27313
27314static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
27315 SelectionDAG &DAG) {
27316 MVT VT = Op.getSimpleValueType();
27317 assert(VT.isVector() && "Custom lowering only for vector rotates!");
27318
27319 SDLoc DL(Op);
27320 SDValue R = Op.getOperand(0);
27321 SDValue Amt = Op.getOperand(1);
27322 unsigned Opcode = Op.getOpcode();
27323 unsigned EltSizeInBits = VT.getScalarSizeInBits();
27324 int NumElts = VT.getVectorNumElements();
27325
27326 // Check for constant splat rotation amount.
27327 APInt UndefElts;
27328 SmallVector<APInt, 32> EltBits;
27329 int CstSplatIndex = -1;
27330 if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits))
27331 for (int i = 0; i != NumElts; ++i)
27332 if (!UndefElts[i]) {
27333 if (CstSplatIndex < 0 || EltBits[i] == EltBits[CstSplatIndex]) {
27334 CstSplatIndex = i;
27335 continue;
27336 }
27337 CstSplatIndex = -1;
27338 break;
27339 }
27340
27341 // Check for splat rotate by zero.
27342 if (0 <= CstSplatIndex && EltBits[CstSplatIndex].urem(EltSizeInBits) == 0)
27343 return R;
27344
27345 // AVX512 implicitly uses modulo rotation amounts.
27346 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
27347 // Attempt to rotate by immediate.
27348 if (0 <= CstSplatIndex) {
27349 unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
27350 uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
27351 return DAG.getNode(Op, DL, VT, R,
27352 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
27353 }
27354
27355 // Else, fall-back on VPROLV/VPRORV.
27356 return Op;
27357 }
27358
27359 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
27360
27361 // XOP has 128-bit vector variable + immediate rotates.
27362 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
27363 // XOP implicitly uses modulo rotation amounts.
27364 if (Subtarget.hasXOP()) {
27365 if (VT.is256BitVector())
27366 return split256IntArith(Op, DAG);
27367 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
27368
27369 // Attempt to rotate by immediate.
27370 if (0 <= CstSplatIndex) {
27371 uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
27372 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
27373 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
27374 }
27375
27376 // Use general rotate by variable (per-element).
27377 return Op;
27378 }
27379
27380 // Split 256-bit integers on pre-AVX2 targets.
27381 if (VT.is256BitVector() && !Subtarget.hasAVX2())
27382 return split256IntArith(Op, DAG);
27383
27384 assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
27385 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
27386 Subtarget.hasAVX2())) &&
27387 "Only vXi32/vXi16/vXi8 vector rotates supported");
27388
27389 // Rotate by a uniform constant - expand back to shifts.
27390 if (0 <= CstSplatIndex)
27391 return SDValue();
27392
27393 bool IsSplatAmt = DAG.isSplatValue(Amt);
27394
27395 // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
27396 // the amount bit.
27397 if (EltSizeInBits == 8 && !IsSplatAmt) {
27398 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
27399 return SDValue();
27400
27401 // We don't need ModuloAmt here as we just peek at individual bits.
27402 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
27403
27404 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
27405 if (Subtarget.hasSSE41()) {
27406 // On SSE41 targets we make use of the fact that VSELECT lowers
27407 // to PBLENDVB which selects bytes based just on the sign bit.
27408 V0 = DAG.getBitcast(VT, V0);
27409 V1 = DAG.getBitcast(VT, V1);
27410 Sel = DAG.getBitcast(VT, Sel);
27411 return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1));
27412 }
27413 // On pre-SSE41 targets we test for the sign bit by comparing to
27414 // zero - a negative value will set all bits of the lanes to true
27415 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
27416 SDValue Z = DAG.getConstant(0, DL, SelVT);
27417 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
27418 return DAG.getSelect(DL, SelVT, C, V0, V1);
27419 };
27420
27421 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
27422 // We can safely do this using i16 shifts as we're only interested in
27423 // the 3 lower bits of each byte.
27424 Amt = DAG.getBitcast(ExtVT, Amt);
27425 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
27426 Amt = DAG.getBitcast(VT, Amt);
27427
27428 // r = VSELECT(r, rot(r, 4), a);
27429 SDValue M;
27430 M = DAG.getNode(
27431 ISD::OR, DL, VT,
27432 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
27433 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
27434 R = SignBitSelect(VT, Amt, M, R);
27435
27436 // a += a
27437 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
27438
27439 // r = VSELECT(r, rot(r, 2), a);
27440 M = DAG.getNode(
27441 ISD::OR, DL, VT,
27442 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
27443 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
27444 R = SignBitSelect(VT, Amt, M, R);
27445
27446 // a += a
27447 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
27448
27449 // return VSELECT(r, rot(r, 1), a);
27450 M = DAG.getNode(
27451 ISD::OR, DL, VT,
27452 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
27453 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
27454 return SignBitSelect(VT, Amt, M, R);
27455 }
27456
27457 // ISD::ROT* uses modulo rotate amounts.
27458 Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
27459 DAG.getConstant(EltSizeInBits - 1, DL, VT));
27460
27461 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
27462 bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
27463 SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
27464
27465 // Fallback for splats + all supported variable shifts.
27466 // Fallback for non-constant AVX2 vXi16 as well.
27467 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
27468 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
27469 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
27470 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
27471 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
27472 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
27473 }
27474
27475 // As with shifts, convert the rotation amount to a multiplication factor.
27476 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
27477 assert(Scale && "Failed to convert ROTL amount to scale");
27478
27479 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
27480 if (EltSizeInBits == 16) {
27481 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
27482 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
27483 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
27484 }
27485
27486 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
27487 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
27488 // that can then be OR'd with the lower 32-bits.
27489 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
27490 static const int OddMask[] = {1, -1, 3, -1};
27491 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
27492 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
27493
27494 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
27495 DAG.getBitcast(MVT::v2i64, R),
27496 DAG.getBitcast(MVT::v2i64, Scale));
27497 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
27498 DAG.getBitcast(MVT::v2i64, R13),
27499 DAG.getBitcast(MVT::v2i64, Scale13));
27500 Res02 = DAG.getBitcast(VT, Res02);
27501 Res13 = DAG.getBitcast(VT, Res13);
27502
27503 return DAG.getNode(ISD::OR, DL, VT,
27504 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
27505 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
27506}
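// A minimal standalone sketch of the v4i32 rotate-by-multiply trick above,
// assuming <cstdint> scalars: a 32x32->64-bit multiply by (1 << Amt) places the
// wrapped-around bits in the upper half, which is then OR'd back in - exactly
// what PMULUDQ provides per pair of lanes.
#include <cstdint>
static inline uint32_t RotateLeft32ViaMul(uint32_t X, unsigned Amt) {
  uint64_t Prod = (uint64_t)X * ((uint64_t)1 << (Amt & 31u)); // PMULUDQ-style widening
  return (uint32_t)Prod | (uint32_t)(Prod >> 32);             // low half OR wrapped high half
}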
27507
27508/// Returns true if the operand type is exactly twice the native width, and
27509/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
27510/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
27511/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
27512bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
27513 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
27514
27515 if (OpWidth == 64)
27516 return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
27517 if (OpWidth == 128)
27518 return Subtarget.hasCmpxchg16b();
27519
27520 return false;
27521}
27522
27523bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
27524 Type *MemType = SI->getValueOperand()->getType();
27525
27526 bool NoImplicitFloatOps =
27527 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
27528 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
27529 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
27530 (Subtarget.hasSSE1() || Subtarget.hasX87()))
27531 return false;
27532
27533 return needsCmpXchgNb(MemType);
27534}
27535
27536// Note: this turns large loads into lock cmpxchg8b/16b.
27537// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
27538TargetLowering::AtomicExpansionKind
27539X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
27540 Type *MemType = LI->getType();
27541
27542 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
27543 // can use movq to do the load. If we have X87 we can load into an 80-bit
27544 // X87 register and store it to a stack temporary.
27545 bool NoImplicitFloatOps =
27546 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
27547 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
27548 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
27549 (Subtarget.hasSSE1() || Subtarget.hasX87()))
27550 return AtomicExpansionKind::None;
27551
27552 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
27553 : AtomicExpansionKind::None;
27554}
27555
27556TargetLowering::AtomicExpansionKind
27557X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
27558 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
27559 Type *MemType = AI->getType();
27560
27561 // If the operand is too big, we must see if cmpxchg8/16b is available
27562 // and default to library calls otherwise.
27563 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
27564 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
27565 : AtomicExpansionKind::None;
27566 }
27567
27568 AtomicRMWInst::BinOp Op = AI->getOperation();
27569 switch (Op) {
27570 default:
27571 llvm_unreachable("Unknown atomic operation")::llvm::llvm_unreachable_internal("Unknown atomic operation",
"/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 27571)
;
27572 case AtomicRMWInst::Xchg:
27573 case AtomicRMWInst::Add:
27574 case AtomicRMWInst::Sub:
27575 // It's better to use xadd, xsub or xchg for these in all cases.
27576 return AtomicExpansionKind::None;
27577 case AtomicRMWInst::Or:
27578 case AtomicRMWInst::And:
27579 case AtomicRMWInst::Xor:
27580 // If the atomicrmw's result isn't actually used, we can just add a "lock"
27581 // prefix to a normal instruction for these operations.
27582 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
27583 : AtomicExpansionKind::None;
27584 case AtomicRMWInst::Nand:
27585 case AtomicRMWInst::Max:
27586 case AtomicRMWInst::Min:
27587 case AtomicRMWInst::UMax:
27588 case AtomicRMWInst::UMin:
27589 case AtomicRMWInst::FAdd:
27590 case AtomicRMWInst::FSub:
27591 // These always require a non-trivial set of data operations on x86. We must
27592 // use a cmpxchg loop.
27593 return AtomicExpansionKind::CmpXChg;
27594 }
27595}
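// A usage-level illustration of the policy above, assuming std::atomic maps to
// atomicrmw as usual (sketch only): when the old value is ignored, OR/AND/XOR
// can stay a single lock-prefixed instruction; when it is consumed, a cmpxchg
// loop is required.
#include <atomic>
void SetFlagBit(std::atomic<unsigned> &Flags) {
  Flags.fetch_or(1u, std::memory_order_seq_cst);        // result unused -> lock or
}
unsigned SetFlagBitAndGetOld(std::atomic<unsigned> &Flags) {
  return Flags.fetch_or(1u, std::memory_order_seq_cst); // result used -> cmpxchg loop
}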
27596
27597LoadInst *
27598X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
27599 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
27600 Type *MemType = AI->getType();
27601 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
27602 // there is no benefit in turning such RMWs into loads, and it is actually
27603 // harmful as it introduces a mfence.
27604 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
27605 return nullptr;
27606
27607 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
27608 // lowering available in lowerAtomicArith.
27609 // TODO: push more cases through this path.
27610 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
27611 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
27612 AI->use_empty())
27613 return nullptr;
27614
27615 IRBuilder<> Builder(AI);
27616 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
27617 auto SSID = AI->getSyncScopeID();
27618 // We must restrict the ordering to avoid generating loads with Release or
27619 // ReleaseAcquire orderings.
27620 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
27621
27622 // Before the load we need a fence. Here is an example lifted from
27623 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
27624 // is required:
27625 // Thread 0:
27626 // x.store(1, relaxed);
27627 // r1 = y.fetch_add(0, release);
27628 // Thread 1:
27629 // y.fetch_add(42, acquire);
27630 // r2 = x.load(relaxed);
27631 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
27632 // lowered to just a load without a fence. A mfence flushes the store buffer,
27633 // making the optimization clearly correct.
27634 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
27635 // otherwise, we might be able to be more aggressive on relaxed idempotent
27636 // rmw. In practice, they do not look useful, so we don't try to be
27637 // especially clever.
27638 if (SSID == SyncScope::SingleThread)
27639 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
27640 // the IR level, so we must wrap it in an intrinsic.
27641 return nullptr;
27642
27643 if (!Subtarget.hasMFence())
27644 // FIXME: it might make sense to use a locked operation here but on a
27645 // different cache-line to prevent cache-line bouncing. In practice it
27646 // is probably a small win, and x86 processors without mfence are rare
27647 // enough that we do not bother.
27648 return nullptr;
27649
27650 Function *MFence =
27651 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
27652 Builder.CreateCall(MFence, {});
27653
27654 // Finally we can emit the atomic load.
27655 LoadInst *Loaded =
27656 Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(),
27657 Align(AI->getType()->getPrimitiveSizeInBits()));
27658 Loaded->setAtomic(Order, SSID);
27659 AI->replaceAllUsesWith(Loaded);
27660 AI->eraseFromParent();
27661 return Loaded;
27662}
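// A rough source-level picture of the rewrite above, as a sketch rather than
// the exact IR it produces: an idempotent RMW such as fetch_add(0) is replaced
// by an explicit fence (emitted as mfence here) followed by a plain atomic load.
#include <atomic>
unsigned IdempotentRead(std::atomic<unsigned> &X) {
  // Before the transformation: return X.fetch_add(0, std::memory_order_seq_cst);
  std::atomic_thread_fence(std::memory_order_seq_cst); // the mfence inserted above
  return X.load(std::memory_order_seq_cst);            // the fenced atomic load
}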
27663
27664bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
27665 if (!SI.isUnordered())
27666 return false;
27667 return ExperimentalUnorderedISEL;
27668}
27669bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
27670 if (!LI.isUnordered())
27671 return false;
27672 return ExperimentalUnorderedISEL;
27673}
27674
27675
27676/// Emit a locked operation on a stack location which does not change any
27677/// memory location, but does involve a lock prefix. Location is chosen to be
27678/// a) very likely accessed only by a single thread to minimize cache traffic,
27679/// and b) definitely dereferenceable. Returns the new Chain result.
27680static SDValue emitLockedStackOp(SelectionDAG &DAG,
27681 const X86Subtarget &Subtarget,
27682 SDValue Chain, SDLoc DL) {
27683 // Implementation notes:
27684 // 1) LOCK prefix creates a full read/write reordering barrier for memory
27685 // operations issued by the current processor. As such, the location
27686 // referenced is not relevant for the ordering properties of the instruction.
27687 // See: Intel® 64 and IA-32 Architectures Software Developer's Manual,
27688 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
27689 // 2) Using an immediate operand appears to be the best encoding choice
27690 // here since it doesn't require an extra register.
27691 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
27692 // is small enough it might just be measurement noise.)
27693 // 4) When choosing offsets, there are several contributing factors:
27694 // a) If there's no redzone, we default to TOS. (We could allocate a cache
27695 // line aligned stack object to improve this case.)
27696 // b) To minimize our chances of introducing a false dependence, we prefer
27697 // to offset the stack usage from TOS slightly.
27698 // c) To minimize concerns about cross thread stack usage - in particular,
27699 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
27700 // captures state in the TOS frame and accesses it from many threads -
27701 // we want to use an offset such that the offset is in a distinct cache
27702 // line from the TOS frame.
27703 //
27704 // For a general discussion of the tradeoffs and benchmark results, see:
27705 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
27706
27707 auto &MF = DAG.getMachineFunction();
27708 auto &TFL = *Subtarget.getFrameLowering();
27709 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
27710
27711 if (Subtarget.is64Bit()) {
27712 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
27713 SDValue Ops[] = {
27714 DAG.getRegister(X86::RSP, MVT::i64), // Base
27715 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27716 DAG.getRegister(0, MVT::i64), // Index
27717 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
27718 DAG.getRegister(0, MVT::i16), // Segment.
27719 Zero,
27720 Chain};
27721 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
27722 MVT::Other, Ops);
27723 return SDValue(Res, 1);
27724 }
27725
27726 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
27727 SDValue Ops[] = {
27728 DAG.getRegister(X86::ESP, MVT::i32), // Base
27729 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27730 DAG.getRegister(0, MVT::i32), // Index
27731 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
27732 DAG.getRegister(0, MVT::i16), // Segment.
27733 Zero,
27734 Chain
27735 };
27736 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
27737 MVT::Other, Ops);
27738 return SDValue(Res, 1);
27739}
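// What the locked stack op built above encodes, shown as roughly equivalent GNU
// inline asm for the 64-bit case (a sketch; the -64 displacement matches the
// SPOffset chosen when a 128-byte red zone is present, otherwise 0 is used).
static inline void LockedStackFence64() {
  __asm__ __volatile__("lock orl $0, -64(%%rsp)" ::: "memory", "cc");
}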
27740
27741static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
27742 SelectionDAG &DAG) {
27743 SDLoc dl(Op);
27744 AtomicOrdering FenceOrdering =
27745 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
27746 SyncScope::ID FenceSSID =
27747 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
27748
27749 // The only fence that needs an instruction is a sequentially-consistent
27750 // cross-thread fence.
27751 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
27752 FenceSSID == SyncScope::System) {
27753 if (Subtarget.hasMFence())
27754 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
27755
27756 SDValue Chain = Op.getOperand(0);
27757 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
27758 }
27759
27760 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
27761 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
27762}
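// An illustration of the rule above, assuming the usual std::atomic_thread_fence
// mapping (sketch only): only a sequentially consistent cross-thread fence needs
// a real instruction; weaker fences become a compiler-only barrier.
#include <atomic>
void FenceExamples() {
  std::atomic_thread_fence(std::memory_order_seq_cst); // mfence or locked stack op
  std::atomic_thread_fence(std::memory_order_acquire); // MEMBARRIER: no code emitted
}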
27763
27764static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
27765 SelectionDAG &DAG) {
27766 MVT T = Op.getSimpleValueType();
27767 SDLoc DL(Op);
27768 unsigned Reg = 0;
27769 unsigned size = 0;
27770 switch(T.SimpleTy) {
27771 default: llvm_unreachable("Invalid value type!")::llvm::llvm_unreachable_internal("Invalid value type!", "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 27771)
;
27772 case MVT::i8: Reg = X86::AL; size = 1; break;
27773 case MVT::i16: Reg = X86::AX; size = 2; break;
27774 case MVT::i32: Reg = X86::EAX; size = 4; break;
27775 case MVT::i64:
27776 assert(Subtarget.is64Bit() && "Node not type legal!");
27777 Reg = X86::RAX; size = 8;
27778 break;
27779 }
27780 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
27781 Op.getOperand(2), SDValue());
27782 SDValue Ops[] = { cpIn.getValue(0),
27783 Op.getOperand(1),
27784 Op.getOperand(3),
27785 DAG.getTargetConstant(size, DL, MVT::i8),
27786 cpIn.getValue(1) };
27787 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27788 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
27789 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
27790 Ops, T, MMO);
27791
27792 SDValue cpOut =
27793 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
27794 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
27795 MVT::i32, cpOut.getValue(2));
27796 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
27797
27798 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27799 cpOut, Success, EFLAGS.getValue(1));
27800}
27801
27802// Create MOVMSKB, taking into account whether we need to split for AVX1.
27803static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
27804 const X86Subtarget &Subtarget) {
27805 MVT InVT = V.getSimpleValueType();
27806
27807 if (InVT == MVT::v64i8) {
27808 SDValue Lo, Hi;
27809 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
27810 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
27811 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
27812 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
27813 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
27814 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
27815 DAG.getConstant(32, DL, MVT::i8));
27816 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
27817 }
27818 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
27819 SDValue Lo, Hi;
27820 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
27821 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
27822 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
27823 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
27824 DAG.getConstant(16, DL, MVT::i8));
27825 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
27826 }
27827
27828 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
27829}
27830
27831static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
27832 SelectionDAG &DAG) {
27833 SDValue Src = Op.getOperand(0);
27834 MVT SrcVT = Src.getSimpleValueType();
27835 MVT DstVT = Op.getSimpleValueType();
27836
27837 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
27838 // half to v32i1 and concatenating the result.
27839 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
27840 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
27841 assert(Subtarget.hasBWI() && "Expected BWI target");
27842 SDLoc dl(Op);
27843 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
27844 DAG.getIntPtrConstant(0, dl));
27845 Lo = DAG.getBitcast(MVT::v32i1, Lo);
27846 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
27847 DAG.getIntPtrConstant(1, dl));
27848 Hi = DAG.getBitcast(MVT::v32i1, Hi);
27849 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
27850 }
27851
27852 // Custom splitting for BWI types when AVX512F is available but BWI isn't.
27853 if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
27854 DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
27855 SDLoc dl(Op);
27856 SDValue Lo, Hi;
27857 std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
27858 MVT CastVT = DstVT.getHalfNumVectorElementsVT();
27859 Lo = DAG.getBitcast(CastVT, Lo);
27860 Hi = DAG.getBitcast(CastVT, Hi);
27861 return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
27862 }
27863
27864 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
27865 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
27866 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
27867 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
27868 SDLoc DL(Op);
27869 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
27870 V = getPMOVMSKB(DL, V, DAG, Subtarget);
27871 return DAG.getZExtOrTrunc(V, DL, DstVT);
27872 }
27873
27874 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
27875 SrcVT == MVT::i64) && "Unexpected VT!");
27876
27877 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
27878 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
27879 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
27880 // This conversion needs to be expanded.
27881 return SDValue();
27882
27883 SDLoc dl(Op);
27884 if (SrcVT.isVector()) {
27885 // Widen the vector in input in the case of MVT::v2i32.
27886 // Example: from MVT::v2i32 to MVT::v4i32.
27887 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
27888 SrcVT.getVectorNumElements() * 2);
27889 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
27890 DAG.getUNDEF(SrcVT));
27891 } else {
27892 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
27893 "Unexpected source type in LowerBITCAST");
27894 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
27895 }
27896
27897 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
27898 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
27899
27900 if (DstVT == MVT::x86mmx)
27901 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
27902
27903 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
27904 DAG.getIntPtrConstant(0, dl));
27905}
27906
27907/// Compute the horizontal sum of bytes in V for the elements of VT.
27908///
27909/// Requires V to be a byte vector and VT to be an integer vector type with
27910/// wider elements than V's type. The width of the elements of VT determines
27911/// how many bytes of V are summed horizontally to produce each element of the
27912/// result.
27913static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
27914 const X86Subtarget &Subtarget,
27915 SelectionDAG &DAG) {
27916 SDLoc DL(V);
27917 MVT ByteVecVT = V.getSimpleValueType();
27918 MVT EltVT = VT.getVectorElementType();
27919 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
27920 "Expected value to have byte element type.");
27921 assert(EltVT != MVT::i8 &&
27922 "Horizontal byte sum only makes sense for wider elements!");
27923 unsigned VecSize = VT.getSizeInBits();
27924 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
27925
27926 // The PSADBW instruction horizontally adds all bytes and leaves the result in
27927 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
27928 if (EltVT == MVT::i64) {
27929 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
27930 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
27931 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
27932 return DAG.getBitcast(VT, V);
27933 }
27934
27935 if (EltVT == MVT::i32) {
27936 // We unpack the low half and high half into i32s interleaved with zeros so
27937 // that we can use PSADBW to horizontally sum them. The most useful part of
27938 // this is that it lines up the results of two PSADBW instructions to be
27939 // two v2i64 vectors which concatenated are the 4 population counts. We can
27940 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
27941 SDValue Zeros = DAG.getConstant(0, DL, VT);
27942 SDValue V32 = DAG.getBitcast(VT, V);
27943 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
27944 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
27945
27946 // Do the horizontal sums into two v2i64s.
27947 Zeros = DAG.getConstant(0, DL, ByteVecVT);
27948 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
27949 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
27950 DAG.getBitcast(ByteVecVT, Low), Zeros);
27951 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
27952 DAG.getBitcast(ByteVecVT, High), Zeros);
27953
27954 // Merge them together.
27955 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
27956 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
27957 DAG.getBitcast(ShortVecVT, Low),
27958 DAG.getBitcast(ShortVecVT, High));
27959
27960 return DAG.getBitcast(VT, V);
27961 }
27962
27963 // The only element type left is i16.
27964 assert(EltVT == MVT::i16 && "Unknown how to handle type");
27965
27966 // To obtain pop count for each i16 element starting from the pop count for
27967 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
27968 // right by 8. It is important to shift as i16s as i8 vector shift isn't
27969 // directly supported.
27970 SDValue ShifterV = DAG.getConstant(8, DL, VT);
27971 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
27972 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
27973 DAG.getBitcast(ByteVecVT, V));
27974 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
27975}
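// A minimal scalar model of the i16 tail case above, assuming <cstdint> types
// (sketch only): with the two per-byte pop counts packed into one 16-bit lane,
// shifting left by 8, adding bytewise, and shifting right by 8 leaves the sum
// of both counts in the lane.
#include <cstdint>
static inline uint16_t SumBytePopCounts16(uint16_t PerByteCounts) {
  uint8_t Lo = (uint8_t)PerByteCounts;        // pop count of the low byte
  uint8_t Hi = (uint8_t)(PerByteCounts >> 8); // pop count of the high byte
  uint8_t HiByteOfSum = (uint8_t)(Lo + Hi);   // the bytewise ADD after the SHL by 8
  return (uint16_t)HiByteOfSum;               // the final logical shift right by 8
}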
27976
27977static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
27978 const X86Subtarget &Subtarget,
27979 SelectionDAG &DAG) {
27980 MVT VT = Op.getSimpleValueType();
27981 MVT EltVT = VT.getVectorElementType();
27982 int NumElts = VT.getVectorNumElements();
27983 (void)EltVT;
27984 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
27985
27986 // Implement a lookup table in register by using an algorithm based on:
27987 // http://wm.ite.pl/articles/sse-popcount.html
27988 //
27989 // The general idea is that each nibble of every byte in the input vector is an
27990 // index into an in-register pre-computed pop count table. We then split up the
27991 // input vector into two new ones: (1) a vector with only the shifted-right
27992 // higher nibbles for each byte and (2) a vector with the lower nibbles (and the
27993 // higher ones masked out) for each byte. PSHUFB is used separately with both
27994 // to index the in-register table. Next, both are added and the result is an
27995 // i8 vector where each element contains the pop count for its input byte.
27996 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
27997 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
27998 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
27999 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
28000
28001 SmallVector<SDValue, 64> LUTVec;
28002 for (int i = 0; i < NumElts; ++i)
28003 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
28004 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
28005 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
28006
28007 // High nibbles
28008 SDValue FourV = DAG.getConstant(4, DL, VT);
28009 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
28010
28011 // Low nibbles
28012 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
28013
28014 // The input vector is used as the shuffle mask that index elements into the
28015 // LUT. After counting low and high nibbles, add the vector to obtain the
28016 // final pop count per i8 element.
28017 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
28018 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
28019 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
28020}
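// A scalar model of the nibble-LUT pop count above, reusing the same 16-entry
// table (sketch only): the low and high nibble of each byte index the table
// separately and the two counts are added, which is what the two PSHUFB lookups
// plus the vector ADD do lane by lane.
#include <cstdint>
static const uint8_t PopLUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                   1, 2, 2, 3, 2, 3, 3, 4};
static inline uint8_t PopCount8(uint8_t B) {
  return (uint8_t)(PopLUT[B & 0x0F] + PopLUT[B >> 4]);
}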
28021
28022// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
28023// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
28024static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
28025 SelectionDAG &DAG) {
28026 MVT VT = Op.getSimpleValueType();
28027 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
28028 "Unknown CTPOP type to handle");
28029 SDLoc DL(Op.getNode());
28030 SDValue Op0 = Op.getOperand(0);
28031
28032 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
28033 if (Subtarget.hasVPOPCNTDQ()) {
28034 unsigned NumElems = VT.getVectorNumElements();
28035 assert((VT.getVectorElementType() == MVT::i8 ||
28036 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
28037 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
28038 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28039 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
28040 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
28041 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
28042 }
28043 }
28044
28045 // Decompose 256-bit ops into smaller 128-bit ops.
28046 if (VT.is256BitVector() && !Subtarget.hasInt256())
28047 return Lower256IntUnary(Op, DAG);
28048
28049 // Decompose 512-bit ops into smaller 256-bit ops.
28050 if (VT.is512BitVector() && !Subtarget.hasBWI())
28051 return Lower512IntUnary(Op, DAG);
28052
28053 // For element types greater than i8, do vXi8 pop counts and a bytesum.
28054 if (VT.getScalarType() != MVT::i8) {
28055 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
28056 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
28057 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
28058 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
28059 }
28060
28061 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
28062 if (!Subtarget.hasSSSE3())
28063 return SDValue();
28064
28065 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
28066}
28067
28068static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
28069 SelectionDAG &DAG) {
28070 assert(Op.getSimpleValueType().isVector() &&
28071 "We only do custom lowering for vector population count.");
28072 return LowerVectorCTPOP(Op, Subtarget, DAG);
28073}
28074
28075static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
28076 MVT VT = Op.getSimpleValueType();
28077 SDValue In = Op.getOperand(0);
28078 SDLoc DL(Op);
28079
28080 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
28081 // perform the BITREVERSE.
28082 if (!VT.isVector()) {
28083 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
28084 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
28085 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
28086 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
28087 DAG.getIntPtrConstant(0, DL));
28088 }
28089
28090 int NumElts = VT.getVectorNumElements();
28091 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
28092
28093 // Decompose 256-bit ops into smaller 128-bit ops.
28094 if (VT.is256BitVector())
28095 return Lower256IntUnary(Op, DAG);
28096
28097 assert(VT.is128BitVector() &&
28098 "Only 128-bit vector bitreverse lowering supported.");
28099
28100 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
28101 // perform the BSWAP in the shuffle.
28103 // It's best to shuffle using the second operand as this will implicitly allow
28103 // memory folding for multiple vectors.
28104 SmallVector<SDValue, 16> MaskElts;
28105 for (int i = 0; i != NumElts; ++i) {
28106 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
28107 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
28108 int PermuteByte = SourceByte | (2 << 5);
28109 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
28110 }
28111 }
28112
28113 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
28114 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
28115 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
28116 Res, Mask);
28117 return DAG.getBitcast(VT, Res);
28118}
28119
28120static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
28121 SelectionDAG &DAG) {
28122 MVT VT = Op.getSimpleValueType();
28123
28124 if (Subtarget.hasXOP() && !VT.is512BitVector())
28125 return LowerBITREVERSE_XOP(Op, DAG);
28126
28127 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
28128
28129 SDValue In = Op.getOperand(0);
28130 SDLoc DL(Op);
28131
28132 // Split v8i64/v16i32 without BWI so that we can still use the PSHUFB
28133 // lowering.
28134 if (VT == MVT::v8i64 || VT == MVT::v16i32) {
28135 assert(!Subtarget.hasBWI() && "BWI should Expand BITREVERSE");
28136 return Lower512IntUnary(Op, DAG);
28137 }
28138
28139 unsigned NumElts = VT.getVectorNumElements();
28140 assert(VT.getScalarType() == MVT::i8 &&
28141 "Only byte vector BITREVERSE supported");
28142
28143 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
28144 if (VT.is256BitVector() && !Subtarget.hasInt256())
28145 return Lower256IntUnary(Op, DAG);
28146
28147 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
28148 // two nibbles and a PSHUFB lookup to find the bitreverse of each
28149 // 0-15 value (moved to the other nibble).
28150 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
28151 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
28152 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
28153
28154 const int LoLUT[16] = {
28155 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
28156 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
28157 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
28158 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
28159 const int HiLUT[16] = {
28160 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
28161 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
28162 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
28163 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
28164
28165 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
28166 for (unsigned i = 0; i < NumElts; ++i) {
28167 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
28168 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
28169 }
28170
28171 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
28172 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
28173 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
28174 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
28175 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
28176}
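Editorial note: the PSHUFB lowering above is the vector form of a common nibble-lookup bit reversal. A minimal scalar sketch of the same idea, reusing the same two 16-entry tables (standalone illustration with a hypothetical helper name, not part of X86ISelLowering.cpp):

#include <cstdint>

// Reverse the bits of a single byte. LoLUT maps a low nibble to its
// reversal placed in the high nibble; HiLUT maps a high nibble to its
// reversal placed in the low nibble. OR-ing the two halves mirrors the
// final ISD::OR in the lowering above.
static uint8_t reverseByte(uint8_t B) {
  static const uint8_t LoLUT[16] = {0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0,
                                    0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0,
                                    0x30, 0xB0, 0x70, 0xF0};
  static const uint8_t HiLUT[16] = {0x00, 0x08, 0x04, 0x0C, 0x02, 0x0A,
                                    0x06, 0x0E, 0x01, 0x09, 0x05, 0x0D,
                                    0x03, 0x0B, 0x07, 0x0F};
  return LoLUT[B & 0xF] | HiLUT[B >> 4];
}

For example, reverseByte(0x01) yields 0x80 and reverseByte(0xF0) yields 0x0F.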
28177
28178static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
28179 const X86Subtarget &Subtarget) {
28180 unsigned NewOpc = 0;
28181 switch (N->getOpcode()) {
28182 case ISD::ATOMIC_LOAD_ADD:
28183 NewOpc = X86ISD::LADD;
28184 break;
28185 case ISD::ATOMIC_LOAD_SUB:
28186 NewOpc = X86ISD::LSUB;
28187 break;
28188 case ISD::ATOMIC_LOAD_OR:
28189 NewOpc = X86ISD::LOR;
28190 break;
28191 case ISD::ATOMIC_LOAD_XOR:
28192 NewOpc = X86ISD::LXOR;
28193 break;
28194 case ISD::ATOMIC_LOAD_AND:
28195 NewOpc = X86ISD::LAND;
28196 break;
28197 default:
28198 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
28199 }
28200
28201 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
28202
28203 return DAG.getMemIntrinsicNode(
28204 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
28205 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
28206 /*MemVT=*/N->getSimpleValueType(0), MMO);
28207}
28208
28209/// Lower atomic_load_ops into LOCK-prefixed operations.
28210static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
28211 const X86Subtarget &Subtarget) {
28212 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
28213 SDValue Chain = N->getOperand(0);
28214 SDValue LHS = N->getOperand(1);
28215 SDValue RHS = N->getOperand(2);
28216 unsigned Opc = N->getOpcode();
28217 MVT VT = N->getSimpleValueType(0);
28218 SDLoc DL(N);
28219
28220 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
28221 // can only be lowered when the result is unused. They should have already
28222 // been transformed into a cmpxchg loop in AtomicExpand.
28223 if (N->hasAnyUseOfValue(0)) {
28224 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
28225 // select LXADD if LOCK_SUB can't be selected.
28226 if (Opc == ISD::ATOMIC_LOAD_SUB) {
28227 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
28228 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
28229 RHS, AN->getMemOperand());
28230 }
28231 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
28232        "Used AtomicRMW ops other than Add should have been expanded!");
28233 return N;
28234 }
28235
28236 // Specialized lowering for the canonical form of an idempotent atomicrmw.
28237 // The core idea here is that since the memory location isn't actually
28238 // changing, all we need is a lowering for the *ordering* impacts of the
28239 // atomicrmw. As such, we can choose a different operation and memory
28240 // location to minimize impact on other code.
28241 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
28242 // On X86, the only ordering which actually requires an instruction is
28243 // seq_cst that isn't SingleThread; everything else just needs to be
28244 // preserved during codegen and then dropped. Note that we expect (but
28245 // don't assume) that orderings other than seq_cst and acq_rel have been
28246 // canonicalized to a store or load.
28247 if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
28248 AN->getSyncScopeID() == SyncScope::System) {
28249 // Prefer a locked operation against a stack location to minimize cache
28250 // traffic. This assumes that stack locations are very likely to be
28251 // accessed only by the owning thread.
28252 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
28253 assert(!N->hasAnyUseOfValue(0));
28254 // NOTE: The getUNDEF is needed to give something for the unused result 0.
28255 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
28256 DAG.getUNDEF(VT), NewChain);
28257 }
28258 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
28259 SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
28260 assert(!N->hasAnyUseOfValue(0));
28261 // NOTE: The getUNDEF is needed to give something for the unused result 0.
28262 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
28263 DAG.getUNDEF(VT), NewChain);
28264 }
28265
28266 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
28267 // RAUW the chain, but don't worry about the result, as it's unused.
28268 assert(!N->hasAnyUseOfValue(0));
28269 // NOTE: The getUNDEF is needed to give something for the unused result 0.
28270 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
28271 DAG.getUNDEF(VT), LockOp.getValue(1));
28272}
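Editorial note: the idempotent-atomicrmw path above matches source of the following shape, where only the ordering of the RMW matters; a minimal sketch assuming a plain std::atomic (hypothetical names, not from this file):

#include <atomic>

std::atomic<int> Flag{0};

void orderingOnlyFence() {
  // fetch_or with 0 never changes the value, and the result is discarded,
  // so only the seq_cst ordering effect remains. The lowering above turns
  // this into a LOCK-prefixed RMW on a stack slot (or a MEMBARRIER node in
  // the weaker cases) rather than touching Flag's cache line.
  Flag.fetch_or(0, std::memory_order_seq_cst);
}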
28273
28274static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
28275 const X86Subtarget &Subtarget) {
28276 auto *Node = cast<AtomicSDNode>(Op.getNode());
28277 SDLoc dl(Node);
28278 EVT VT = Node->getMemoryVT();
28279
28280 bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
28281 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
28282
28283 // If this store is not sequentially consistent and the type is legal,
28284 // we can just keep it.
28285 if (!IsSeqCst && IsTypeLegal)
28286 return Op;
28287
28288 if (VT == MVT::i64 && !IsTypeLegal) {
28289 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
28290 // is enabled.
28291 bool NoImplicitFloatOps =
28292 DAG.getMachineFunction().getFunction().hasFnAttribute(
28293 Attribute::NoImplicitFloat);
28294 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
28295 SDValue Chain;
28296 if (Subtarget.hasSSE1()) {
28297 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
28298 Node->getOperand(2));
28299 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
28300 SclToVec = DAG.getBitcast(StVT, SclToVec);
28301 SDVTList Tys = DAG.getVTList(MVT::Other);
28302 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
28303 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
28304 MVT::i64, Node->getMemOperand());
28305 } else if (Subtarget.hasX87()) {
28306 // First load this into an 80-bit X87 register using a stack temporary.
28307 // This will put the whole integer into the significand.
28308 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
28309 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28310 MachinePointerInfo MPI =
28311 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28312 Chain =
28313 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
28314 MPI, /*Align*/ 0, MachineMemOperand::MOStore);
28315 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
28316 SDValue LdOps[] = {Chain, StackPtr};
28317 SDValue Value =
28318 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
28319 /*Align*/ 0, MachineMemOperand::MOLoad);
28320 Chain = Value.getValue(1);
28321
28322 // Now use an FIST to do the atomic store.
28323 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
28324 Chain =
28325 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
28326 StoreOps, MVT::i64, Node->getMemOperand());
28327 }
28328
28329 if (Chain) {
28330 // If this is a sequentially consistent store, also emit an appropriate
28331 // barrier.
28332 if (IsSeqCst)
28333 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
28334
28335 return Chain;
28336 }
28337 }
28338 }
28339
28340 // Convert seq_cst store -> xchg
28341 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
28342 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
28343 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
28344 Node->getMemoryVT(),
28345 Node->getOperand(0),
28346 Node->getOperand(1), Node->getOperand(2),
28347 Node->getMemOperand());
28348 return Swap.getValue(1);
28349}
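Editorial note: the seq_cst-store-to-xchg conversion at the end of LowerATOMIC_STORE corresponds to the usual x86 mapping for sequentially consistent stores; a hedged sketch of the source pattern it covers (illustrative only, not part of this file):

#include <atomic>

std::atomic<long> G{0};

void publish(long V) {
  // A seq_cst store needs the full-barrier effect of a LOCK-ed instruction
  // on x86, so the lowering rewrites it as an atomic exchange whose result
  // is ignored (ISD::ATOMIC_SWAP, i.e. XCHG).
  G.store(V, std::memory_order_seq_cst);
}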
28350
28351static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
28352 SDNode *N = Op.getNode();
28353 MVT VT = N->getSimpleValueType(0);
28354
28355 // Let legalize expand this if it isn't a legal type yet.
28356 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
28357 return SDValue();
28358
28359 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
28360 SDLoc DL(N);
28361
28362 // Set the carry flag.
28363 SDValue Carry = Op.getOperand(2);
28364 EVT CarryVT = Carry.getValueType();
28365 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
28366 Carry, DAG.getAllOnesConstant(DL, CarryVT));
28367
28368 unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
28369 SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
28370 Op.getOperand(1), Carry.getValue(1));
28371
28372 SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
28373 if (N->getValueType(1) == MVT::i1)
28374 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
28375
28376 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
28377}
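Editorial note: ADDCARRY/SUBCARRY model multi-limb arithmetic in which the carry out of one limb feeds the next, which is exactly the ADD/ADC (or SUB/SBB) pair selected above. A compiler-agnostic sketch of that carry chain (hypothetical helper, not part of this file):

#include <cstdint>

struct U128 { uint64_t Lo, Hi; };

// Add two 128-bit values held as two 64-bit limbs. The low-limb addition
// produces a carry that the high-limb addition consumes, mirroring the
// X86ISD::ADD -> X86ISD::ADC chain emitted by LowerADDSUBCARRY.
static U128 add128(U128 A, U128 B) {
  U128 R;
  R.Lo = A.Lo + B.Lo;
  uint64_t Carry = R.Lo < A.Lo;  // unsigned wrap detects the carry out
  R.Hi = A.Hi + B.Hi + Carry;
  return R;
}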
28378
28379static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
28380 SelectionDAG &DAG) {
28381 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
28382
28383 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
28384 // which returns the values as { float, float } (in XMM0) or
28385 // { double, double } (which is returned in XMM0, XMM1).
28386 SDLoc dl(Op);
28387 SDValue Arg = Op.getOperand(0);
28388 EVT ArgVT = Arg.getValueType();
28389 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
28390
28391 TargetLowering::ArgListTy Args;
28392 TargetLowering::ArgListEntry Entry;
28393
28394 Entry.Node = Arg;
28395 Entry.Ty = ArgTy;
28396 Entry.IsSExt = false;
28397 Entry.IsZExt = false;
28398 Args.push_back(Entry);
28399
28400 bool isF64 = ArgVT == MVT::f64;
28401 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
28402 // the small struct {f32, f32} is returned in (eax, edx). For f64,
28403 // the results are returned via SRet in memory.
28404 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28405 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
28406 const char *LibcallName = TLI.getLibcallName(LC);
28407 SDValue Callee =
28408 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
28409
28410 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
28411 : (Type *)VectorType::get(ArgTy, 4);
28412
28413 TargetLowering::CallLoweringInfo CLI(DAG);
28414 CLI.setDebugLoc(dl)
28415 .setChain(DAG.getEntryNode())
28416 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
28417
28418 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
28419
28420 if (isF64)
28421 // Returned in xmm0 and xmm1.
28422 return CallResult.first;
28423
28424 // Returned in bits 0:31 and 32:63 of xmm0.
28425 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
28426 CallResult.first, DAG.getIntPtrConstant(0, dl));
28427 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
28428 CallResult.first, DAG.getIntPtrConstant(1, dl));
28429 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
28430 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
28431}
28432
28433/// Widen a vector input to a vector of NVT. The
28434/// input vector must have the same element type as NVT.
28435static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
28436 bool FillWithZeroes = false) {
28437 // Check if InOp already has the right width.
28438 MVT InVT = InOp.getSimpleValueType();
28439 if (InVT == NVT)
28440 return InOp;
28441
28442 if (InOp.isUndef())
28443 return DAG.getUNDEF(NVT);
28444
28445 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
28446        "input and widen element type must match");
28447
28448 unsigned InNumElts = InVT.getVectorNumElements();
28449 unsigned WidenNumElts = NVT.getVectorNumElements();
28450 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
28451        "Unexpected request for vector widening");
28452
28453 SDLoc dl(InOp);
28454 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
28455 InOp.getNumOperands() == 2) {
28456 SDValue N1 = InOp.getOperand(1);
28457 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
28458 N1.isUndef()) {
28459 InOp = InOp.getOperand(0);
28460 InVT = InOp.getSimpleValueType();
28461 InNumElts = InVT.getVectorNumElements();
28462 }
28463 }
28464 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
28465 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
28466 SmallVector<SDValue, 16> Ops;
28467 for (unsigned i = 0; i < InNumElts; ++i)
28468 Ops.push_back(InOp.getOperand(i));
28469
28470 EVT EltVT = InOp.getOperand(0).getValueType();
28471
28472 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
28473 DAG.getUNDEF(EltVT);
28474 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
28475 Ops.push_back(FillVal);
28476 return DAG.getBuildVector(NVT, dl, Ops);
28477 }
28478 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
28479 DAG.getUNDEF(NVT);
28480 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
28481 InOp, DAG.getIntPtrConstant(0, dl));
28482}
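Editorial note: a small illustration of what ExtendToType accomplishes, widening a short vector to a target element count and padding the tail (with zeroes here; the real code may also pad with undef). Hypothetical helper, not part of this file:

#include <cstddef>
#include <vector>

// Keep the original elements and append zeroes until the requested width
// is reached; the scalar analogue of INSERT_SUBVECTOR of InOp into a zero
// vector at index 0.
static std::vector<int> extendToWidth(const std::vector<int> &In,
                                      size_t WidenNumElts) {
  std::vector<int> Out(In);
  if (Out.size() < WidenNumElts)
    Out.resize(WidenNumElts, 0);
  return Out;
}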
28483
28484static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
28485 SelectionDAG &DAG) {
28486 assert(Subtarget.hasAVX512() &&
28487        "MGATHER/MSCATTER are supported on AVX-512 arch only");
28488
28489 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
28490 SDValue Src = N->getValue();
28491 MVT VT = Src.getSimpleValueType();
28492 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
28493 SDLoc dl(Op);
28494
28495 SDValue Scale = N->getScale();
28496 SDValue Index = N->getIndex();
28497 SDValue Mask = N->getMask();
28498 SDValue Chain = N->getChain();
28499 SDValue BasePtr = N->getBasePtr();
28500
28501 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
28502 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
28503 // If the index is v2i64 and we have VLX we can use xmm for data and index.
28504 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
28505 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28506 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
28507 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
28508 SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
28509 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
28510 SDValue NewScatter = DAG.getMemIntrinsicNode(
28511 X86ISD::MSCATTER, dl, VTs, Ops, N->getMemoryVT(), N->getMemOperand());
28512 return SDValue(NewScatter.getNode(), 1);
28513 }
28514 return SDValue();
28515 }
28516
28517 MVT IndexVT = Index.getSimpleValueType();
28518 MVT MaskVT = Mask.getSimpleValueType();
28519
28520 // If the index is v2i32, we're being called by type legalization and we
28521 // should just let the default handling take care of it.
28522 if (IndexVT == MVT::v2i32)
28523 return SDValue();
28524
28525 // If we don't have VLX and neither the passthru nor the index is 512 bits,
28526 // we need to widen until one is.
28527 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
28528 !Index.getSimpleValueType().is512BitVector()) {
28529 // Determine how much we need to widen by to get a 512-bit type.
28530 unsigned Factor = std::min(512/VT.getSizeInBits(),
28531 512/IndexVT.getSizeInBits());
28532 unsigned NumElts = VT.getVectorNumElements() * Factor;
28533
28534 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
28535 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
28536 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
28537
28538 Src = ExtendToType(Src, VT, DAG);
28539 Index = ExtendToType(Index, IndexVT, DAG);
28540 Mask = ExtendToType(Mask, MaskVT, DAG, true);
28541 }
28542
28543 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
28544 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
28545 SDValue NewScatter = DAG.getMemIntrinsicNode(
28546 X86ISD::MSCATTER, dl, VTs, Ops, N->getMemoryVT(), N->getMemOperand());
28547 return SDValue(NewScatter.getNode(), 1);
28548}
28549
28550static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
28551 SelectionDAG &DAG) {
28552
28553 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
28554 MVT VT = Op.getSimpleValueType();
28555 MVT ScalarVT = VT.getScalarType();
28556 SDValue Mask = N->getMask();
28557 MVT MaskVT = Mask.getSimpleValueType();
28558 SDValue PassThru = N->getPassThru();
28559 SDLoc dl(Op);
28560
28561 // Handle AVX masked loads which don't support passthru other than 0.
28562 if (MaskVT.getVectorElementType() != MVT::i1) {
28563 // We also allow undef in the isel pattern.
28564 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
28565 return Op;
28566
28567 SDValue NewLoad = DAG.getMaskedLoad(
28568 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
28569 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
28570 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
28571 N->isExpandingLoad());
28572 // Emit a blend.
28573 SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad,
28574 PassThru);
28575 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
28576 }
28577
28578 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
28579        "Expanding masked load is supported on AVX-512 target only!");
28580
28581 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
28582        "Expanding masked load is supported for 32 and 64-bit types only!");
28583
28584 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
28585        "Cannot lower masked load op.");
28586
28587 assert((ScalarVT.getSizeInBits() >= 32 ||
28588         (Subtarget.hasBWI() &&
28589          (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
28590        "Unsupported masked load op.");
28591
28592 // This operation is legal for targets with VLX, but without
28593 // VLX the vector should be widened to 512 bits.
28594 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
28595 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
28596 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
28597
28598 // Mask element has to be i1.
28599 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
28600        "Unexpected mask type");
28601
28602 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
28603
28604 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
28605 SDValue NewLoad = DAG.getMaskedLoad(
28606 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
28607 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
28608 N->getExtensionType(), N->isExpandingLoad());
28609
28610 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
28611                               NewLoad.getValue(0),
28612                               DAG.getIntPtrConstant(0, dl));
28613 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
28614 return DAG.getMergeValues(RetOps, dl);
28615}
28616
28617static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
28618 SelectionDAG &DAG) {
28619 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
28620 SDValue DataToStore = N->getValue();
28621 MVT VT = DataToStore.getSimpleValueType();
28622 MVT ScalarVT = VT.getScalarType();
28623 SDValue Mask = N->getMask();
28624 SDLoc dl(Op);
28625
28626 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
28627        "Expanding masked load is supported on AVX-512 target only!");
28628
28629 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
28630        "Expanding masked load is supported for 32 and 64-bit types only!");
28631
28632 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
28633        "Cannot lower masked store op.");
28634
28635 assert((ScalarVT.getSizeInBits() >= 32 ||
28636         (Subtarget.hasBWI() &&
28637          (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
28638        "Unsupported masked store op.");
28639
28640 // This operation is legal for targets with VLX, but without
28641 // VLX the vector should be widened to 512 bits.
28642 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
28643 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
28644
28645 // Mask element has to be i1.
28646 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
28647        "Unexpected mask type");
28648
28649 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
28650
28651 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
28652 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
28653 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
28654 N->getOffset(), Mask, N->getMemoryVT(),
28655 N->getMemOperand(), N->getAddressingMode(),
28656 N->isTruncatingStore(), N->isCompressingStore());
28657}
28658
28659static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
28660 SelectionDAG &DAG) {
28661 assert(Subtarget.hasAVX2() &&
28662        "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
28663
28664 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
28665 SDLoc dl(Op);
28666 MVT VT = Op.getSimpleValueType();
28667 SDValue Index = N->getIndex();
28668 SDValue Mask = N->getMask();
28669 SDValue PassThru = N->getPassThru();
28670 MVT IndexVT = Index.getSimpleValueType();
28671 MVT MaskVT = Mask.getSimpleValueType();
28672
28673 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
28674
28675 // If the index is v2i32, we're being called by type legalization.
28676 if (IndexVT == MVT::v2i32)
28677 return SDValue();
28678
28679 // If we don't have VLX and neither the passthru nor the index is 512 bits,
28680 // we need to widen until one is.
28681 MVT OrigVT = VT;
28682 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
28683 !IndexVT.is512BitVector()) {
28684 // Determine how much we need to widen by to get a 512-bit type.
28685 unsigned Factor = std::min(512/VT.getSizeInBits(),
28686 512/IndexVT.getSizeInBits());
28687
28688 unsigned NumElts = VT.getVectorNumElements() * Factor;
28689
28690 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
28691 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
28692 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
28693
28694 PassThru = ExtendToType(PassThru, VT, DAG);
28695 Index = ExtendToType(Index, IndexVT, DAG);
28696 Mask = ExtendToType(Mask, MaskVT, DAG, true);
28697 }
28698
28699 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
28700 N->getScale() };
28701 SDValue NewGather = DAG.getMemIntrinsicNode(
28702 X86ISD::MGATHER, dl, DAG.getVTList(VT, MaskVT, MVT::Other), Ops,
28703 N->getMemoryVT(), N->getMemOperand());
28704 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
28705 NewGather, DAG.getIntPtrConstant(0, dl));
28706 return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
28707}
28708
28709static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
28710 SDLoc dl(Op);
28711 SDValue Src = Op.getOperand(0);
28712 MVT DstVT = Op.getSimpleValueType();
28713
28714 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
28715 unsigned SrcAS = N->getSrcAddressSpace();
28716
28717 assert(SrcAS != N->getDestAddressSpace() &&
28718        "addrspacecast must be between different address spaces");
28719
28720 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
28721 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
28722 } else if (DstVT == MVT::i64) {
28723 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
28724 } else if (DstVT == MVT::i32) {
28725 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
28726 } else {
28727 report_fatal_error("Bad address space in addrspacecast");
28728 }
28729 return Op;
28730}
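Editorial note: the three branches of LowerADDRSPACECAST behave like plain integer conversions on the pointer bits; a hedged sketch with hypothetical helpers, not from this file:

#include <cstdint>

// ptr32_uptr -> 64-bit pointers zero-extend, other 32-bit address spaces
// sign-extend, and 64 -> 32 casts truncate, matching the ZERO_EXTEND /
// SIGN_EXTEND / TRUNCATE nodes above.
static uint64_t uptr32To64(uint32_t P) { return static_cast<uint64_t>(P); }
static int64_t  sptr32To64(int32_t P)  { return static_cast<int64_t>(P); }
static uint32_t ptr64To32(uint64_t P)  { return static_cast<uint32_t>(P); }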
28731
28732SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
28733 SelectionDAG &DAG) const {
28734 // TODO: Eventually, the lowering of these nodes should be informed by or
28735 // deferred to the GC strategy for the function in which they appear. For
28736 // now, however, they must be lowered to something. Since they are logically
28737 // no-ops in the case of a null GC strategy (or a GC strategy which does not
28738 // require special handling for these nodes), lower them as literal NOOPs for
28739 // the time being.
28740 SmallVector<SDValue, 2> Ops;
28741
28742 Ops.push_back(Op.getOperand(0));
28743 if (Op->getGluedNode())
28744 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
28745
28746 SDLoc OpDL(Op);
28747 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
28748 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
28749
28750 return NOOP;
28751}
28752
28753SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
28754 RTLIB::Libcall Call) const {
28755
28756 bool IsStrict = Op->isStrictFPOpcode();
28757 unsigned Offset = IsStrict ? 1 : 0;
28758 SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
28759
28760 SDLoc dl(Op);
28761 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
28762 MakeLibCallOptions CallOptions;
28763 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, Call, MVT::f128, Ops,
28764 CallOptions, dl, Chain);
28765
28766 if (IsStrict)
28767 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
28768
28769 return Tmp.first;
28770}
28771
28772// Custom split CVTPS2PH with wide types.
28773static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
28774 SDLoc dl(Op);
28775 EVT VT = Op.getValueType();
28776 SDValue Lo, Hi;
28777 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
28778 EVT LoVT, HiVT;
28779 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
28780 SDValue RC = Op.getOperand(1);
28781 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
28782 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
28783 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28784}
28785
28786/// Provide custom lowering hooks for some operations.
28787SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
28788 switch (Op.getOpcode()) {
28789 default: llvm_unreachable("Should not custom lower this!");
28790 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
28791 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
28792 return LowerCMP_SWAP(Op, Subtarget, DAG);
28793 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
28794 case ISD::ATOMIC_LOAD_ADD:
28795 case ISD::ATOMIC_LOAD_SUB:
28796 case ISD::ATOMIC_LOAD_OR:
28797 case ISD::ATOMIC_LOAD_XOR:
28798 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
28799 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
28800 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
28801 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
28802 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
28803 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
28804 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
28805 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
28806 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
28807 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
28808 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
28809 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
28810 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
28811 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
28812 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
28813 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
28814 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
28815 case ISD::SHL_PARTS:
28816 case ISD::SRA_PARTS:
28817 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
28818 case ISD::FSHL:
28819 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
28820 case ISD::STRICT_SINT_TO_FP:
28821 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
28822 case ISD::STRICT_UINT_TO_FP:
28823 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
28824 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
28825 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
28826 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
28827 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
28828 case ISD::ZERO_EXTEND_VECTOR_INREG:
28829 case ISD::SIGN_EXTEND_VECTOR_INREG:
28830 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
28831 case ISD::FP_TO_SINT:
28832 case ISD::STRICT_FP_TO_SINT:
28833 case ISD::FP_TO_UINT:
28834 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
28835 case ISD::FP_EXTEND:
28836 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
28837 case ISD::FP_ROUND:
28838 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
28839 case ISD::FP16_TO_FP:
28840 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
28841 case ISD::FP_TO_FP16:
28842 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
28843 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
28844 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
28845 case ISD::FADD:
28846 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
28847 case ISD::FROUND: return LowerFROUND(Op, DAG);
28848 case ISD::FABS:
28849 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
28850 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
28851 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
28852 case ISD::LRINT:
28853 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
28854 case ISD::SETCC:
28855 case ISD::STRICT_FSETCC:
28856 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
28857 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
28858 case ISD::SELECT: return LowerSELECT(Op, DAG);
28859 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
28860 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
28861 case ISD::VASTART: return LowerVASTART(Op, DAG);
28862 case ISD::VAARG: return LowerVAARG(Op, DAG);
28863 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
28864 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
28865 case ISD::INTRINSIC_VOID:
28866 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
28867 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
28868 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
28869 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
28870 case ISD::FRAME_TO_ARGS_OFFSET:
28871 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
28872 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
28873 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
28874 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
28875 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
28876 case ISD::EH_SJLJ_SETUP_DISPATCH:
28877 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
28878 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
28879 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
28880 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
28881 case ISD::CTLZ:
28882 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
28883 case ISD::CTTZ:
28884 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
28885 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
28886 case ISD::MULHS:
28887 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
28888 case ISD::ROTL:
28889 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
28890 case ISD::SRA:
28891 case ISD::SRL:
28892 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
28893 case ISD::SADDO:
28894 case ISD::UADDO:
28895 case ISD::SSUBO:
28896 case ISD::USUBO:
28897 case ISD::SMULO:
28898 case ISD::UMULO: return LowerXALUO(Op, DAG);
28899 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
28900 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
28901 case ISD::ADDCARRY:
28902 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
28903 case ISD::ADD:
28904 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
28905 case ISD::UADDSAT:
28906 case ISD::SADDSAT:
28907 case ISD::USUBSAT:
28908 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
28909 case ISD::SMAX:
28910 case ISD::SMIN:
28911 case ISD::UMAX:
28912 case ISD::UMIN: return LowerMINMAX(Op, DAG);
28913 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
28914 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
28915 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
28916 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
28917 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
28918 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
28919 case ISD::GC_TRANSITION_START:
28920 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
28921 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
28922 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
28923 }
28924}
28925
28926/// Places new result values for the node in Results (their number
28927/// and types must exactly match those of the original return values of
28928/// the node), or leaves Results empty, which indicates that the node is not
28929/// to be custom lowered after all.
28930void X86TargetLowering::LowerOperationWrapper(SDNode *N,
28931 SmallVectorImpl<SDValue> &Results,
28932 SelectionDAG &DAG) const {
28933 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
28934
28935 if (!Res.getNode())
28936 return;
28937
28938 // If the original node has one result, take the return value from
28939 // LowerOperation as is. It might not be result number 0.
28940 if (N->getNumValues() == 1) {
28941 Results.push_back(Res);
28942 return;
28943 }
28944
28945 // If the original node has multiple results, then the return node should
28946 // have the same number of results.
28947 assert((N->getNumValues() == Res->getNumValues()) &&
28948        "Lowering returned the wrong number of results!");
28949
28950 // Place the new result values based on N's result numbers.
28951 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
28952 Results.push_back(Res.getValue(I));
28953}
28954
28955/// Replace a node with an illegal result type with a new node built out of
28956/// custom code.
28957void X86TargetLowering::ReplaceNodeResults(SDNode *N,
28958 SmallVectorImpl<SDValue>&Results,
28959 SelectionDAG &DAG) const {
28960 SDLoc dl(N);
28961 switch (N->getOpcode()) {
28962 default:
28963#ifndef NDEBUG
28964 dbgs() << "ReplaceNodeResults: ";
28965 N->dump(&DAG);
28966#endif
28967 llvm_unreachable("Do not know how to custom type legalize this operation!");
28968 case X86ISD::CVTPH2PS: {
28969 EVT VT = N->getValueType(0);
28970 SDValue Lo, Hi;
28971 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
28972 EVT LoVT, HiVT;
28973 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
28974 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
28975 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
28976 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28977 Results.push_back(Res);
28978 return;
28979 }
28980 case ISD::CTPOP: {
28981 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
28982 // Use a v2i64 if possible.
28983 bool NoImplicitFloatOps =
28984 DAG.getMachineFunction().getFunction().hasFnAttribute(
28985 Attribute::NoImplicitFloat);
28986 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
28987 SDValue Wide =
28988 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
28989 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
28990 // Bit count should fit in 32-bits, extract it as that and then zero
28991 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
28992 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
28993 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
28994 DAG.getIntPtrConstant(0, dl));
28995 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
28996 Results.push_back(Wide);
28997 }
28998 return;
28999 }
29000 case ISD::MUL: {
29001 EVT VT = N->getValueType(0);
29002 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
29003        VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
29004 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
29005 // elements are needed.
29006 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29007 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
29008 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
29009 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
29010 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
29011 unsigned NumConcats = 16 / VT.getVectorNumElements();
29012 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
29013 ConcatOps[0] = Res;
29014 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
29015 Results.push_back(Res);
29016 return;
29017 }
29018 case X86ISD::VPMADDWD:
29019 case X86ISD::AVG: {
29020 // Legalize types for X86ISD::AVG/VPMADDWD by widening.
29021 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
29022
29023 EVT VT = N->getValueType(0);
29024 EVT InVT = N->getOperand(0).getValueType();
29025 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
29026        "Expected a VT that divides into 128 bits.");
29027 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
29028 "Unexpected type action!");
29029 unsigned NumConcat = 128 / InVT.getSizeInBits();
29030
29031 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
29032 InVT.getVectorElementType(),
29033 NumConcat * InVT.getVectorNumElements());
29034 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
29035 VT.getVectorElementType(),
29036 NumConcat * VT.getVectorNumElements());
29037
29038 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
29039 Ops[0] = N->getOperand(0);
29040 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
29041 Ops[0] = N->getOperand(1);
29042 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
29043
29044 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
29045 Results.push_back(Res);
29046 return;
29047 }
29048 case ISD::ABS: {
29049 assert(N->getValueType(0) == MVT::i64 &&
29050 "Unexpected type (!= i64) on ABS.");
29051 MVT HalfT = MVT::i32;
29052 SDValue Lo, Hi, Tmp;
29053 SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
29054
29055 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
29056 DAG.getConstant(0, dl, HalfT));
29057 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
29058 DAG.getConstant(1, dl, HalfT));
29059 Tmp = DAG.getNode(
29060 ISD::SRA, dl, HalfT, Hi,
29061 DAG.getShiftAmountConstant(HalfT.getSizeInBits() - 1, HalfT, dl));
29062 Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
29063 Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
29064 SDValue(Lo.getNode(), 1));
29065 Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
29066 Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
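// Tmp is the 64-bit sign mask replicated into each 32-bit half, so the
// UADDO/ADDCARRY/XOR sequence computes the branchless abs(x) = (x + s) ^ s
// with s = x >> 63; e.g. x = -5: s = -1, x + s = -6, (-6) ^ -1 = 5.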
29067 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi));
29068 return;
29069 }
29070 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
29071 case X86ISD::FMINC:
29072 case X86ISD::FMIN:
29073 case X86ISD::FMAXC:
29074 case X86ISD::FMAX: {
29075 EVT VT = N->getValueType(0);
29076 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
29077 SDValue UNDEF = DAG.getUNDEF(VT);
29078 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
29079 N->getOperand(0), UNDEF);
29080 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
29081 N->getOperand(1), UNDEF);
29082 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
29083 return;
29084 }
29085 case ISD::SDIV:
29086 case ISD::UDIV:
29087 case ISD::SREM:
29088 case ISD::UREM: {
29089 EVT VT = N->getValueType(0);
29090 if (VT.isVector()) {
29091 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
29092 "Unexpected type action!");
29093 // If this RHS is a constant splat vector we can widen this and let
29094 // division/remainder by constant optimize it.
29095 // TODO: Can we do something for non-splat?
29096 APInt SplatVal;
29097 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
29098 unsigned NumConcats = 128 / VT.getSizeInBits();
29099 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
29100 Ops0[0] = N->getOperand(0);
29101 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
29102 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
29103 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
29104 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
29105 Results.push_back(Res);
29106 }
29107 return;
29108 }
29109
29110 LLVM_FALLTHROUGH;
29111 }
29112 case ISD::SDIVREM:
29113 case ISD::UDIVREM: {
29114 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
29115 Results.push_back(V);
29116 return;
29117 }
29118 case ISD::TRUNCATE: {
29119 MVT VT = N->getSimpleValueType(0);
29120 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
29121 return;
29122
29123 // The generic legalizer will try to widen the input type to the same
29124 // number of elements as the widened result type. But this isn't always
29125 // the best thing so do some custom legalization to avoid some cases.
29126 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
29127 SDValue In = N->getOperand(0);
29128 EVT InVT = In.getValueType();
29129
29130 unsigned InBits = InVT.getSizeInBits();
29131 if (128 % InBits == 0) {
29132 // 128 bit and smaller inputs should avoid truncate altogether and
29133 // just use a build_vector that will become a shuffle.
29134 // TODO: Widen and use a shuffle directly?
29135 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
29136 EVT EltVT = VT.getVectorElementType();
29137 unsigned WidenNumElts = WidenVT.getVectorNumElements();
29138 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
29139 // Use the original element count so we don't do more scalar opts than
29140 // necessary.
29141 unsigned MinElts = VT.getVectorNumElements();
29142 for (unsigned i=0; i < MinElts; ++i) {
29143 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
29144 DAG.getIntPtrConstant(i, dl));
29145 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
29146 }
29147 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
29148 return;
29149 }
29150 // With AVX512 there are some cases that can use a target specific
29151 // truncate node to go from 256/512 to less than 128 with zeros in the
29152 // upper elements of the 128 bit result.
29153 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
29154 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
29155 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
29156 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
29157 return;
29158 }
29159 // There's one case we can widen to 512 bits and use VTRUNC.
29160 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
29161 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
29162 DAG.getUNDEF(MVT::v4i64));
29163 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
29164 return;
29165 }
29166 }
29167 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
29168 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
29169 isTypeLegal(MVT::v4i64)) {
29170 // Input needs to be split and output needs to be widened. Let's use two
29171 // VTRUNCs, and shuffle their results together into the wider type.
29172 SDValue Lo, Hi;
29173 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
29174
29175 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
29176 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
29177 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
29178 { 0, 1, 2, 3, 16, 17, 18, 19,
29179 -1, -1, -1, -1, -1, -1, -1, -1 });
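// Each VTRUNC leaves its four truncated bytes in the low lanes of a v16i8,
// so the shuffle gathers lanes 0-3 of both halves into the low 8 bytes and
// leaves the upper lanes of the widened result undef.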
29180 Results.push_back(Res);
29181 return;
29182 }
29183
29184 return;
29185 }
29186 case ISD::ANY_EXTEND:
29187 // Right now, only MVT::v8i8 has Custom action for an illegal type.
29188 // It's intended to custom handle the input type.
29189 assert(N->getValueType(0) == MVT::v8i8 &&
29190 "Do not know how to legalize this Node");
29191 return;
29192 case ISD::SIGN_EXTEND:
29193 case ISD::ZERO_EXTEND: {
29194 EVT VT = N->getValueType(0);
29195 SDValue In = N->getOperand(0);
29196 EVT InVT = In.getValueType();
29197 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
29198 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
29199 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
29200 "Unexpected type action!");
29201 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
29202 // Custom split this so we can extend i8/i16->i32 invec. This is better
29203 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
29204 // sra, then extending from i32 to i64 using pcmpgt. By custom splitting
29205 // we allow the sra from the extend to i32 to be shared by the split.
29206 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
29207
29208 // Fill a vector with sign bits for each element.
29209 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
29210 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
29211
29212 // Create an unpackl and unpackh to interleave the sign bits then bitcast
29213 // to v2i64.
29214 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
29215 {0, 4, 1, 5});
29216 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
29217 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
29218 {2, 6, 3, 7});
29219 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
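// setcc(0 > x) yields an all-ones lane exactly when x is negative, so each
// interleaved {value, signbits} pair, viewed as i64, is the sign-extended
// element (low 32 bits = value, high 32 bits = copies of the sign bit).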
29220
29221 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29222 Results.push_back(Res);
29223 return;
29224 }
29225
29226 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
29227 if (!InVT.is128BitVector()) {
29228 // Not a 128 bit vector, but maybe type legalization will promote
29229 // it to 128 bits.
29230 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
29231 return;
29232 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
29233 if (!InVT.is128BitVector())
29234 return;
29235
29236 // Promote the input to 128 bits. Type legalization will turn this into
29237 // zext_inreg/sext_inreg.
29238 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
29239 }
29240
29241 // Perform custom splitting instead of the two stage extend we would get
29242 // by default.
29243 EVT LoVT, HiVT;
29244 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
29245 assert(isTypeLegal(LoVT) && "Split VT not legal?");
29246
29247 SDValue Lo = getExtendInVec(N->getOpcode(), dl, LoVT, In, DAG);
29248
29249 // We need to shift the input over by half the number of elements.
29250 unsigned NumElts = InVT.getVectorNumElements();
29251 unsigned HalfNumElts = NumElts / 2;
29252 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
29253 for (unsigned i = 0; i != HalfNumElts; ++i)
29254 ShufMask[i] = i + HalfNumElts;
29255
29256 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
29257 Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG);
29258
29259 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29260 Results.push_back(Res);
29261 }
29262 return;
29263 }
29264 case ISD::FP_TO_SINT:
29265 case ISD::STRICT_FP_TO_SINT:
29266 case ISD::FP_TO_UINT:
29267 case ISD::STRICT_FP_TO_UINT: {
29268 bool IsStrict = N->isStrictFPOpcode();
29269 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
29270 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
29271 EVT VT = N->getValueType(0);
29272 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
29273 EVT SrcVT = Src.getValueType();
29274
29275 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
29276 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
29277 "Unexpected type action!");
29278
29279 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
29280 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
29281 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
29282 VT.getVectorNumElements());
29283 SDValue Res;
29284 SDValue Chain;
29285 if (IsStrict) {
29286 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
29287 {N->getOperand(0), Src});
29288 Chain = Res.getValue(1);
29289 } else
29290 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
29291
29292 // Preserve what we know about the size of the original result. Except
29293 // when the result is v2i32 since we can't widen the assert.
29294 if (PromoteVT != MVT::v2i32)
29295 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext,
29296 dl, PromoteVT, Res,
29297 DAG.getValueType(VT.getVectorElementType()));
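// AssertZext/AssertSext only annotate that the bits above the original
// element width are known zero/sign copies of the narrow result, so later
// combines can drop redundant masking or extension.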
29298
29299 // Truncate back to the original width.
29300 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
29301
29302 // Now widen to 128 bits.
29303 unsigned NumConcats = 128 / VT.getSizeInBits();
29304 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
29305 VT.getVectorNumElements() * NumConcats);
29306 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
29307 ConcatOps[0] = Res;
29308 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
29309 Results.push_back(Res);
29310 if (IsStrict)
29311 Results.push_back(Chain);
29312 return;
29313 }
29314
29315
29316 if (VT == MVT::v2i32) {
29317 assert((IsSigned || Subtarget.hasAVX512()) &&
29318 "Can only handle signed conversion without AVX512");
29319 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
29320 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
29321 "Unexpected type action!");
29322 if (Src.getValueType() == MVT::v2f64) {
29323 unsigned Opc;
29324 if (IsStrict)
29325 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
29326 else
29327 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
29328
29329 // If we have VLX we can emit a target specific FP_TO_UINT node.
29330 if (!IsSigned && !Subtarget.hasVLX()) {
29331 // Otherwise we can defer to the generic legalizer which will widen
29332 // the input as well. This will be further widened during op
29333 // legalization to v8i32<-v8f64.
29334 // For strict nodes we'll need to widen ourselves.
29335 // FIXME: Fix the type legalizer to safely widen strict nodes?
29336 if (!IsStrict)
29337 return;
29338 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
29339 DAG.getConstantFP(0.0, dl, MVT::v2f64));
29340 Opc = N->getOpcode();
29341 }
29342 SDValue Res;
29343 SDValue Chain;
29344 if (IsStrict) {
29345 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
29346 {N->getOperand(0), Src});
29347 Chain = Res.getValue(1);
29348 } else {
29349 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
29350 }
29351 Results.push_back(Res);
29352 if (IsStrict)
29353 Results.push_back(Chain);
29354 return;
29355 }
29356
29357 // Custom widen strict v2f32->v2i32 by padding with zeros.
29358 // FIXME: Should generic type legalizer do this?
29359 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
29360 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
29361 DAG.getConstantFP(0.0, dl, MVT::v2f32));
29362 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
29363 {N->getOperand(0), Src});
29364 Results.push_back(Res);
29365 Results.push_back(Res.getValue(1));
29366 return;
29367 }
29368
29369 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
29370 // so early out here.
29371 return;
29372 }
29373
29374 assert(!VT.isVector() && "Vectors should have been handled above!");
29375
29376 if (Subtarget.hasDQI() && VT == MVT::i64 &&
29377 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
29378 assert(!Subtarget.is64Bit() && "i64 should be legal");
29379 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
29380 // If we use a 128-bit result we might need to use a target specific node.
29381 unsigned SrcElts =
29382 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
29383 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
29384 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
29385 unsigned Opc = N->getOpcode();
29386 if (NumElts != SrcElts) {
29387 if (IsStrict)
29388 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
29389 else
29390 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
29391 }
29392
29393 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
29394 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
29395 DAG.getConstantFP(0.0, dl, VecInVT), Src,
29396 ZeroIdx);
29397 SDValue Chain;
29398 if (IsStrict) {
29399 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
29400 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
29401 Chain = Res.getValue(1);
29402 } else
29403 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
29404 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
29405 Results.push_back(Res);
29406 if (IsStrict)
29407 Results.push_back(Chain);
29408 return;
29409 }
29410
29411 SDValue Chain;
29412 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
29413 Results.push_back(V);
29414 if (IsStrict)
29415 Results.push_back(Chain);
29416 }
29417 return;
29418 }
29419 case ISD::LRINT:
29420 case ISD::LLRINT: {
29421 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
29422 Results.push_back(V);
29423 return;
29424 }
29425
29426 case ISD::SINT_TO_FP:
29427 case ISD::STRICT_SINT_TO_FP:
29428 case ISD::UINT_TO_FP:
29429 case ISD::STRICT_UINT_TO_FP: {
29430 bool IsStrict = N->isStrictFPOpcode();
29431 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
29432 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
29433 EVT VT = N->getValueType(0);
29434 if (VT != MVT::v2f32)
29435 return;
29436 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
29437 EVT SrcVT = Src.getValueType();
29438 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
29439 if (IsStrict) {
29440 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
29441 : X86ISD::STRICT_CVTUI2P;
29442 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
29443 {N->getOperand(0), Src});
29444 Results.push_back(Res);
29445 Results.push_back(Res.getValue(1));
29446 } else {
29447 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
29448 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
29449 }
29450 return;
29451 }
29452 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
29453 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
29454 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
29455 SDValue One = DAG.getConstant(1, dl, SrcVT);
29456 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
29457 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
29458 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
29459 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
29460 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
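// For elements with the top bit set a plain signed convert would be wrong,
// so those lanes instead convert (x >> 1) | (x & 1) and get doubled below;
// OR-ing the low bit back in keeps round-to-nearest-even correct.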
29461 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
29462 for (int i = 0; i != 2; ++i) {
29463 SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
29464 SignSrc, DAG.getIntPtrConstant(i, dl));
29465 if (IsStrict)
29466 SignCvts[i] =
29467 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
29468 {N->getOperand(0), Src});
29469 else
29470 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Src);
29471 };
29472 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
29473 SDValue Slow, Chain;
29474 if (IsStrict) {
29475 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
29476 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
29477 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
29478 {Chain, SignCvt, SignCvt});
29479 Chain = Slow.getValue(1);
29480 } else {
29481 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
29482 }
29483 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
29484 IsNeg =
29485 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
29486 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
29487 Results.push_back(Cvt);
29488 if (IsStrict)
29489 Results.push_back(Chain);
29490 return;
29491 }
29492
29493 if (SrcVT != MVT::v2i32)
29494 return;
29495
29496 if (IsSigned || Subtarget.hasAVX512()) {
29497 if (!IsStrict)
29498 return;
29499
29500 // Custom widen strict v2i32->v2f32 to avoid scalarization.
29501 // FIXME: Should generic type legalizer do this?
29502 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
29503 DAG.getConstant(0, dl, MVT::v2i32));
29504 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
29505 {N->getOperand(0), Src});
29506 Results.push_back(Res);
29507 Results.push_back(Res.getValue(1));
29508 return;
29509 }
29510
29511 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
29512 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
29513 SDValue VBias =
29514 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
29515 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
29516 DAG.getBitcast(MVT::v2i64, VBias));
29517 Or = DAG.getBitcast(MVT::v2f64, Or);
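// 0x4330000000000000 is the double 2^52, so OR-ing a 32-bit value into its
// low mantissa bits yields exactly 2^52 + x; the FSUB of VBias below then
// recovers x as an exact double before rounding to f32.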
29518 if (IsStrict) {
29519 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
29520 {N->getOperand(0), Or, VBias});
29521 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
29522 {MVT::v4f32, MVT::Other},
29523 {Sub.getValue(1), Sub});
29524 Results.push_back(Res);
29525 Results.push_back(Res.getValue(1));
29526 } else {
29527 // TODO: Are there any fast-math-flags to propagate here?
29528 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
29529 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
29530 }
29531 return;
29532 }
29533 case ISD::STRICT_FP_ROUND:
29534 case ISD::FP_ROUND: {
29535 bool IsStrict = N->isStrictFPOpcode();
29536 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
29537 if (!isTypeLegal(Src.getValueType()))
29538 return;
29539 SDValue V;
29540 if (IsStrict)
29541 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
29542 {N->getOperand(0), N->getOperand(1)});
29543 else
29544 V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
29545 Results.push_back(V);
29546 if (IsStrict)
29547 Results.push_back(V.getValue(1));
29548 return;
29549 }
29550 case ISD::FP_EXTEND: {
29551 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
29552 // No other ValueType for FP_EXTEND should reach this point.
29553 assert(N->getValueType(0) == MVT::v2f32 &&
29554 "Do not know how to legalize this Node");
29555 return;
29556 }
29557 case ISD::INTRINSIC_W_CHAIN: {
29558 unsigned IntNo = N->getConstantOperandVal(1);
29559 switch (IntNo) {
29560 default : llvm_unreachable("Do not know how to custom type "
29561 "legalize this intrinsic operation!");
29562 case Intrinsic::x86_rdtsc:
29563 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
29564 Results);
29565 case Intrinsic::x86_rdtscp:
29566 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
29567 Results);
29568 case Intrinsic::x86_rdpmc:
29569 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
29570 Results);
29571 return;
29572 case Intrinsic::x86_xgetbv:
29573 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
29574 Results);
29575 return;
29576 }
29577 }
29578 case ISD::READCYCLECOUNTER: {
29579 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
29580 }
29581 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
29582 EVT T = N->getValueType(0);
29583 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
29584 bool Regs64bit = T == MVT::i128;
29585 assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
29586 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
29587 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
29588 SDValue cpInL, cpInH;
29589 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
29590 DAG.getConstant(0, dl, HalfT));
29591 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
29592 DAG.getConstant(1, dl, HalfT));
29593 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
29594 Regs64bit ? X86::RAX : X86::EAX,
29595 cpInL, SDValue());
29596 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
29597 Regs64bit ? X86::RDX : X86::EDX,
29598 cpInH, cpInL.getValue(1));
29599 SDValue swapInL, swapInH;
29600 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
29601 DAG.getConstant(0, dl, HalfT));
29602 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
29603 DAG.getConstant(1, dl, HalfT));
29604 swapInH =
29605 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
29606 swapInH, cpInH.getValue(1));
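// CMPXCHG8B/16B expects the compare value in EDX:EAX (RDX:RAX) and the new
// value in ECX:EBX (RCX:RBX); the old memory value comes back in EDX:EAX
// (RDX:RAX) and ZF reports success, which is read via the EFLAGS copy below.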
29607 // If the current function needs the base pointer, RBX,
29608 // we shouldn't use cmpxchg directly.
29609 // Indeed the lowering of that instruction will clobber
29610 // that register and since RBX will be a reserved register
29611 // the register allocator will not make sure its value will
29612 // be properly saved and restored around this live-range.
29613 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
29614 SDValue Result;
29615 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
29616 Register BasePtr = TRI->getBaseRegister();
29617 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
29618 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
29619 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
29620 // ISel prefers the LCMPXCHG64 variant.
29621 // If that assert breaks, that means it is not the case anymore,
29622 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
29623 // not just EBX. This is a matter of accepting i64 input for that
29624 // pseudo, and restoring into the register of the right width
29625 // in expand pseudo. Everything else should just work.
29626 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
29627 "Saving only half of the RBX");
29628 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
29629 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
29630 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
29631 Regs64bit ? X86::RBX : X86::EBX,
29632 HalfT, swapInH.getValue(1));
29633 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
29634 RBXSave,
29635 /*Glue*/ RBXSave.getValue(2)};
29636 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
29637 } else {
29638 unsigned Opcode =
29639 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
29640 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
29641 Regs64bit ? X86::RBX : X86::EBX, swapInL,
29642 swapInH.getValue(1));
29643 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
29644 swapInL.getValue(1)};
29645 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
29646 }
29647 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
29648 Regs64bit ? X86::RAX : X86::EAX,
29649 HalfT, Result.getValue(1));
29650 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
29651 Regs64bit ? X86::RDX : X86::EDX,
29652 HalfT, cpOutL.getValue(2));
29653 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
29654
29655 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
29656 MVT::i32, cpOutH.getValue(2));
29657 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
29658 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
29659
29660 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
29661 Results.push_back(Success);
29662 Results.push_back(EFLAGS.getValue(1));
29663 return;
29664 }
29665 case ISD::ATOMIC_LOAD: {
29666 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
29667 bool NoImplicitFloatOps =
29668 DAG.getMachineFunction().getFunction().hasFnAttribute(
29669 Attribute::NoImplicitFloat);
29670 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
29671 auto *Node = cast<AtomicSDNode>(N);
29672 if (Subtarget.hasSSE1()) {
29673 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
29674 // Then extract the lower 64-bits.
29675 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
29676 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
29677 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
29678 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
29679 MVT::i64, Node->getMemOperand());
29680 if (Subtarget.hasSSE2()) {
29681 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
29682 DAG.getIntPtrConstant(0, dl));
29683 Results.push_back(Res);
29684 Results.push_back(Ld.getValue(1));
29685 return;
29686 }
29687 // We use an alternative sequence for SSE1 that extracts as v2f32 and
29688 // then casts to i64. This avoids a 128-bit stack temporary being
29689 // created by type legalization if we were to cast v4f32->v2i64.
29690 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
29691 DAG.getIntPtrConstant(0, dl));
29692 Res = DAG.getBitcast(MVT::i64, Res);
29693 Results.push_back(Res);
29694 Results.push_back(Ld.getValue(1));
29695 return;
29696 }
29697 if (Subtarget.hasX87()) {
29698 // First load this into an 80-bit X87 register. This will put the whole
29699 // integer into the significand.
29700 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
29701 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
29702 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
29703 dl, Tys, Ops, MVT::i64,
29704 Node->getMemOperand());
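// x87's f80 format has a 64-bit significand, so FILD can hold any i64
// exactly, and the single 64-bit load keeps the access atomic.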
29705 SDValue Chain = Result.getValue(1);
29706
29707 // Now store the X87 register to a stack temporary and convert to i64.
29708 // This store is not atomic and doesn't need to be.
29709 // FIXME: We don't need a stack temporary if the result of the load
29710 // is already being stored. We could just directly store there.
29711 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
29712 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29713 MachinePointerInfo MPI =
29714 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29715 SDValue StoreOps[] = { Chain, Result, StackPtr };
29716 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl,
29717 DAG.getVTList(MVT::Other), StoreOps,
29718 MVT::i64, MPI, 0 /*Align*/,
29719 MachineMemOperand::MOStore);
29720
29721 // Finally load the value back from the stack temporary and return it.
29722 // This load is not atomic and doesn't need to be.
29723 // This load will be further type legalized.
29724 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
29725 Results.push_back(Result);
29726 Results.push_back(Result.getValue(1));
29727 return;
29728 }
29729 }
29730 // TODO: Use MOVLPS when SSE1 is available?
29731 // Delegate to generic TypeLegalization. Situations we can really handle
29732 // should have already been dealt with by AtomicExpandPass.cpp.
29733 break;
29734 }
29735 case ISD::ATOMIC_SWAP:
29736 case ISD::ATOMIC_LOAD_ADD:
29737 case ISD::ATOMIC_LOAD_SUB:
29738 case ISD::ATOMIC_LOAD_AND:
29739 case ISD::ATOMIC_LOAD_OR:
29740 case ISD::ATOMIC_LOAD_XOR:
29741 case ISD::ATOMIC_LOAD_NAND:
29742 case ISD::ATOMIC_LOAD_MIN:
29743 case ISD::ATOMIC_LOAD_MAX:
29744 case ISD::ATOMIC_LOAD_UMIN:
29745 case ISD::ATOMIC_LOAD_UMAX:
29746 // Delegate to generic TypeLegalization. Situations we can really handle
29747 // should have already been dealt with by AtomicExpandPass.cpp.
29748 break;
29749
29750 case ISD::BITCAST: {
29751 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
29752 EVT DstVT = N->getValueType(0);
29753 EVT SrcVT = N->getOperand(0).getValueType();
29754
29755 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
29756 // we can split using the k-register rather than memory.
29757 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
29758 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
29759 SDValue Lo, Hi;
29760 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
29761 Lo = DAG.getBitcast(MVT::i32, Lo);
29762 Hi = DAG.getBitcast(MVT::i32, Hi);
29763 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
29764 Results.push_back(Res);
29765 return;
29766 }
29767
29768 // Custom splitting for BWI types when AVX512F is available but BWI isn't.
29769 if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
29770 SrcVT.isVector() && isTypeLegal(SrcVT)) {
29771 SDValue Lo, Hi;
29772 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
29773 MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
29774 Lo = DAG.getBitcast(CastVT, Lo);
29775 Hi = DAG.getBitcast(CastVT, Hi);
29776 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
29777 Results.push_back(Res);
29778 return;
29779 }
29780
29781 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
29782 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
29783 "Unexpected type action!");
29784 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
29785 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0));
29786 Results.push_back(Res);
29787 return;
29788 }
29789
29790 return;
29791 }
29792 case ISD::MGATHER: {
29793 EVT VT = N->getValueType(0);
29794 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
29795 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
29796 auto *Gather = cast<MaskedGatherSDNode>(N);
29797 SDValue Index = Gather->getIndex();
29798 if (Index.getValueType() != MVT::v2i64)
29799 return;
29800 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
29801 "Unexpected type action!");
29802 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
29803 SDValue Mask = Gather->getMask();
29804 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
29805 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
29806 Gather->getPassThru(),
29807 DAG.getUNDEF(VT));
29808 if (!Subtarget.hasVLX()) {
29809 // We need to widen the mask, but the instruction will only use 2
29810 // of its elements. So we can use undef.
29811 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
29812 DAG.getUNDEF(MVT::v2i1));
29813 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
29814 }
29815 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
29816 Gather->getBasePtr(), Index, Gather->getScale() };
29817 SDValue Res = DAG.getMemIntrinsicNode(
29818 X86ISD::MGATHER, dl,
29819 DAG.getVTList(WideVT, Mask.getValueType(), MVT::Other), Ops,
29820 Gather->getMemoryVT(), Gather->getMemOperand());
29821 Results.push_back(Res);
29822 Results.push_back(Res.getValue(2));
29823 return;
29824 }
29825 return;
29826 }
29827 case ISD::LOAD: {
29828 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
29829 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
29830 // cast since type legalization will try to use an i64 load.
29831 MVT VT = N->getSimpleValueType(0);
29832 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
29833 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
29834 "Unexpected type action!");
29835 if (!ISD::isNON_EXTLoad(N))
29836 return;
29837 auto *Ld = cast<LoadSDNode>(N);
29838 if (Subtarget.hasSSE2()) {
29839 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
29840 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
29841 Ld->getPointerInfo(), Ld->getAlignment(),
29842 Ld->getMemOperand()->getFlags());
29843 SDValue Chain = Res.getValue(1);
29844 MVT VecVT = MVT::getVectorVT(LdVT, 2);
29845 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
29846 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
29847 Res = DAG.getBitcast(WideVT, Res);
29848 Results.push_back(Res);
29849 Results.push_back(Chain);
29850 return;
29851 }
29852 assert(Subtarget.hasSSE1() && "Expected SSE");
29853 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
29854 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
29855 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
29856 MVT::i64, Ld->getMemOperand());
29857 Results.push_back(Res);
29858 Results.push_back(Res.getValue(1));
29859 return;
29860 }
29861 case ISD::ADDRSPACECAST: {
29862 SDValue Src = N->getOperand(0);
29863 EVT DstVT = N->getValueType(0);
29864 AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
29865 unsigned SrcAS = CastN->getSrcAddressSpace();
29866
29867 assert(SrcAS != CastN->getDestAddressSpace() &&
29868 "addrspacecast must be between different address spaces");
29869
29870 SDValue Res;
29871 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64)
29872 Res = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
29873 else if (DstVT == MVT::i64)
29874 Res = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
29875 else if (DstVT == MVT::i32)
29876 Res = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
29877 else
29878 report_fatal_error("Unrecognized addrspacecast type legalization");
29879
29880 Results.push_back(Res);
29881 return;
29882 }
29883 }
29884}
29885
29886const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
29887 switch ((X86ISD::NodeType)Opcode) {
29888 case X86ISD::FIRST_NUMBER: break;
29889#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
29890 NODE_NAME_CASE(BSF)
29891 NODE_NAME_CASE(BSR)
29892 NODE_NAME_CASE(SHLD)
29893 NODE_NAME_CASE(SHRD)
29894 NODE_NAME_CASE(FAND)
29895 NODE_NAME_CASE(FANDN)
29896 NODE_NAME_CASE(FOR)
29897 NODE_NAME_CASE(FXOR)
29898 NODE_NAME_CASE(FILD)
29899 NODE_NAME_CASE(FIST)
29900 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
29901 NODE_NAME_CASE(FLD)
29902 NODE_NAME_CASE(FST)
29903 NODE_NAME_CASE(CALL)
29904 NODE_NAME_CASE(BT)
29905 NODE_NAME_CASE(CMP)
29906 NODE_NAME_CASE(FCMP)
29907 NODE_NAME_CASE(STRICT_FCMP)
29908 NODE_NAME_CASE(STRICT_FCMPS)
29909 NODE_NAME_CASE(COMI)
29910 NODE_NAME_CASE(UCOMI)
29911 NODE_NAME_CASE(CMPM)
29912 NODE_NAME_CASE(STRICT_CMPM)
29913 NODE_NAME_CASE(CMPM_SAE)
29914 NODE_NAME_CASE(SETCC)
29915 NODE_NAME_CASE(SETCC_CARRY)
29916 NODE_NAME_CASE(FSETCC)
29917 NODE_NAME_CASE(FSETCCM)
29918 NODE_NAME_CASE(FSETCCM_SAE)
29919 NODE_NAME_CASE(CMOV)
29920 NODE_NAME_CASE(BRCOND)
29921 NODE_NAME_CASE(RET_FLAG)
29922 NODE_NAME_CASE(IRET)
29923 NODE_NAME_CASE(REP_STOS)
29924 NODE_NAME_CASE(REP_MOVS)
29925 NODE_NAME_CASE(GlobalBaseReg)
29926 NODE_NAME_CASE(Wrapper)
29927 NODE_NAME_CASE(WrapperRIP)
29928 NODE_NAME_CASE(MOVQ2DQ)
29929 NODE_NAME_CASE(MOVDQ2Q)
29930 NODE_NAME_CASE(MMX_MOVD2W)
29931 NODE_NAME_CASE(MMX_MOVW2D)
29932 NODE_NAME_CASE(PEXTRB)
29933 NODE_NAME_CASE(PEXTRW)
29934 NODE_NAME_CASE(INSERTPS)
29935 NODE_NAME_CASE(PINSRB)
29936 NODE_NAME_CASE(PINSRW)
29937 NODE_NAME_CASE(PSHUFB)
29938 NODE_NAME_CASE(ANDNP)
29939 NODE_NAME_CASE(BLENDI)
29940 NODE_NAME_CASE(BLENDV)
29941 NODE_NAME_CASE(HADD)
29942 NODE_NAME_CASE(HSUB)
29943 NODE_NAME_CASE(FHADD)
29944 NODE_NAME_CASE(FHSUB)
29945 NODE_NAME_CASE(CONFLICT)
29946 NODE_NAME_CASE(FMAX)
29947 NODE_NAME_CASE(FMAXS)
29948 NODE_NAME_CASE(FMAX_SAE)
29949 NODE_NAME_CASE(FMAXS_SAE)
29950 NODE_NAME_CASE(FMIN)
29951 NODE_NAME_CASE(FMINS)
29952 NODE_NAME_CASE(FMIN_SAE)
29953 NODE_NAME_CASE(FMINS_SAE)
29954 NODE_NAME_CASE(FMAXC)
29955 NODE_NAME_CASE(FMINC)
29956 NODE_NAME_CASE(FRSQRT)
29957 NODE_NAME_CASE(FRCP)
29958 NODE_NAME_CASE(EXTRQI)
29959 NODE_NAME_CASE(INSERTQI)
29960 NODE_NAME_CASE(TLSADDR)
29961 NODE_NAME_CASE(TLSBASEADDR)
29962 NODE_NAME_CASE(TLSCALL)
29963 NODE_NAME_CASE(EH_SJLJ_SETJMP)
29964 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
29965 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
29966 NODE_NAME_CASE(EH_RETURN)
29967 NODE_NAME_CASE(TC_RETURN)
29968 NODE_NAME_CASE(FNSTCW16m)
29969 NODE_NAME_CASE(LCMPXCHG_DAG)
29970 NODE_NAME_CASE(LCMPXCHG8_DAG)
29971 NODE_NAME_CASE(LCMPXCHG16_DAG)
29972 NODE_NAME_CASE(LCMPXCHG8_SAVE_EBX_DAG)
29973 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
29974 NODE_NAME_CASE(LADD)
29975 NODE_NAME_CASE(LSUB)
29976 NODE_NAME_CASE(LOR)
29977 NODE_NAME_CASE(LXOR)
29978 NODE_NAME_CASE(LAND)
29979 NODE_NAME_CASE(VZEXT_MOVL)
29980 NODE_NAME_CASE(VZEXT_LOAD)
29981 NODE_NAME_CASE(VEXTRACT_STORE)
29982 NODE_NAME_CASE(VTRUNC)
29983 NODE_NAME_CASE(VTRUNCS)
29984 NODE_NAME_CASE(VTRUNCUS)
29985 NODE_NAME_CASE(VMTRUNC)
29986 NODE_NAME_CASE(VMTRUNCS)
29987 NODE_NAME_CASE(VMTRUNCUS)
29988 NODE_NAME_CASE(VTRUNCSTORES)
29989 NODE_NAME_CASE(VTRUNCSTOREUS)
29990 NODE_NAME_CASE(VMTRUNCSTORES)
29991 NODE_NAME_CASE(VMTRUNCSTOREUS)
29992 NODE_NAME_CASE(VFPEXT)
29993 NODE_NAME_CASE(STRICT_VFPEXT)
29994 NODE_NAME_CASE(VFPEXT_SAE)
29995 NODE_NAME_CASE(VFPEXTS)
29996 NODE_NAME_CASE(VFPEXTS_SAE)
29997 NODE_NAME_CASE(VFPROUND)
29998 NODE_NAME_CASE(STRICT_VFPROUND)
29999 NODE_NAME_CASE(VMFPROUND)
30000 NODE_NAME_CASE(VFPROUND_RND)
30001 NODE_NAME_CASE(VFPROUNDS)
30002 NODE_NAME_CASE(VFPROUNDS_RND)
30003 NODE_NAME_CASE(VSHLDQ)
30004 NODE_NAME_CASE(VSRLDQ)
30005 NODE_NAME_CASE(VSHL)
30006 NODE_NAME_CASE(VSRL)
30007 NODE_NAME_CASE(VSRA)
30008 NODE_NAME_CASE(VSHLI)
30009 NODE_NAME_CASE(VSRLI)
30010 NODE_NAME_CASE(VSRAI)
30011 NODE_NAME_CASE(VSHLV)
30012 NODE_NAME_CASE(VSRLV)
30013 NODE_NAME_CASE(VSRAV)
30014 NODE_NAME_CASE(VROTLI)
30015 NODE_NAME_CASE(VROTRI)
30016 NODE_NAME_CASE(VPPERM)
30017 NODE_NAME_CASE(CMPP)
30018 NODE_NAME_CASE(STRICT_CMPP)
30019 NODE_NAME_CASE(PCMPEQ)
30020 NODE_NAME_CASE(PCMPGT)
30021 NODE_NAME_CASE(PHMINPOS)
30022 NODE_NAME_CASE(ADD)
30023 NODE_NAME_CASE(SUB)
30024 NODE_NAME_CASE(ADC)
30025 NODE_NAME_CASE(SBB)
30026 NODE_NAME_CASE(SMUL)
30027 NODE_NAME_CASE(UMUL)
30028 NODE_NAME_CASE(OR)
30029 NODE_NAME_CASE(XOR)
30030 NODE_NAME_CASE(AND)
30031 NODE_NAME_CASE(BEXTR)
30032 NODE_NAME_CASE(BZHI)
30033 NODE_NAME_CASE(MUL_IMM)
30034 NODE_NAME_CASE(MOVMSK)
30035 NODE_NAME_CASE(PTEST)
30036 NODE_NAME_CASE(TESTP)
30037 NODE_NAME_CASE(KORTEST)
30038 NODE_NAME_CASE(KTEST)
30039 NODE_NAME_CASE(KADD)
30040 NODE_NAME_CASE(KSHIFTL)
30041 NODE_NAME_CASE(KSHIFTR)
30042 NODE_NAME_CASE(PACKSS)
30043 NODE_NAME_CASE(PACKUS)
30044 NODE_NAME_CASE(PALIGNR)
30045 NODE_NAME_CASE(VALIGN)
30046 NODE_NAME_CASE(VSHLD)
30047 NODE_NAME_CASE(VSHRD)
30048 NODE_NAME_CASE(VSHLDV)
30049 NODE_NAME_CASE(VSHRDV)
30050 NODE_NAME_CASE(PSHUFD)
30051 NODE_NAME_CASE(PSHUFHW)
30052 NODE_NAME_CASE(PSHUFLW)
30053 NODE_NAME_CASE(SHUFP)
30054 NODE_NAME_CASE(SHUF128)
30055 NODE_NAME_CASE(MOVLHPS)
30056 NODE_NAME_CASE(MOVHLPS)
30057 NODE_NAME_CASE(MOVDDUP)
30058 NODE_NAME_CASE(MOVSHDUP)
30059 NODE_NAME_CASE(MOVSLDUP)
30060 NODE_NAME_CASE(MOVSD)
30061 NODE_NAME_CASE(MOVSS)
30062 NODE_NAME_CASE(UNPCKL)
30063 NODE_NAME_CASE(UNPCKH)
30064 NODE_NAME_CASE(VBROADCAST)
30065 NODE_NAME_CASE(VBROADCAST_LOAD)
30066 NODE_NAME_CASE(VBROADCASTM)
30067 NODE_NAME_CASE(SUBV_BROADCAST)
30068 NODE_NAME_CASE(VPERMILPV)
30069 NODE_NAME_CASE(VPERMILPI)
30070 NODE_NAME_CASE(VPERM2X128)
30071 NODE_NAME_CASE(VPERMV)
30072 NODE_NAME_CASE(VPERMV3)
30073 NODE_NAME_CASE(VPERMI)
30074 NODE_NAME_CASE(VPTERNLOG)
30075 NODE_NAME_CASE(VFIXUPIMM)
30076 NODE_NAME_CASE(VFIXUPIMM_SAE)
30077 NODE_NAME_CASE(VFIXUPIMMS)
30078 NODE_NAME_CASE(VFIXUPIMMS_SAE)
30079 NODE_NAME_CASE(VRANGE)
30080 NODE_NAME_CASE(VRANGE_SAE)
30081 NODE_NAME_CASE(VRANGES)
30082 NODE_NAME_CASE(VRANGES_SAE)
30083 NODE_NAME_CASE(PMULUDQ)
30084 NODE_NAME_CASE(PMULDQ)
30085 NODE_NAME_CASE(PSADBW)
30086 NODE_NAME_CASE(DBPSADBW)
30087 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
30088 NODE_NAME_CASE(VAARG_64)
30089 NODE_NAME_CASE(WIN_ALLOCA)
30090 NODE_NAME_CASE(MEMBARRIER)
30091 NODE_NAME_CASE(MFENCE)
30092 NODE_NAME_CASE(SEG_ALLOCA)
30093 NODE_NAME_CASE(PROBED_ALLOCA)
30094 NODE_NAME_CASE(RDRAND)
30095 NODE_NAME_CASE(RDSEED)
30096 NODE_NAME_CASE(RDPKRU)
30097 NODE_NAME_CASE(WRPKRU)
30098 NODE_NAME_CASE(VPMADDUBSW)
30099 NODE_NAME_CASE(VPMADDWD)
30100 NODE_NAME_CASE(VPSHA)
30101 NODE_NAME_CASE(VPSHL)
30102 NODE_NAME_CASE(VPCOM)
30103 NODE_NAME_CASE(VPCOMU)
30104 NODE_NAME_CASE(VPERMIL2)
30105 NODE_NAME_CASE(FMSUB)
30106 NODE_NAME_CASE(STRICT_FMSUB)
30107 NODE_NAME_CASE(FNMADD)
30108 NODE_NAME_CASE(STRICT_FNMADD)
30109 NODE_NAME_CASE(FNMSUB)
30110 NODE_NAME_CASE(STRICT_FNMSUB)
30111 NODE_NAME_CASE(FMADDSUB)
30112 NODE_NAME_CASE(FMSUBADD)
30113 NODE_NAME_CASE(FMADD_RND)
30114 NODE_NAME_CASE(FNMADD_RND)
30115 NODE_NAME_CASE(FMSUB_RND)
30116 NODE_NAME_CASE(FNMSUB_RND)
30117 NODE_NAME_CASE(FMADDSUB_RND)
30118 NODE_NAME_CASE(FMSUBADD_RND)
30119 NODE_NAME_CASE(VPMADD52H)
30120 NODE_NAME_CASE(VPMADD52L)
30121 NODE_NAME_CASE(VRNDSCALE)
30122 NODE_NAME_CASE(STRICT_VRNDSCALE)
30123 NODE_NAME_CASE(VRNDSCALE_SAE)
30124 NODE_NAME_CASE(VRNDSCALES)
30125 NODE_NAME_CASE(VRNDSCALES_SAE)
30126 NODE_NAME_CASE(VREDUCE)
30127 NODE_NAME_CASE(VREDUCE_SAE)
30128 NODE_NAME_CASE(VREDUCES)
30129 NODE_NAME_CASE(VREDUCES_SAE)
30130 NODE_NAME_CASE(VGETMANT)
30131 NODE_NAME_CASE(VGETMANT_SAE)
30132 NODE_NAME_CASE(VGETMANTS)
30133 NODE_NAME_CASE(VGETMANTS_SAE)
30134 NODE_NAME_CASE(PCMPESTR)
30135 NODE_NAME_CASE(PCMPISTR)
30136 NODE_NAME_CASE(XTEST)
30137 NODE_NAME_CASE(COMPRESS)
30138 NODE_NAME_CASE(EXPAND)
30139 NODE_NAME_CASE(SELECTS)
30140 NODE_NAME_CASE(ADDSUB)
30141 NODE_NAME_CASE(RCP14)
30142 NODE_NAME_CASE(RCP14S)
30143 NODE_NAME_CASE(RCP28)
30144 NODE_NAME_CASE(RCP28_SAE)
30145 NODE_NAME_CASE(RCP28S)
30146 NODE_NAME_CASE(RCP28S_SAE)
30147 NODE_NAME_CASE(EXP2)
30148 NODE_NAME_CASE(EXP2_SAE)
30149 NODE_NAME_CASE(RSQRT14)
30150 NODE_NAME_CASE(RSQRT14S)
30151 NODE_NAME_CASE(RSQRT28)
30152 NODE_NAME_CASE(RSQRT28_SAE)
30153 NODE_NAME_CASE(RSQRT28S)
30154 NODE_NAME_CASE(RSQRT28S_SAE)
30155 NODE_NAME_CASE(FADD_RND)
30156 NODE_NAME_CASE(FADDS)
30157 NODE_NAME_CASE(FADDS_RND)
30158 NODE_NAME_CASE(FSUB_RND)
30159 NODE_NAME_CASE(FSUBS)
30160 NODE_NAME_CASE(FSUBS_RND)
30161 NODE_NAME_CASE(FMUL_RND)
30162 NODE_NAME_CASE(FMULS)
30163 NODE_NAME_CASE(FMULS_RND)
30164 NODE_NAME_CASE(FDIV_RND)
30165 NODE_NAME_CASE(FDIVS)
30166 NODE_NAME_CASE(FDIVS_RND)
30167 NODE_NAME_CASE(FSQRT_RND)
30168 NODE_NAME_CASE(FSQRTS)
30169 NODE_NAME_CASE(FSQRTS_RND)
30170 NODE_NAME_CASE(FGETEXP)
30171 NODE_NAME_CASE(FGETEXP_SAE)
30172 NODE_NAME_CASE(FGETEXPS)
30173 NODE_NAME_CASE(FGETEXPS_SAE)
30174 NODE_NAME_CASE(SCALEF)
30175 NODE_NAME_CASE(SCALEF_RND)
30176 NODE_NAME_CASE(SCALEFS)
30177 NODE_NAME_CASE(SCALEFS_RND)
30178 NODE_NAME_CASE(AVG)
30179 NODE_NAME_CASE(MULHRS)
30180 NODE_NAME_CASE(SINT_TO_FP_RND)
30181 NODE_NAME_CASE(UINT_TO_FP_RND)
30182 NODE_NAME_CASE(CVTTP2SI)
30183 NODE_NAME_CASE(CVTTP2UI)
30184 NODE_NAME_CASE(STRICT_CVTTP2SI)
30185 NODE_NAME_CASE(STRICT_CVTTP2UI)
30186 NODE_NAME_CASE(MCVTTP2SI)
30187 NODE_NAME_CASE(MCVTTP2UI)
30188 NODE_NAME_CASE(CVTTP2SI_SAE)
30189 NODE_NAME_CASE(CVTTP2UI_SAE)
30190 NODE_NAME_CASE(CVTTS2SI)
30191 NODE_NAME_CASE(CVTTS2UI)
30192 NODE_NAME_CASE(CVTTS2SI_SAE)
30193 NODE_NAME_CASE(CVTTS2UI_SAE)
30194 NODE_NAME_CASE(CVTSI2P)
30195 NODE_NAME_CASE(CVTUI2P)
30196 NODE_NAME_CASE(STRICT_CVTSI2P)
30197 NODE_NAME_CASE(STRICT_CVTUI2P)
30198 NODE_NAME_CASE(MCVTSI2P)
30199 NODE_NAME_CASE(MCVTUI2P)
30200 NODE_NAME_CASE(VFPCLASS)
30201 NODE_NAME_CASE(VFPCLASSS)
30202 NODE_NAME_CASE(MULTISHIFT)
30203 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
30204 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
30205 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
30206 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
30207 NODE_NAME_CASE(CVTPS2PH)
30208 NODE_NAME_CASE(STRICT_CVTPS2PH)
30209 NODE_NAME_CASE(MCVTPS2PH)
30210 NODE_NAME_CASE(CVTPH2PS)
30211 NODE_NAME_CASE(STRICT_CVTPH2PS)
30212 NODE_NAME_CASE(CVTPH2PS_SAE)
30213 NODE_NAME_CASE(CVTP2SI)
30214 NODE_NAME_CASE(CVTP2UI)
30215 NODE_NAME_CASE(MCVTP2SI)
30216 NODE_NAME_CASE(MCVTP2UI)
30217 NODE_NAME_CASE(CVTP2SI_RND)
30218 NODE_NAME_CASE(CVTP2UI_RND)
30219 NODE_NAME_CASE(CVTS2SI)
30220 NODE_NAME_CASE(CVTS2UI)
30221 NODE_NAME_CASE(CVTS2SI_RND)
30222 NODE_NAME_CASE(CVTS2UI_RND)
30223 NODE_NAME_CASE(CVTNE2PS2BF16)
30224 NODE_NAME_CASE(CVTNEPS2BF16)
30225 NODE_NAME_CASE(MCVTNEPS2BF16)
30226 NODE_NAME_CASE(DPBF16PS)
30227 NODE_NAME_CASE(LWPINS)
30228 NODE_NAME_CASE(MGATHER)
30229 NODE_NAME_CASE(MSCATTER)
30230 NODE_NAME_CASE(VPDPBUSD)
30231 NODE_NAME_CASE(VPDPBUSDS)
30232 NODE_NAME_CASE(VPDPWSSD)
30233 NODE_NAME_CASE(VPDPWSSDS)
30234 NODE_NAME_CASE(VPSHUFBITQMB)
30235 NODE_NAME_CASE(GF2P8MULB)
30236 NODE_NAME_CASE(GF2P8AFFINEQB)
30237 NODE_NAME_CASE(GF2P8AFFINEINVQB)
30238 NODE_NAME_CASE(NT_CALL)
30239 NODE_NAME_CASE(NT_BRIND)
30240 NODE_NAME_CASE(UMWAIT)
30241 NODE_NAME_CASE(TPAUSE)
30242 NODE_NAME_CASE(ENQCMD)
30243 NODE_NAME_CASE(ENQCMDS)
30244 NODE_NAME_CASE(VP2INTERSECT)
30245 }
30246 return nullptr;
30247#undef NODE_NAME_CASE
30248}
30249
30250/// Return true if the addressing mode represented by AM is legal for this
30251/// target, for a load/store of the specified type.
30252bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
30253 const AddrMode &AM, Type *Ty,
30254 unsigned AS,
30255 Instruction *I) const {
30256 // X86 supports extremely general addressing modes.
30257 CodeModel::Model M = getTargetMachine().getCodeModel();
30258
30259 // X86 allows a sign-extended 32-bit immediate field as a displacement.
30260 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
30261 return false;
30262
30263 if (AM.BaseGV) {
30264 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
30265
30266 // If a reference to this global requires an extra load, we can't fold it.
30267 if (isGlobalStubReference(GVFlags))
30268 return false;
30269
30270 // If BaseGV requires a register for the PIC base, we cannot also have a
30271 // BaseReg specified.
30272 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
30273 return false;
30274
30275 // If lower 4G is not available, then we must use rip-relative addressing.
30276 if ((M != CodeModel::Small || isPositionIndependent()) &&
30277 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
30278 return false;
30279 }
30280
30281 switch (AM.Scale) {
30282 case 0:
30283 case 1:
30284 case 2:
30285 case 4:
30286 case 8:
30287 // These scales always work.
30288 break;
30289 case 3:
30290 case 5:
30291 case 9:
30292 // These scales are formed with basereg+scalereg. Only accept if there is
30293 // no basereg yet.
30294 if (AM.HasBaseReg)
30295 return false;
30296 break;
30297 default: // Other stuff never works.
30298 return false;
30299 }
30300
30301 return true;
30302}
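
  // Aside (illustrative, assuming the standard SIB encoding; not from this
  // file): scales of 3, 5 and 9 are formed by spending the base-register slot
  // on a second copy of the index, e.g. a scale-by-3 address is computed as
  //   lea (%rax,%rax,2), %rcx   # rcx = rax + rax*2 = rax*3
  // so they cannot be combined with an independent base register, which is
  // what the switch above enforces.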
30303
30304bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
30305 unsigned Bits = Ty->getScalarSizeInBits();
30306
30307   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
30308   // noticeably cheaper than those without.
30309 if (Bits == 8)
30310 return false;
30311
30312 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
30313 if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
30314 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
30315 return false;
30316
30317 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
30318 // shifts just as cheap as scalar ones.
30319 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
30320 return false;
30321
30322 // AVX512BW has shifts such as vpsllvw.
30323 if (Subtarget.hasBWI() && Bits == 16)
30324 return false;
30325
30326 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
30327 // fully general vector.
30328 return true;
30329}
30330
30331bool X86TargetLowering::isBinOp(unsigned Opcode) const {
30332 switch (Opcode) {
30333 // These are non-commutative binops.
30334 // TODO: Add more X86ISD opcodes once we have test coverage.
30335 case X86ISD::ANDNP:
30336 case X86ISD::PCMPGT:
30337 case X86ISD::FMAX:
30338 case X86ISD::FMIN:
30339 case X86ISD::FANDN:
30340 return true;
30341 }
30342
30343 return TargetLoweringBase::isBinOp(Opcode);
30344}
30345
30346bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
30347 switch (Opcode) {
30348 // TODO: Add more X86ISD opcodes once we have test coverage.
30349 case X86ISD::PCMPEQ:
30350 case X86ISD::PMULDQ:
30351 case X86ISD::PMULUDQ:
30352 case X86ISD::FMAXC:
30353 case X86ISD::FMINC:
30354 case X86ISD::FAND:
30355 case X86ISD::FOR:
30356 case X86ISD::FXOR:
30357 return true;
30358 }
30359
30360 return TargetLoweringBase::isCommutativeBinOp(Opcode);
30361}
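
// Aside (illustrative, based on the usual x86 min/max semantics; not from this
// file): FMAX/FMIN model the MAXSS/MINSS family, which returns the second
// operand when either input is a NaN, so operand order matters and those nodes
// sit in the non-commutative isBinOp list above. FMAXC/FMINC are the relaxed
// variants used when NaN/-0.0 ordering is known not to matter, so they can be
// reported as commutative here.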
30362
30363bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
30364 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
30365 return false;
30366 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
30367 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
30368 return NumBits1 > NumBits2;
30369}
30370
30371bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
30372 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
30373 return false;
30374
30375 if (!isTypeLegal(EVT::getEVT(Ty1)))
30376 return false;
30377
30378  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
30379
30380 // Assuming the caller doesn't have a zeroext or signext return parameter,
30381 // truncation all the way down to i1 is valid.
30382 return true;
30383}
30384
30385bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
30386 return isInt<32>(Imm);
30387}
30388
30389bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
30390 // Can also use sub to handle negated immediates.
30391 return isInt<32>(Imm);
30392}
30393
30394bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
30395 return isInt<32>(Imm);
30396}
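
// Aside (illustrative, not from this file): x86 ALU and store instructions
// encode at most a sign-extended 32-bit immediate, so a value outside
// [-2^31, 2^31) has to be materialized first (e.g. with movabs) and cannot be
// folded, which is the common isInt<32> check shared by the predicates above.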
30397
30398bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
30399 if (!VT1.isInteger() || !VT2.isInteger())
30400 return false;
30401 unsigned NumBits1 = VT1.getSizeInBits();
30402 unsigned NumBits2 = VT2.getSizeInBits();
30403 return NumBits1 > NumBits2;
30404}
30405
30406bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
30407 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
30408 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
30409}
30410
30411bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
30412 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
30413 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
30414}
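
// Aside (illustrative, not from this file): in 64-bit mode any write to a
// 32-bit GPR clears bits 63:32 of the full register, e.g.
//   movl %ebx, %eax   # rax = zext i32 ebx to i64, no extra instruction
// so an i32 -> i64 zero-extension costs nothing, which is what the overloads
// above report.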
30415
30416bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
30417 EVT VT1 = Val.getValueType();
30418 if (isZExtFree(VT1, VT2))
30419 return true;
30420
30421 if (Val.getOpcode() != ISD::LOAD)
30422 return false;
30423
30424 if (!VT1.isSimple() || !VT1.isInteger() ||
30425 !VT2.isSimple() || !VT2.isInteger())
30426 return false;
30427
30428 switch (VT1.getSimpleVT().SimpleTy) {
30429 default: break;
30430 case MVT::i8:
30431 case MVT::i16:
30432 case MVT::i32:
30433 // X86 has 8, 16, and 32-bit zero-extending loads.
30434 return true;
30435 }
30436
30437 return false;
30438}
30439
30440bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
30441 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
30442 return false;
30443
30444 EVT SrcVT = ExtVal.getOperand(0).getValueType();
30445
30446 // There is no extending load for vXi1.
30447 if (SrcVT.getScalarType() == MVT::i1)
30448 return false;
30449
30450 return true;
30451}
30452
30453bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
30454 EVT VT) const {
30455 if (!Subtarget.hasAnyFMA())
30456 return false;
30457
30458 VT = VT.getScalarType();
30459
30460 if (!VT.isSimple())
30461 return false;
30462
30463 switch (VT.getSimpleVT().SimpleTy) {
30464 case MVT::f32:
30465 case MVT::f64:
30466 return true;
30467 default:
30468 break;
30469 }
30470
30471 return false;
30472}
30473
30474bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
30475 // i16 instructions are longer (0x66 prefix) and potentially slower.
30476 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
30477}
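
// Aside (illustrative, assuming the usual encodings; not from this file): the
// i16 forms carry the 0x66 operand-size override, so each instruction is a
// byte longer, and on several Intel cores a length-changing prefix in front of
// an immediate can stall the decoders, which is why narrowing i32 -> i16 is
// not considered profitable here.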
30478
30479/// Targets can use this to indicate that they only support *some*
30480/// VECTOR_SHUFFLE operations, those with specific masks.
30481/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
30482/// are assumed to be legal.
30483bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
30484 if (!VT.isSimple())
30485 return false;
30486
30487 // Not for i1 vectors
30488 if (VT.getSimpleVT().getScalarType() == MVT::i1)
30489 return false;
30490
30491 // Very little shuffling can be done for 64-bit vectors right now.
30492 if (VT.getSimpleVT().getSizeInBits() == 64)
30493 return false;
30494
30495 // We only care that the types being shuffled are legal. The lowering can
30496 // handle any possible shuffle mask that results.
30497 return isTypeLegal(VT.getSimpleVT());
30498}
30499
30500bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
30501 EVT VT) const {
30502 // Don't convert an 'and' into a shuffle that we don't directly support.
30503 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
30504 if (!Subtarget.hasAVX2())
30505 if (VT == MVT::v32i8 || VT == MVT::v16i16)
30506 return false;
30507
30508 // Just delegate to the generic legality, clear masks aren't special.
30509 return isShuffleMaskLegal(Mask, VT);
30510}
30511
30512bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
30513   // If the subtarget is using retpolines, we must not generate jump tables.
30514 if (Subtarget.useRetpolineIndirectBranches())
30515 return false;
30516
30517 // Otherwise, fallback on the generic logic.
30518 return TargetLowering::areJTsAllowed(Fn);
30519}
30520
30521//===----------------------------------------------------------------------===//
30522// X86 Scheduler Hooks
30523//===----------------------------------------------------------------------===//
30524
30525/// Utility function to emit xbegin specifying the start of an RTM region.
30526static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
30527 const TargetInstrInfo *TII) {
30528 DebugLoc DL = MI.getDebugLoc();
30529
30530 const BasicBlock *BB = MBB->getBasicBlock();
30531 MachineFunction::iterator I = ++MBB->getIterator();
30532
30533 // For the v = xbegin(), we generate
30534 //
30535 // thisMBB:
30536 // xbegin sinkMBB
30537 //
30538 // mainMBB:
30539 // s0 = -1
30540 //
30541 // fallBB:
30542 // eax = # XABORT_DEF
30543 // s1 = eax
30544 //
30545 // sinkMBB:
30546 // v = phi(s0/mainBB, s1/fallBB)
30547
30548 MachineBasicBlock *thisMBB = MBB;
30549 MachineFunction *MF = MBB->getParent();
30550 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
30551 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
30552 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
30553 MF->insert(I, mainMBB);
30554 MF->insert(I, fallMBB);
30555 MF->insert(I, sinkMBB);
30556
30557 // Transfer the remainder of BB and its successor edges to sinkMBB.
30558 sinkMBB->splice(sinkMBB->begin(), MBB,
30559 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
30560 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
30561
30562 MachineRegisterInfo &MRI = MF->getRegInfo();
30563 Register DstReg = MI.getOperand(0).getReg();
30564 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
30565 Register mainDstReg = MRI.createVirtualRegister(RC);
30566 Register fallDstReg = MRI.createVirtualRegister(RC);
30567
30568 // thisMBB:
30569 // xbegin fallMBB
30570 // # fallthrough to mainMBB
30571   //  # on abort, jump to fallMBB
30572 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
30573 thisMBB->addSuccessor(mainMBB);
30574 thisMBB->addSuccessor(fallMBB);
30575
30576 // mainMBB:
30577 // mainDstReg := -1
30578 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
30579 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
30580 mainMBB->addSuccessor(sinkMBB);
30581
30582 // fallMBB:
30583 // ; pseudo instruction to model hardware's definition from XABORT
30584 // EAX := XABORT_DEF
30585 // fallDstReg := EAX
30586 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
30587 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
30588 .addReg(X86::EAX);
30589 fallMBB->addSuccessor(sinkMBB);
30590
30591 // sinkMBB:
30592 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
30593 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
30594 .addReg(mainDstReg).addMBB(mainMBB)
30595 .addReg(fallDstReg).addMBB(fallMBB);
30596
30597 MI.eraseFromParent();
30598 return sinkMBB;
30599}
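
// Aside (illustrative summary of the RTM semantics assumed here; not from this
// file): XBEGIN falls through into mainMBB when the transaction starts, and on
// an abort the hardware resumes at the fallback label with an abort status in
// EAX. That is why mainMBB materializes -1 (the _XBEGIN_STARTED value) and
// fallMBB copies EAX into fallDstReg before both feed the PHI in sinkMBB.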
30600
30601
30602
30603MachineBasicBlock *
30604X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
30605 MachineBasicBlock *MBB) const {
30606 // Emit va_arg instruction on X86-64.
30607
30608 // Operands to this pseudo-instruction:
30609 // 0 ) Output : destination address (reg)
30610 // 1-5) Input : va_list address (addr, i64mem)
30611 // 6 ) ArgSize : Size (in bytes) of vararg type
30612 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
30613 // 8 ) Align : Alignment of type
30614 // 9 ) EFLAGS (implicit-def)
30615
30616  assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
30617 static_assert(X86::AddrNumOperands == 5,
30618 "VAARG_64 assumes 5 address operands");
30619
30620 Register DestReg = MI.getOperand(0).getReg();
30621 MachineOperand &Base = MI.getOperand(1);
30622 MachineOperand &Scale = MI.getOperand(2);
30623 MachineOperand &Index = MI.getOperand(3);
30624 MachineOperand &Disp = MI.getOperand(4);
30625 MachineOperand &Segment = MI.getOperand(5);
30626 unsigned ArgSize = MI.getOperand(6).getImm();
30627 unsigned ArgMode = MI.getOperand(7).getImm();
30628 unsigned Align = MI.getOperand(8).getImm();
30629
30630 MachineFunction *MF = MBB->getParent();
30631
30632 // Memory Reference
30633  assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
30634
30635 MachineMemOperand *OldMMO = MI.memoperands().front();
30636
30637 // Clone the MMO into two separate MMOs for loading and storing
30638 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
30639 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
30640 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
30641 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
30642
30643 // Machine Information
30644 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
30645 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
30646 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
30647 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
30648 DebugLoc DL = MI.getDebugLoc();
30649
30650 // struct va_list {
30651 // i32 gp_offset
30652 // i32 fp_offset
30653 // i64 overflow_area (address)
30654 // i64 reg_save_area (address)
30655 // }
30656 // sizeof(va_list) = 24
30657 // alignment(va_list) = 8
30658
30659 unsigned TotalNumIntRegs = 6;
30660 unsigned TotalNumXMMRegs = 8;
30661 bool UseGPOffset = (ArgMode == 1);
30662 bool UseFPOffset = (ArgMode == 2);
30663 unsigned MaxOffset = TotalNumIntRegs * 8 +
30664 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
30665
30666   // Align ArgSize to a multiple of 8.
30667 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
30668 bool NeedsAlign = (Align > 8);
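
  // Worked numbers (illustrative, derived from the constants above): with six
  // GP registers and eight XMM registers in the save area, MaxOffset is
  // 6*8 = 48 when pulling via gp_offset and 48 + 8*16 = 176 when pulling via
  // fp_offset. The rounding keeps ArgSizeA8 a multiple of 8, e.g.
  //   unsigned ArgSize = 11;
  //   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;   // == 16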
30669
30670 MachineBasicBlock *thisMBB = MBB;
30671 MachineBasicBlock *overflowMBB;
30672 MachineBasicBlock *offsetMBB;
30673 MachineBasicBlock *endMBB;
30674
30675 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
30676 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
30677 unsigned OffsetReg = 0;
30678
30679 if (!UseGPOffset && !UseFPOffset) {
30680 // If we only pull from the overflow region, we don't create a branch.
30681 // We don't need to alter control flow.
30682 OffsetDestReg = 0; // unused
30683 OverflowDestReg = DestReg;
30684
30685 offsetMBB = nullptr;
30686 overflowMBB = thisMBB;
30687 endMBB = thisMBB;
30688 } else {
30689 // First emit code to check if gp_offset (or fp_offset) is below the bound.
30690 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
30691 // If not, pull from overflow_area. (branch to overflowMBB)
30692 //
30693 // thisMBB
30694 // | .
30695 // | .
30696 // offsetMBB overflowMBB
30697 // | .
30698 // | .
30699 // endMBB
30700
30701 // Registers for the PHI in endMBB
30702 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
30703 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
30704
30705 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
30706 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
30707 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
30708 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
30709
30710 MachineFunction::iterator MBBIter = ++MBB->getIterator();
30711
30712 // Insert the new basic blocks
30713 MF->insert(MBBIter, offsetMBB);
30714 MF->insert(MBBIter, overflowMBB);
30715 MF->insert(MBBIter, endMBB);
30716
30717 // Transfer the remainder of MBB and its successor edges to endMBB.
30718 endMBB->splice(endMBB->begin(), thisMBB,
30719 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
30720 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
30721
30722 // Make offsetMBB and overflowMBB successors of thisMBB
30723 thisMBB->addSuccessor(offsetMBB);
30724 thisMBB->addSuccessor(overflowMBB);
30725
30726 // endMBB is a successor of both offsetMBB and overflowMBB
30727 offsetMBB->addSuccessor(endMBB);
30728 overflowMBB->addSuccessor(endMBB);
30729
30730 // Load the offset value into a register
30731 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
30732 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
30733 .add(Base)
30734 .add(Scale)
30735 .add(Index)
30736 .addDisp(Disp, UseFPOffset ? 4 : 0)
30737 .add(Segment)
30738 .setMemRefs(LoadOnlyMMO);
30739
30740 // Check if there is enough room left to pull this argument.
30741 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
30742 .addReg(OffsetReg)
30743 .addImm(MaxOffset + 8 - ArgSizeA8);
30744
30745 // Branch to "overflowMBB" if offset >= max
30746 // Fall through to "offsetMBB" otherwise
30747 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
30748 .addMBB(overflowMBB).addImm(X86::COND_AE);
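
    // Worked example (illustrative): for an 8-byte integer argument pulled via
    // gp_offset, the bound is MaxOffset + 8 - ArgSizeA8 = 48 + 8 - 8 = 48, so
    // gp_offset values 0..40 still index into reg_save_area, while 48 (all six
    // GP registers consumed) takes the COND_AE branch to the overflow area.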
30749 }
30750
30751 // In offsetMBB, emit code to use the reg_save_area.
30752 if (offsetMBB) {
30753    assert(OffsetReg != 0);
30754
30755 // Read the reg_save_area address.
30756 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
30757 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
30758 .add(Base)
30759 .add(Scale)
30760 .add(Index)
30761 .addDisp(Disp, 16)
30762 .add(Segment)
30763 .setMemRefs(LoadOnlyMMO);
30764
30765 // Zero-extend the offset
30766 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
30767 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
30768 .addImm(0)
30769 .addReg(OffsetReg)
30770 .addImm(X86::sub_32bit);
30771
30772 // Add the offset to the reg_save_area to get the final address.
30773 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
30774 .addReg(OffsetReg64)
30775 .addReg(RegSaveReg);
30776
30777 // Compute the offset for the next argument
30778 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
30779 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
30780 .addReg(OffsetReg)
30781 .addImm(UseFPOffset ? 16 : 8);
30782
30783 // Store it back into the va_list.
30784 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
30785 .add(Base)
30786 .add(Scale)
30787 .add(Index)
30788 .addDisp(Disp, UseFPOffset ? 4 : 0)
30789 .add(Segment)
30790 .addReg(NextOffsetReg)
30791 .setMemRefs(StoreOnlyMMO);
30792
30793 // Jump to endMBB
30794 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
30795 .addMBB(endMBB);
30796 }
30797
30798 //
30799 // Emit code to use overflow area
30800 //
30801
30802 // Load the overflow_area address into a register.
30803 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
30804 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
30805 .add(Base)
30806 .add(Scale)
30807 .add(Index)
30808 .addDisp(Disp, 8)
30809 .add(Segment)
30810 .setMemRefs(LoadOnlyMMO);
30811
30812 // If we need to align it, do so. Otherwise, just copy the address
30813 // to OverflowDestReg.
30814 if (NeedsAlign) {
30815 // Align the overflow address
30816    assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
30817 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
30818
30819 // aligned_addr = (addr + (align-1)) & ~(align-1)
30820 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
30821 .addReg(OverflowAddrReg)
30822 .addImm(Align-1);
30823
30824 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
30825 .addReg(TmpReg)
30826 .addImm(~(uint64_t)(Align-1));
30827 } else {
30828 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
30829 .addReg(OverflowAddrReg);
30830 }
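
  // Worked example (illustrative): with Align = 16 the sequence above computes
  // aligned_addr = (addr + 15) & ~15, e.g. an overflow_area address of 1000
  // becomes (1000 + 15) & ~15 = 1008, the next 16-byte boundary.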
30831
30832 // Compute the next overflow address after this argument.
30833 // (the overflow address should be kept 8-byte aligned)
30834 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
30835 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
30836 .addReg(OverflowDestReg)
30837 .addImm(ArgSizeA8);
30838
30839 // Store the new overflow address.
30840 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
30841 .add(Base)
30842 .add(Scale)
30843 .add(Index)
30844 .addDisp(Disp, 8)
30845 .add(Segment)
30846 .addReg(NextAddrReg)
30847 .setMemRefs(StoreOnlyMMO);
30848
30849 // If we branched, emit the PHI to the front of endMBB.
30850 if (offsetMBB) {
30851 BuildMI(*endMBB, endMBB->begin(), DL,
30852 TII->get(X86::PHI), DestReg)
30853 .addReg(OffsetDestReg).addMBB(offsetMBB)
30854 .addReg(OverflowDestReg).addMBB(overflowMBB);
30855 }
30856
30857 // Erase the pseudo instruction
30858 MI.eraseFromParent();
30859
30860 return endMBB;
30861}
30862
30863MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
30864 MachineInstr &MI, MachineBasicBlock *MBB) const {
30865 // Emit code to save XMM registers to the stack. The ABI says that the
30866 // number of registers to save is given in %al, so it's theoretically
30867   // possible to do an indirect jump trick to avoid saving all of them;
30868   // however, this code takes a simpler approach and just executes all
30869 // of the stores if %al is non-zero. It's less code, and it's probably
30870 // easier on the hardware branch predictor, and stores aren't all that
30871 // expensive anyway.
30872
30873 // Create the new basic blocks. One block contains all the XMM stores,
30874 // and one block is the final destination regardless of whether any
30875 // stores were performed.
30876 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
30877 MachineFunction *F = MBB->getParent();
30878 MachineFunction::iterator MBBIter = ++MBB->getIterator();
30879 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
30880 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
30881 F->insert(MBBIter, XMMSaveMBB);
30882 F->insert(MBBIter, EndMBB);
30883
30884 // Transfer the remainder of MBB and its successor edges to EndMBB.
30885 EndMBB->splice(EndMBB->begin(), MBB,
30886 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
30887 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
30888
30889 // The original block will now fall through to the XMM save block.
30890 MBB->addSuccessor(XMMSaveMBB);
30891 // The XMMSaveMBB will fall through to the end block.
30892 XMMSaveMBB->addSuccessor(EndMBB);
30893
30894 // Now add the instructions.
30895 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
30896 DebugLoc DL = MI.getDebugLoc();
30897
30898 Register CountReg = MI.getOperand(0).getReg();
30899 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
30900 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
30901
30902 if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
30903 // If %al is 0, branch around the XMM save block.
30904 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
30905 BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E);
30906 MBB->addSuccessor(EndMBB);
30907 }
30908
30909 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
30910 // that was just emitted, but clearly shouldn't be "saved".
30911  assert((MI.getNumOperands() <= 3 ||
30912          !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
30913          MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
30914         "Expected last argument to be EFLAGS");
30915 unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
30916 // In the XMM save block, save all the XMM argument registers.
30917 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
30918 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
30919 MachineMemOperand *MMO = F->getMachineMemOperand(
30920 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
30921 MachineMemOperand::MOStore,
30922 /*Size=*/16, /*Align=*/16);
30923 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
30924 .addFrameIndex(RegSaveFrameIndex)
30925 .addImm(/*Scale=*/1)
30926 .addReg(/*IndexReg=*/0)
30927 .addImm(/*Disp=*/Offset)
30928 .addReg(/*Segment=*/0)
30929 .addReg(MI.getOperand(i).getReg())
30930 .addMemOperand(MMO);
30931 }
30932
30933 MI.eraseFromParent(); // The pseudo instruction is gone now.
30934
30935 return EndMBB;
30936}
30937
30938// The EFLAGS operand of SelectItr might be missing a kill marker
30939// because there were multiple uses of EFLAGS, and ISel didn't know
30940// which to mark. Figure out whether SelectItr should have had a
30941// kill marker, and set it if it should. Returns the correct kill
30942// marker value.
30943static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
30944 MachineBasicBlock* BB,
30945 const TargetRegisterInfo* TRI) {
30946 // Scan forward through BB for a use/def of EFLAGS.
30947 MachineBasicBlock::iterator miI(std::next(SelectItr));
30948 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
30949 const MachineInstr& mi = *miI;
30950 if (mi.readsRegister(X86::EFLAGS))
30951 return false;
30952 if (mi.definesRegister(X86::EFLAGS))
30953 break; // Should have kill-flag - update below.
30954 }
30955
30956 // If we hit the end of the block, check whether EFLAGS is live into a
30957 // successor.
30958 if (miI == BB->end()) {
30959 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
30960 sEnd = BB->succ_end();
30961 sItr != sEnd; ++sItr) {
30962 MachineBasicBlock* succ = *sItr;
30963 if (succ->isLiveIn(X86::EFLAGS))
30964 return false;
30965 }
30966 }
30967
30968 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
30969 // out. SelectMI should have a kill flag on EFLAGS.
30970 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
30971 return true;
30972}
30973
30974// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
30975// together with other CMOV pseudo-opcodes into a single basic-block with
30976// conditional jump around it.
30977static bool isCMOVPseudo(MachineInstr &MI) {
30978 switch (MI.getOpcode()) {
30979 case X86::CMOV_FR32:
30980 case X86::CMOV_FR32X:
30981 case X86::CMOV_FR64:
30982 case X86::CMOV_FR64X:
30983 case X86::CMOV_GR8:
30984 case X86::CMOV_GR16:
30985 case X86::CMOV_GR32:
30986 case X86::CMOV_RFP32:
30987 case X86::CMOV_RFP64:
30988 case X86::CMOV_RFP80:
30989 case X86::CMOV_VR64:
30990 case X86::CMOV_VR128:
30991 case X86::CMOV_VR128X:
30992 case X86::CMOV_VR256:
30993 case X86::CMOV_VR256X:
30994 case X86::CMOV_VR512:
30995 case X86::CMOV_VK1:
30996 case X86::CMOV_VK2:
30997 case X86::CMOV_VK4:
30998 case X86::CMOV_VK8:
30999 case X86::CMOV_VK16:
31000 case X86::CMOV_VK32:
31001 case X86::CMOV_VK64:
31002 return true;
31003
31004 default:
31005 return false;
31006 }
31007}
31008
31009 // Helper function that inserts PHI nodes into SinkMBB:
31010 //   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
31011 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
31012 // in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
31013 // the last PHI inserted.
31014static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
31015 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
31016 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
31017 MachineBasicBlock *SinkMBB) {
31018 MachineFunction *MF = TrueMBB->getParent();
31019 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
31020 DebugLoc DL = MIItBegin->getDebugLoc();
31021
31022 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
31023 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
31024
31025 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
31026
31027 // As we are creating the PHIs, we have to be careful if there is more than
31028 // one. Later CMOVs may reference the results of earlier CMOVs, but later
31029 // PHIs have to reference the individual true/false inputs from earlier PHIs.
31030 // That also means that PHI construction must work forward from earlier to
31031 // later, and that the code must maintain a mapping from each earlier PHI's
31032 // destination register to the registers that went into that PHI.
31033 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
31034 MachineInstrBuilder MIB;
31035
31036 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
31037 Register DestReg = MIIt->getOperand(0).getReg();
31038 Register Op1Reg = MIIt->getOperand(1).getReg();
31039 Register Op2Reg = MIIt->getOperand(2).getReg();
31040
31041 // If this CMOV we are generating is the opposite condition from
31042 // the jump we generated, then we have to swap the operands for the
31043 // PHI that is going to be generated.
31044 if (MIIt->getOperand(3).getImm() == OppCC)
31045 std::swap(Op1Reg, Op2Reg);
31046
31047 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
31048 Op1Reg = RegRewriteTable[Op1Reg].first;
31049
31050 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
31051 Op2Reg = RegRewriteTable[Op2Reg].second;
31052
31053 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
31054 .addReg(Op1Reg)
31055 .addMBB(FalseMBB)
31056 .addReg(Op2Reg)
31057 .addMBB(TrueMBB);
31058
31059 // Add this PHI to the rewrite table.
31060 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
31061 }
31062
31063 return MIB;
31064}
31065
31066 // Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
31067MachineBasicBlock *
31068X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
31069 MachineInstr &SecondCascadedCMOV,
31070 MachineBasicBlock *ThisMBB) const {
31071 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31072 DebugLoc DL = FirstCMOV.getDebugLoc();
31073
31074 // We lower cascaded CMOVs such as
31075 //
31076 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
31077 //
31078 // to two successive branches.
31079 //
31080 // Without this, we would add a PHI between the two jumps, which ends up
31081 // creating a few copies all around. For instance, for
31082 //
31083 // (sitofp (zext (fcmp une)))
31084 //
31085 // we would generate:
31086 //
31087 // ucomiss %xmm1, %xmm0
31088 // movss <1.0f>, %xmm0
31089 // movaps %xmm0, %xmm1
31090 // jne .LBB5_2
31091 // xorps %xmm1, %xmm1
31092 // .LBB5_2:
31093 // jp .LBB5_4
31094 // movaps %xmm1, %xmm0
31095 // .LBB5_4:
31096 // retq
31097 //
31098 // because this custom-inserter would have generated:
31099 //
31100 // A
31101 // | \
31102 // | B
31103 // | /
31104 // C
31105 // | \
31106 // | D
31107 // | /
31108 // E
31109 //
31110 // A: X = ...; Y = ...
31111 // B: empty
31112 // C: Z = PHI [X, A], [Y, B]
31113 // D: empty
31114 // E: PHI [X, C], [Z, D]
31115 //
31116 // If we lower both CMOVs in a single step, we can instead generate:
31117 //
31118 // A
31119 // | \
31120 // | C
31121 // | /|
31122 // |/ |
31123 // | |
31124 // | D
31125 // | /
31126 // E
31127 //
31128 // A: X = ...; Y = ...
31129 // D: empty
31130 // E: PHI [X, A], [X, C], [Y, D]
31131 //
31132 // Which, in our sitofp/fcmp example, gives us something like:
31133 //
31134 // ucomiss %xmm1, %xmm0
31135 // movss <1.0f>, %xmm0
31136 // jne .LBB5_4
31137 // jp .LBB5_4
31138 // xorps %xmm0, %xmm0
31139 // .LBB5_4:
31140 // retq
31141 //
31142
31143 // We lower cascaded CMOV into two successive branches to the same block.
31144 // EFLAGS is used by both, so mark it as live in the second.
31145 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
31146 MachineFunction *F = ThisMBB->getParent();
31147 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
31148 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
31149 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
31150
31151 MachineFunction::iterator It = ++ThisMBB->getIterator();
31152 F->insert(It, FirstInsertedMBB);
31153 F->insert(It, SecondInsertedMBB);
31154 F->insert(It, SinkMBB);
31155
31156 // For a cascaded CMOV, we lower it to two successive branches to
31157 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
31158 // the FirstInsertedMBB.
31159 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
31160
31161 // If the EFLAGS register isn't dead in the terminator, then claim that it's
31162 // live into the sink and copy blocks.
31163 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
31164 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
31165 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
31166 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
31167 SinkMBB->addLiveIn(X86::EFLAGS);
31168 }
31169
31170 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
31171 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
31172 std::next(MachineBasicBlock::iterator(FirstCMOV)),
31173 ThisMBB->end());
31174 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
31175
31176 // Fallthrough block for ThisMBB.
31177 ThisMBB->addSuccessor(FirstInsertedMBB);
31178 // The true block target of the first branch is always SinkMBB.
31179 ThisMBB->addSuccessor(SinkMBB);
31180 // Fallthrough block for FirstInsertedMBB.
31181 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
31182 // The true block for the branch of FirstInsertedMBB.
31183 FirstInsertedMBB->addSuccessor(SinkMBB);
31184 // This is fallthrough.
31185 SecondInsertedMBB->addSuccessor(SinkMBB);
31186
31187 // Create the conditional branch instructions.
31188 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
31189 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
31190
31191 X86::CondCode SecondCC =
31192 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
31193 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
31194
31195 // SinkMBB:
31196 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
31197 Register DestReg = FirstCMOV.getOperand(0).getReg();
31198 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
31199 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
31200 MachineInstrBuilder MIB =
31201 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
31202 .addReg(Op1Reg)
31203 .addMBB(SecondInsertedMBB)
31204 .addReg(Op2Reg)
31205 .addMBB(ThisMBB);
31206
31207   // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
31208   // (the True operand of the SELECT_CC/CMOV nodes).
31209 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
31210 // Copy the PHI result to the register defined by the second CMOV.
31211 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
31212 TII->get(TargetOpcode::COPY),
31213 SecondCascadedCMOV.getOperand(0).getReg())
31214 .addReg(FirstCMOV.getOperand(0).getReg());
31215
31216 // Now remove the CMOVs.
31217 FirstCMOV.eraseFromParent();
31218 SecondCascadedCMOV.eraseFromParent();
31219
31220 return SinkMBB;
31221}
31222
31223MachineBasicBlock *
31224X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
31225 MachineBasicBlock *ThisMBB) const {
31226 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31227 DebugLoc DL = MI.getDebugLoc();
31228
31229 // To "insert" a SELECT_CC instruction, we actually have to insert the
31230 // diamond control-flow pattern. The incoming instruction knows the
31231 // destination vreg to set, the condition code register to branch on, the
31232 // true/false values to select between and a branch opcode to use.
31233
31234 // ThisMBB:
31235 // ...
31236 // TrueVal = ...
31237 // cmpTY ccX, r1, r2
31238 // bCC copy1MBB
31239 // fallthrough --> FalseMBB
31240
31241 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
31242 // as described above, by inserting a BB, and then making a PHI at the join
31243 // point to select the true and false operands of the CMOV in the PHI.
31244 //
31245 // The code also handles two different cases of multiple CMOV opcodes
31246 // in a row.
31247 //
31248 // Case 1:
31249   // In this case, there are multiple CMOVs in a row, all of which are based on
31250 // the same condition setting (or the exact opposite condition setting).
31251 // In this case we can lower all the CMOVs using a single inserted BB, and
31252 // then make a number of PHIs at the join point to model the CMOVs. The only
31253   // trickiness here is that in a case like:
31254 //
31255 // t2 = CMOV cond1 t1, f1
31256 // t3 = CMOV cond1 t2, f2
31257 //
31258 // when rewriting this into PHIs, we have to perform some renaming on the
31259 // temps since you cannot have a PHI operand refer to a PHI result earlier
31260 // in the same block. The "simple" but wrong lowering would be:
31261 //
31262 // t2 = PHI t1(BB1), f1(BB2)
31263 // t3 = PHI t2(BB1), f2(BB2)
31264 //
31265 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
31266 // renaming is to note that on the path through BB1, t2 is really just a
31267 // copy of t1, and do that renaming, properly generating:
31268 //
31269 // t2 = PHI t1(BB1), f1(BB2)
31270 // t3 = PHI t1(BB1), f2(BB2)
31271 //
31272 // Case 2:
31273 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
31274 // function - EmitLoweredCascadedSelect.
31275
31276 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
31277 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
31278 MachineInstr *LastCMOV = &MI;
31279 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
31280
31281 // Check for case 1, where there are multiple CMOVs with the same condition
31282 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
31283 // number of jumps the most.
31284
31285 if (isCMOVPseudo(MI)) {
31286 // See if we have a string of CMOVS with the same condition. Skip over
31287 // intervening debug insts.
31288 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
31289 (NextMIIt->getOperand(3).getImm() == CC ||
31290 NextMIIt->getOperand(3).getImm() == OppCC)) {
31291 LastCMOV = &*NextMIIt;
31292 ++NextMIIt;
31293 NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end());
31294 }
31295 }
31296
31297   // Check for case 2, but only if we didn't already find case 1, as
31298   // indicated by LastCMOV == MI.
31299 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
31300 NextMIIt->getOpcode() == MI.getOpcode() &&
31301 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
31302 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
31303 NextMIIt->getOperand(1).isKill()) {
31304 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
31305 }
31306
31307 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
31308 MachineFunction *F = ThisMBB->getParent();
31309 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
31310 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
31311
31312 MachineFunction::iterator It = ++ThisMBB->getIterator();
31313 F->insert(It, FalseMBB);
31314 F->insert(It, SinkMBB);
31315
31316 // If the EFLAGS register isn't dead in the terminator, then claim that it's
31317 // live into the sink and copy blocks.
31318 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
31319 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
31320 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
31321 FalseMBB->addLiveIn(X86::EFLAGS);
31322 SinkMBB->addLiveIn(X86::EFLAGS);
31323 }
31324
31325 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
31326 auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
31327 auto DbgIt = MachineBasicBlock::iterator(MI);
31328 while (DbgIt != DbgEnd) {
31329 auto Next = std::next(DbgIt);
31330 if (DbgIt->isDebugInstr())
31331 SinkMBB->push_back(DbgIt->removeFromParent());
31332 DbgIt = Next;
31333 }
31334
31335 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
31336 SinkMBB->splice(SinkMBB->end(), ThisMBB,
31337 std::next(MachineBasicBlock::iterator(LastCMOV)),
31338 ThisMBB->end());
31339 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
31340
31341 // Fallthrough block for ThisMBB.
31342 ThisMBB->addSuccessor(FalseMBB);
31343 // The true block target of the first (or only) branch is always a SinkMBB.
31344 ThisMBB->addSuccessor(SinkMBB);
31345 // Fallthrough block for FalseMBB.
31346 FalseMBB->addSuccessor(SinkMBB);
31347
31348 // Create the conditional branch instruction.
31349 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
31350
31351 // SinkMBB:
31352 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
31353 // ...
31354 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
31355 MachineBasicBlock::iterator MIItEnd =
31356 std::next(MachineBasicBlock::iterator(LastCMOV));
31357 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
31358
31359 // Now remove the CMOV(s).
31360 ThisMBB->erase(MIItBegin, MIItEnd);
31361
31362 return SinkMBB;
31363}
31364
31365MachineBasicBlock *
31366X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
31367 MachineBasicBlock *BB) const {
31368 MachineFunction *MF = BB->getParent();
31369 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31370 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
31371 DebugLoc DL = MI.getDebugLoc();
31372 const BasicBlock *LLVM_BB = BB->getBasicBlock();
31373
31374 const unsigned ProbeSize = getStackProbeSize(*MF);
31375
31376 MachineRegisterInfo &MRI = MF->getRegInfo();
31377 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
31378 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
31379 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
31380
31381 MachineFunction::iterator MBBIter = ++BB->getIterator();
31382 MF->insert(MBBIter, testMBB);
31383 MF->insert(MBBIter, blockMBB);
31384 MF->insert(MBBIter, tailMBB);
31385
31386 unsigned sizeVReg = MI.getOperand(1).getReg();
31387
31388 const TargetRegisterClass *SizeRegClass = MRI.getRegClass(sizeVReg);
31389
31390 unsigned tmpSizeVReg = MRI.createVirtualRegister(SizeRegClass);
31391 unsigned tmpSizeVReg2 = MRI.createVirtualRegister(SizeRegClass);
31392
31393 unsigned physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
31394
31395 // test rsp size
31396 BuildMI(testMBB, DL, TII->get(X86::PHI), tmpSizeVReg)
31397 .addReg(sizeVReg)
31398 .addMBB(BB)
31399 .addReg(tmpSizeVReg2)
31400 .addMBB(blockMBB);
31401
31402 BuildMI(testMBB, DL,
31403 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64ri32 : X86::CMP32ri))
31404 .addReg(tmpSizeVReg)
31405 .addImm(ProbeSize);
31406
31407 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
31408 .addMBB(tailMBB)
31409 .addImm(X86::COND_L);
31410 testMBB->addSuccessor(blockMBB);
31411 testMBB->addSuccessor(tailMBB);
31412
31413 // allocate a block and touch it
31414
31415 BuildMI(blockMBB, DL,
31416 TII->get(TFI.Uses64BitFramePtr ? X86::SUB64ri32 : X86::SUB32ri),
31417 tmpSizeVReg2)
31418 .addReg(tmpSizeVReg)
31419 .addImm(ProbeSize);
31420
31421 BuildMI(blockMBB, DL,
31422 TII->get(TFI.Uses64BitFramePtr ? X86::SUB64ri32 : X86::SUB32ri),
31423 physSPReg)
31424 .addReg(physSPReg)
31425 .addImm(ProbeSize);
31426
31427 const unsigned MovMIOpc =
31428 TFI.Uses64BitFramePtr ? X86::MOV64mi32 : X86::MOV32mi;
31429 addRegOffset(BuildMI(blockMBB, DL, TII->get(MovMIOpc)), physSPReg, false, 0)
31430 .addImm(0);
31431
31432 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
31433 blockMBB->addSuccessor(testMBB);
31434
31435 // allocate the tail and continue
31436 BuildMI(tailMBB, DL,
31437 TII->get(TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr),
31438 physSPReg)
31439 .addReg(physSPReg)
31440 .addReg(tmpSizeVReg);
31441 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
31442 .addReg(physSPReg);
31443
31444 tailMBB->splice(tailMBB->end(), BB,
31445 std::next(MachineBasicBlock::iterator(MI)), BB->end());
31446 tailMBB->transferSuccessorsAndUpdatePHIs(BB);
31447 BB->addSuccessor(testMBB);
31448
31449 // Delete the original pseudo instruction.
31450 MI.eraseFromParent();
31451
31452 // And we're done.
31453 return tailMBB;
31454}
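
// Rough shape of the code emitted above (illustrative pseudo-C, not part of
// the file):
//   while (size >= ProbeSize) {      // testMBB
//     size -= ProbeSize;             // blockMBB
//     rsp  -= ProbeSize;
//     *(volatile int *)rsp = 0;      // touch the newly exposed page
//   }
//   rsp -= size;                     // tailMBB: allocate the remainder
// Each iteration touches the freshly allocated block, so the guard page is hit
// before the stack pointer moves past it.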
31455
31456MachineBasicBlock *
31457X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
31458 MachineBasicBlock *BB) const {
31459 MachineFunction *MF = BB->getParent();
31460 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31461 DebugLoc DL = MI.getDebugLoc();
31462 const BasicBlock *LLVM_BB = BB->getBasicBlock();
31463
31464  assert(MF->shouldSplitStack());
31465
31466 const bool Is64Bit = Subtarget.is64Bit();
31467 const bool IsLP64 = Subtarget.isTarget64BitLP64();
31468
31469 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
31470 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
31471
31472 // BB:
31473 // ... [Till the alloca]
31474   // If the stacklet is not large enough, jump to mallocMBB
31475 //
31476 // bumpMBB:
31477 // Allocate by subtracting from RSP
31478 // Jump to continueMBB
31479 //
31480 // mallocMBB:
31481 // Allocate by call to runtime
31482 //
31483 // continueMBB:
31484 // ...
31485 // [rest of original BB]
31486 //
31487
31488 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
31489 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
31490 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
31491
31492 MachineRegisterInfo &MRI = MF->getRegInfo();
31493 const TargetRegisterClass *AddrRegClass =
31494 getRegClassFor(getPointerTy(MF->getDataLayout()));
31495
31496 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
31497 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
31498 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
31499 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
31500 sizeVReg = MI.getOperand(1).getReg(),
31501 physSPReg =
31502 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
31503
31504 MachineFunction::iterator MBBIter = ++BB->getIterator();
31505
31506 MF->insert(MBBIter, bumpMBB);
31507 MF->insert(MBBIter, mallocMBB);
31508 MF->insert(MBBIter, continueMBB);
31509
31510 continueMBB->splice(continueMBB->begin(), BB,
31511 std::next(MachineBasicBlock::iterator(MI)), BB->end());
31512 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
31513
31514 // Add code to the main basic block to check if the stack limit has been hit,
31515 // and if so, jump to mallocMBB otherwise to bumpMBB.
31516 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
31517 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
31518 .addReg(tmpSPVReg).addReg(sizeVReg);
31519 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
31520 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
31521 .addReg(SPLimitVReg);
31522 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
31523
31524 // bumpMBB simply decreases the stack pointer, since we know the current
31525 // stacklet has enough space.
31526 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
31527 .addReg(SPLimitVReg);
31528 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
31529 .addReg(SPLimitVReg);
31530 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
31531
31532 // Calls into a routine in libgcc to allocate more space from the heap.
31533 const uint32_t *RegMask =
31534 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
31535 if (IsLP64) {
31536 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
31537 .addReg(sizeVReg);
31538 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
31539 .addExternalSymbol("__morestack_allocate_stack_space")
31540 .addRegMask(RegMask)
31541 .addReg(X86::RDI, RegState::Implicit)
31542 .addReg(X86::RAX, RegState::ImplicitDefine);
31543 } else if (Is64Bit) {
31544 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
31545 .addReg(sizeVReg);
31546 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
31547 .addExternalSymbol("__morestack_allocate_stack_space")
31548 .addRegMask(RegMask)
31549 .addReg(X86::EDI, RegState::Implicit)
31550 .addReg(X86::EAX, RegState::ImplicitDefine);
31551 } else {
31552 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
31553 .addImm(12);
31554 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
31555 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
31556 .addExternalSymbol("__morestack_allocate_stack_space")
31557 .addRegMask(RegMask)
31558 .addReg(X86::EAX, RegState::ImplicitDefine);
31559 }
31560
31561 if (!Is64Bit)
31562 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
31563 .addImm(16);
31564
31565 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
31566 .addReg(IsLP64 ? X86::RAX : X86::EAX);
31567 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
31568
31569 // Set up the CFG correctly.
31570 BB->addSuccessor(bumpMBB);
31571 BB->addSuccessor(mallocMBB);
31572 mallocMBB->addSuccessor(continueMBB);
31573 bumpMBB->addSuccessor(continueMBB);
31574
31575 // Take care of the PHI nodes.
31576 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
31577 MI.getOperand(0).getReg())
31578 .addReg(mallocPtrVReg)
31579 .addMBB(mallocMBB)
31580 .addReg(bumpSPPtrVReg)
31581 .addMBB(bumpMBB);
31582
31583 // Delete the original pseudo instruction.
31584 MI.eraseFromParent();
31585
31586 // And we're done.
31587 return continueMBB;
31588}
31589
31590MachineBasicBlock *
31591X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
31592 MachineBasicBlock *BB) const {
31593 MachineFunction *MF = BB->getParent();
31594 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
31595 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
31596 DebugLoc DL = MI.getDebugLoc();
31597
31598 assert(!isAsynchronousEHPersonality(
31599            classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
31600        "SEH does not use catchret!");
31601
31602 // Only 32-bit EH needs to worry about manually restoring stack pointers.
31603 if (!Subtarget.is32Bit())
31604 return BB;
31605
31606 // C++ EH creates a new target block to hold the restore code, and wires up
31607 // the new block to the return destination with a normal JMP_4.
31608 MachineBasicBlock *RestoreMBB =
31609 MF->CreateMachineBasicBlock(BB->getBasicBlock());
31610 assert(BB->succ_size() == 1);
31611 MF->insert(std::next(BB->getIterator()), RestoreMBB);
31612 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
31613 BB->addSuccessor(RestoreMBB);
31614 MI.getOperand(0).setMBB(RestoreMBB);
31615
31616 // Marking this as an EH pad but not a funclet entry block causes PEI to
31617 // restore stack pointers in the block.
31618 RestoreMBB->setIsEHPad(true);
31619
31620 auto RestoreMBBI = RestoreMBB->begin();
31621 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
31622 return BB;
31623}
31624
31625MachineBasicBlock *
31626X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
31627 MachineBasicBlock *BB) const {
31628 // So, here we replace TLSADDR with the sequence:
31629 // adjust_stackdown -> TLSADDR -> adjust_stackup.
31630 // We need this because TLSADDR is lowered into calls
31631 // inside MC; therefore, without the two markers, shrink-wrapping
31632 // may push the prologue/epilogue past them.
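// Schematically, the rewritten sequence is roughly:
//   ADJCALLSTACKDOWN 0, 0, 0      // CALLSEQ_START marker
//   TLS_addr32/64 ...             // expands to a TLS helper call inside MC
//   ADJCALLSTACKUP 0, 0           // CALLSEQ_END marker
// so shrink-wrapping keeps the prologue/epilogue outside the bracketed call.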
31633 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
31634 DebugLoc DL = MI.getDebugLoc();
31635 MachineFunction &MF = *BB->getParent();
31636
31637 // Emit CALLSEQ_START right before the instruction.
31638 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
31639 MachineInstrBuilder CallseqStart =
31640 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
31641 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
31642
31643 // Emit CALLSEQ_END right after the instruction.
31644 // We don't call erase from parent because we want to keep the
31645 // original instruction around.
31646 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
31647 MachineInstrBuilder CallseqEnd =
31648 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
31649 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
31650
31651 return BB;
31652}
31653
31654MachineBasicBlock *
31655X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
31656 MachineBasicBlock *BB) const {
31657 // This is pretty easy. We're taking the value that we received from
31658 // our load from the relocation, sticking it in either RDI (x86-64)
31659 // or EAX and doing an indirect call. The return value will then
31660 // be in the normal return register.
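// For example, on x86-64 Darwin the block below emits roughly:
//   movq  _var@TLVP(%rip), %rdi   // load the TLV descriptor address
//   callq *(%rdi)                 // indirect call through the descriptor
// with the address of the thread-local variable returned in %rax.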
31661 MachineFunction *F = BB->getParent();
31662 const X86InstrInfo *TII = Subtarget.getInstrInfo();
31663 DebugLoc DL = MI.getDebugLoc();
31664
31665 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
31666 assert(MI.getOperand(3).isGlobal() && "This should be a global");
31667
31668 // Get a register mask for the lowered call.
31669 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
31670 // proper register mask.
31671 const uint32_t *RegMask =
31672 Subtarget.is64Bit() ?
31673 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
31674 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
31675 if (Subtarget.is64Bit()) {
31676 MachineInstrBuilder MIB =
31677 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
31678 .addReg(X86::RIP)
31679 .addImm(0)
31680 .addReg(0)
31681 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
31682 MI.getOperand(3).getTargetFlags())
31683 .addReg(0);
31684 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
31685 addDirectMem(MIB, X86::RDI);
31686 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
31687 } else if (!isPositionIndependent()) {
31688 MachineInstrBuilder MIB =
31689 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
31690 .addReg(0)
31691 .addImm(0)
31692 .addReg(0)
31693 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
31694 MI.getOperand(3).getTargetFlags())
31695 .addReg(0);
31696 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
31697 addDirectMem(MIB, X86::EAX);
31698 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
31699 } else {
31700 MachineInstrBuilder MIB =
31701 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
31702 .addReg(TII->getGlobalBaseReg(F))
31703 .addImm(0)
31704 .addReg(0)
31705 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
31706 MI.getOperand(3).getTargetFlags())
31707 .addReg(0);
31708 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
31709 addDirectMem(MIB, X86::EAX);
31710 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
31711 }
31712
31713 MI.eraseFromParent(); // The pseudo instruction is gone now.
31714 return BB;
31715}
31716
31717static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
31718 switch (RPOpc) {
31719 case X86::RETPOLINE_CALL32:
31720 return X86::CALLpcrel32;
31721 case X86::RETPOLINE_CALL64:
31722 return X86::CALL64pcrel32;
31723 case X86::RETPOLINE_TCRETURN32:
31724 return X86::TCRETURNdi;
31725 case X86::RETPOLINE_TCRETURN64:
31726 return X86::TCRETURNdi64;
31727 }
31728 llvm_unreachable("not retpoline opcode")::llvm::llvm_unreachable_internal("not retpoline opcode", "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 31728)
;
31729}
31730
31731static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
31732 unsigned Reg) {
31733 if (Subtarget.useRetpolineExternalThunk()) {
31734 // When using an external thunk for retpolines, we pick names that match the
31735 // names GCC happens to use as well. This helps simplify the implementation
31736 // of the thunks for kernels where they have no easy ability to create
31737 // aliases and are doing non-trivial configuration of the thunk's body. For
31738 // example, the Linux kernel will do boot-time hot patching of the thunk
31739 // bodies and cannot easily export aliases of these to loaded modules.
31740 //
31741 // Note that at any point in the future, we may need to change the semantics
31742 // of how we implement retpolines and at that time will likely change the
31743 // name of the called thunk. Essentially, there is no hard guarantee that
31744 // LLVM will generate calls to specific thunks, we merely make a best-effort
31745 // attempt to help out kernels and other systems where duplicating the
31746 // thunks is costly.
31747 switch (Reg) {
31748 case X86::EAX:
31749 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
31750 return "__x86_indirect_thunk_eax";
31751 case X86::ECX:
31752 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
31753 return "__x86_indirect_thunk_ecx";
31754 case X86::EDX:
31755 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
31756 return "__x86_indirect_thunk_edx";
31757 case X86::EDI:
31758 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
31759 return "__x86_indirect_thunk_edi";
31760 case X86::R11:
31761 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
31762 return "__x86_indirect_thunk_r11";
31763 }
31764 llvm_unreachable("unexpected reg for retpoline")::llvm::llvm_unreachable_internal("unexpected reg for retpoline"
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 31764)
;
31765 }
31766
31767 // When targeting an internal COMDAT thunk use an LLVM-specific name.
31768 switch (Reg) {
31769 case X86::EAX:
31770 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
31771 return "__llvm_retpoline_eax";
31772 case X86::ECX:
31773 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
31774 return "__llvm_retpoline_ecx";
31775 case X86::EDX:
31776 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
31777 return "__llvm_retpoline_edx";
31778 case X86::EDI:
31779 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
31780 return "__llvm_retpoline_edi";
31781 case X86::R11:
31782 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
31783 return "__llvm_retpoline_r11";
31784 }
31785 llvm_unreachable("unexpected reg for retpoline")::llvm::llvm_unreachable_internal("unexpected reg for retpoline"
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 31785)
;
31786}
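// For reference, the thunks named above trap mis-speculation roughly like
// this (an illustrative sketch only; the actual bodies are emitted
// separately, e.g. by LLVM's thunk inserter or by GCC/kernel code):
//
//   __llvm_retpoline_r11:
//     call  .Ltarget
//   .Lspec_trap:
//     pause
//     lfence
//     jmp   .Lspec_trap           // capture speculative execution here
//   .Ltarget:
//     mov   %r11, (%rsp)          // replace the return address with the callee
//     ret                         // architecturally: jump to the callee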
31787
31788MachineBasicBlock *
31789X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
31790 MachineBasicBlock *BB) const {
31791 // Copy the virtual register into the R11 physical register and
31792 // call the retpoline thunk.
31793 DebugLoc DL = MI.getDebugLoc();
31794 const X86InstrInfo *TII = Subtarget.getInstrInfo();
31795 Register CalleeVReg = MI.getOperand(0).getReg();
31796 unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
31797
31798 // Find an available scratch register to hold the callee. On 64-bit, we can
31799 // just use R11, but we scan for uses anyway to ensure we don't generate
31800 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
31801 // already a register use operand to the call to hold the callee. If none
31802 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
31803 // register and ESI is the base pointer to realigned stack frames with VLAs.
31804 SmallVector<unsigned, 3> AvailableRegs;
31805 if (Subtarget.is64Bit())
31806 AvailableRegs.push_back(X86::R11);
31807 else
31808 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
31809
31810 // Zero out any registers that are already used.
31811 for (const auto &MO : MI.operands()) {
31812 if (MO.isReg() && MO.isUse())
31813 for (unsigned &Reg : AvailableRegs)
31814 if (Reg == MO.getReg())
31815 Reg = 0;
31816 }
31817
31818 // Choose the first remaining non-zero available register.
31819 unsigned AvailableReg = 0;
31820 for (unsigned MaybeReg : AvailableRegs) {
31821 if (MaybeReg) {
31822 AvailableReg = MaybeReg;
31823 break;
31824 }
31825 }
31826 if (!AvailableReg)
31827 report_fatal_error("calling convention incompatible with retpoline, no "
31828 "available registers");
31829
31830 const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
31831
31832 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
31833 .addReg(CalleeVReg);
31834 MI.getOperand(0).ChangeToES(Symbol);
31835 MI.setDesc(TII->get(Opc));
31836 MachineInstrBuilder(*BB->getParent(), &MI)
31837 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
31838 return BB;
31839}
31840
31841/// SetJmp implies future control flow change upon calling the corresponding
31842/// LongJmp.
31843/// Instead of using the 'return' instruction, the long jump fixes the stack and
31844/// performs an indirect branch. To do so it uses the registers that were stored
31845/// in the jump buffer (when calling SetJmp).
31846/// In case the shadow stack is enabled we need to fix it as well, because some
31847/// return addresses will be skipped.
31848/// The function will save the SSP for future fixing in the function
31849/// emitLongJmpShadowStackFix.
31850/// \sa emitLongJmpShadowStackFix
31851/// \param [in] MI The temporary Machine Instruction for the builtin.
31852/// \param [in] MBB The Machine Basic Block that will be modified.
31853void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
31854 MachineBasicBlock *MBB) const {
31855 DebugLoc DL = MI.getDebugLoc();
31856 MachineFunction *MF = MBB->getParent();
31857 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31858 MachineRegisterInfo &MRI = MF->getRegInfo();
31859 MachineInstrBuilder MIB;
31860
31861 // Memory Reference.
31862 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
31863 MI.memoperands_end());
31864
31865 // Initialize a register with zero.
31866 MVT PVT = getPointerTy(MF->getDataLayout());
31867 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
31868 Register ZReg = MRI.createVirtualRegister(PtrRC);
31869 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
31870 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
31871 .addDef(ZReg)
31872 .addReg(ZReg, RegState::Undef)
31873 .addReg(ZReg, RegState::Undef);
31874
31875 // Read the current SSP register value into the zeroed register.
31876 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
31877 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
31878 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
31879
31880 // Write the SSP register value to slot 3 of the input memory buffer.
31881 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
31882 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
31883 const int64_t SSPOffset = 3 * PVT.getStoreSize();
31884 const unsigned MemOpndSlot = 1;
31885 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
31886 if (i == X86::AddrDisp)
31887 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
31888 else
31889 MIB.add(MI.getOperand(MemOpndSlot + i));
31890 }
31891 MIB.addReg(SSPCopyReg);
31892 MIB.setMemRefs(MMOs);
31893}
31894
31895MachineBasicBlock *
31896X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
31897 MachineBasicBlock *MBB) const {
31898 DebugLoc DL = MI.getDebugLoc();
31899 MachineFunction *MF = MBB->getParent();
31900 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31901 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
31902 MachineRegisterInfo &MRI = MF->getRegInfo();
31903
31904 const BasicBlock *BB = MBB->getBasicBlock();
31905 MachineFunction::iterator I = ++MBB->getIterator();
31906
31907 // Memory Reference
31908 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
31909 MI.memoperands_end());
31910
31911 unsigned DstReg;
31912 unsigned MemOpndSlot = 0;
31913
31914 unsigned CurOp = 0;
31915
31916 DstReg = MI.getOperand(CurOp++).getReg();
31917 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
31918 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
31919 (void)TRI;
31920 Register mainDstReg = MRI.createVirtualRegister(RC);
31921 Register restoreDstReg = MRI.createVirtualRegister(RC);
31922
31923 MemOpndSlot = CurOp;
31924
31925 MVT PVT = getPointerTy(MF->getDataLayout());
31926 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
31927        "Invalid Pointer Size!");
31928
31929 // For v = setjmp(buf), we generate
31930 //
31931 // thisMBB:
31932 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
31933 // SjLjSetup restoreMBB
31934 //
31935 // mainMBB:
31936 // v_main = 0
31937 //
31938 // sinkMBB:
31939 // v = phi(main, restore)
31940 //
31941 // restoreMBB:
31942 // if the base pointer is being used, load it from the frame
31943 // v_restore = 1
31944
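// Illustrative layout of the jump buffer assumed by these lowerings, one
// pointer-sized slot per entry (offsets are multiples of PVT.getStoreSize()):
//   buf[0]  frame pointer
//   buf[1]  resume address (restoreMBB)  <- LabelOffset below
//   buf[2]  stack pointer                <- SPOffset in emitEHSjLjLongJmp
//   buf[3]  shadow stack pointer         <- SSPOffset in emitSetJmpShadowStackFix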
31945 MachineBasicBlock *thisMBB = MBB;
31946 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
31947 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
31948 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
31949 MF->insert(I, mainMBB);
31950 MF->insert(I, sinkMBB);
31951 MF->push_back(restoreMBB);
31952 restoreMBB->setHasAddressTaken();
31953
31954 MachineInstrBuilder MIB;
31955
31956 // Transfer the remainder of BB and its successor edges to sinkMBB.
31957 sinkMBB->splice(sinkMBB->begin(), MBB,
31958 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
31959 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
31960
31961 // thisMBB:
31962 unsigned PtrStoreOpc = 0;
31963 unsigned LabelReg = 0;
31964 const int64_t LabelOffset = 1 * PVT.getStoreSize();
31965 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
31966 !isPositionIndependent();
31967
31968 // Prepare IP either in reg or imm.
31969 if (!UseImmLabel) {
31970 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
31971 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
31972 LabelReg = MRI.createVirtualRegister(PtrRC);
31973 if (Subtarget.is64Bit()) {
31974 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
31975 .addReg(X86::RIP)
31976 .addImm(0)
31977 .addReg(0)
31978 .addMBB(restoreMBB)
31979 .addReg(0);
31980 } else {
31981 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
31982 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
31983 .addReg(XII->getGlobalBaseReg(MF))
31984 .addImm(0)
31985 .addReg(0)
31986 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
31987 .addReg(0);
31988 }
31989 } else
31990 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
31991 // Store IP
31992 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
31993 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
31994 if (i == X86::AddrDisp)
31995 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
31996 else
31997 MIB.add(MI.getOperand(MemOpndSlot + i));
31998 }
31999 if (!UseImmLabel)
32000 MIB.addReg(LabelReg);
32001 else
32002 MIB.addMBB(restoreMBB);
32003 MIB.setMemRefs(MMOs);
32004
32005 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
32006 emitSetJmpShadowStackFix(MI, thisMBB);
32007 }
32008
32009 // Setup
32010 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
32011 .addMBB(restoreMBB);
32012
32013 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
32014 MIB.addRegMask(RegInfo->getNoPreservedMask());
32015 thisMBB->addSuccessor(mainMBB);
32016 thisMBB->addSuccessor(restoreMBB);
32017
32018 // mainMBB:
32019 // EAX = 0
32020 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
32021 mainMBB->addSuccessor(sinkMBB);
32022
32023 // sinkMBB:
32024 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
32025 TII->get(X86::PHI), DstReg)
32026 .addReg(mainDstReg).addMBB(mainMBB)
32027 .addReg(restoreDstReg).addMBB(restoreMBB);
32028
32029 // restoreMBB:
32030 if (RegInfo->hasBasePointer(*MF)) {
32031 const bool Uses64BitFramePtr =
32032 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
32033 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
32034 X86FI->setRestoreBasePointer(MF);
32035 Register FramePtr = RegInfo->getFrameRegister(*MF);
32036 Register BasePtr = RegInfo->getBaseRegister();
32037 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
32038 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
32039 FramePtr, true, X86FI->getRestoreBasePointerOffset())
32040 .setMIFlag(MachineInstr::FrameSetup);
32041 }
32042 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
32043 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
32044 restoreMBB->addSuccessor(sinkMBB);
32045
32046 MI.eraseFromParent();
32047 return sinkMBB;
32048}
32049
32050/// Fix the shadow stack using the previously saved SSP pointer.
32051/// \sa emitSetJmpShadowStackFix
32052/// \param [in] MI The temporary Machine Instruction for the builtin.
32053/// \param [in] MBB The Machine Basic Block that will be modified.
32054/// \return The sink MBB that will perform the future indirect branch.
32055MachineBasicBlock *
32056X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
32057 MachineBasicBlock *MBB) const {
32058 DebugLoc DL = MI.getDebugLoc();
32059 MachineFunction *MF = MBB->getParent();
32060 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32061 MachineRegisterInfo &MRI = MF->getRegInfo();
32062
32063 // Memory Reference
32064 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
32065 MI.memoperands_end());
32066
32067 MVT PVT = getPointerTy(MF->getDataLayout());
32068 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
32069
32070 // checkSspMBB:
32071 // xor vreg1, vreg1
32072 // rdssp vreg1
32073 // test vreg1, vreg1
32074 // je sinkMBB # Jump if Shadow Stack is not supported
32075 // fallMBB:
32076 // mov buf+24/12(%rip), vreg2
32077 // sub vreg1, vreg2
32078 // jbe sinkMBB # No need to fix the Shadow Stack
32079 // fixShadowMBB:
32080 // shr 3/2, vreg2
32081 // incssp vreg2 # fix the SSP according to the lower 8 bits
32082 // shr 8, vreg2
32083 // je sinkMBB
32084 // fixShadowLoopPrepareMBB:
32085 // shl vreg2
32086 // mov 128, vreg3
32087 // fixShadowLoopMBB:
32088 // incssp vreg3
32089 // dec vreg2
32090 // jne fixShadowLoopMBB # Iterate until you finish fixing
32091 // # the Shadow Stack
32092 // sinkMBB:
32093
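// Worked example (x86-64, illustrative): if the saved SSP is 0x1840 bytes
// above the current SSP, vreg2 = 0x1840 >> 3 = 0x308 entries. The first
// incssp consumes the low 8 bits (8 entries), 0x308 >> 8 leaves 3 blocks of
// 256 entries, the shl turns that into a loop count of 6, and six
// "incssp 128" iterations advance the remaining 0x300 entries.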
32094 MachineFunction::iterator I = ++MBB->getIterator();
32095 const BasicBlock *BB = MBB->getBasicBlock();
32096
32097 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
32098 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
32099 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
32100 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
32101 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
32102 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
32103 MF->insert(I, checkSspMBB);
32104 MF->insert(I, fallMBB);
32105 MF->insert(I, fixShadowMBB);
32106 MF->insert(I, fixShadowLoopPrepareMBB);
32107 MF->insert(I, fixShadowLoopMBB);
32108 MF->insert(I, sinkMBB);
32109
32110 // Transfer the remainder of BB and its successor edges to sinkMBB.
32111 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
32112 MBB->end());
32113 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
32114
32115 MBB->addSuccessor(checkSspMBB);
32116
32117 // Initialize a register with zero.
32118 Register ZReg = MRI.createVirtualRegister(PtrRC);
32119 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
32120 BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
32121 .addDef(ZReg)
32122 .addReg(ZReg, RegState::Undef)
32123 .addReg(ZReg, RegState::Undef);
32124
32125 // Read the current SSP register value into the zeroed register.
32126 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
32127 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
32128 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
32129
32130 // Check whether the SSP register value is zero and, if so, jump directly
32131 // to the sink.
32132 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
32133 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
32134 .addReg(SSPCopyReg)
32135 .addReg(SSPCopyReg);
32136 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
32137 checkSspMBB->addSuccessor(sinkMBB);
32138 checkSspMBB->addSuccessor(fallMBB);
32139
32140 // Reload the previously saved SSP register value.
32141 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
32142 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
32143 const int64_t SPPOffset = 3 * PVT.getStoreSize();
32144 MachineInstrBuilder MIB =
32145 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
32146 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
32147 const MachineOperand &MO = MI.getOperand(i);
32148 if (i == X86::AddrDisp)
32149 MIB.addDisp(MO, SPPOffset);
32150 else if (MO.isReg()) // Don't add the whole operand, we don't want to
32151 // preserve kill flags.
32152 MIB.addReg(MO.getReg());
32153 else
32154 MIB.add(MO);
32155 }
32156 MIB.setMemRefs(MMOs);
32157
32158 // Subtract the current SSP from the previous SSP.
32159 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
32160 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
32161 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
32162 .addReg(PrevSSPReg)
32163 .addReg(SSPCopyReg);
32164
32165 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
32166 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
32167 fallMBB->addSuccessor(sinkMBB);
32168 fallMBB->addSuccessor(fixShadowMBB);
32169
32170 // Shift right by 2 (32-bit) or 3 (64-bit) because incssp multiplies the argument by 4 or 8.
32171 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
32172 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
32173 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
32174 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
32175 .addReg(SspSubReg)
32176 .addImm(Offset);
32177
32178 // Increase the SSP; incssp only looks at the lower 8 bits of the delta.
32179 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
32180 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
32181
32182 // Reset the lower 8 bits.
32183 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
32184 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
32185 .addReg(SspFirstShrReg)
32186 .addImm(8);
32187
32188 // Jump if the result of the shift is zero.
32189 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
32190 fixShadowMBB->addSuccessor(sinkMBB);
32191 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
32192
32193 // Do a single shift left.
32194 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
32195 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
32196 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
32197 .addReg(SspSecondShrReg);
32198
32199 // Save the value 128 to a register (will be used next with incssp).
32200 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
32201 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
32202 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
32203 .addImm(128);
32204 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
32205
32206 // Since incssp only looks at the lower 8 bits, we might need to do several
32207 // iterations of incssp until we finish fixing the shadow stack.
32208 Register DecReg = MRI.createVirtualRegister(PtrRC);
32209 Register CounterReg = MRI.createVirtualRegister(PtrRC);
32210 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
32211 .addReg(SspAfterShlReg)
32212 .addMBB(fixShadowLoopPrepareMBB)
32213 .addReg(DecReg)
32214 .addMBB(fixShadowLoopMBB);
32215
32216 // Every iteration we increase the SSP by 128.
32217 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
32218
32219 // Every iteration we decrement the counter by 1.
32220 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
32221 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
32222
32223 // Jump if the counter is not zero yet.
32224 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
32225 fixShadowLoopMBB->addSuccessor(sinkMBB);
32226 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
32227
32228 return sinkMBB;
32229}
32230
32231MachineBasicBlock *
32232X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
32233 MachineBasicBlock *MBB) const {
32234 DebugLoc DL = MI.getDebugLoc();
32235 MachineFunction *MF = MBB->getParent();
32236 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32237 MachineRegisterInfo &MRI = MF->getRegInfo();
32238
32239 // Memory Reference
32240 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
32241 MI.memoperands_end());
32242
32243 MVT PVT = getPointerTy(MF->getDataLayout());
32244 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
32245        "Invalid Pointer Size!");
32246
32247 const TargetRegisterClass *RC =
32248 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
32249 Register Tmp = MRI.createVirtualRegister(RC);
32250 // Since FP is only updated here but NOT referenced, it's treated as a GPR.
32251 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
32252 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
32253 Register SP = RegInfo->getStackRegister();
32254
32255 MachineInstrBuilder MIB;
32256
32257 const int64_t LabelOffset = 1 * PVT.getStoreSize();
32258 const int64_t SPOffset = 2 * PVT.getStoreSize();
32259
32260 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
32261 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
32262
32263 MachineBasicBlock *thisMBB = MBB;
32264
32265 // When CET and the shadow stack are enabled, we need to fix the shadow stack.
32266 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
32267 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
32268 }
32269
32270 // Reload FP
32271 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
32272 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
32273 const MachineOperand &MO = MI.getOperand(i);
32274 if (MO.isReg()) // Don't add the whole operand, we don't want to
32275 // preserve kill flags.
32276 MIB.addReg(MO.getReg());
32277 else
32278 MIB.add(MO);
32279 }
32280 MIB.setMemRefs(MMOs);
32281
32282 // Reload IP
32283 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
32284 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
32285 const MachineOperand &MO = MI.getOperand(i);
32286 if (i == X86::AddrDisp)
32287 MIB.addDisp(MO, LabelOffset);
32288 else if (MO.isReg()) // Don't add the whole operand, we don't want to
32289 // preserve kill flags.
32290 MIB.addReg(MO.getReg());
32291 else
32292 MIB.add(MO);
32293 }
32294 MIB.setMemRefs(MMOs);
32295
32296 // Reload SP
32297 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
32298 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
32299 if (i == X86::AddrDisp)
32300 MIB.addDisp(MI.getOperand(i), SPOffset);
32301 else
32302 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
32303 // the last instruction of the expansion.
32304 }
32305 MIB.setMemRefs(MMOs);
32306
32307 // Jump
32308 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
32309
32310 MI.eraseFromParent();
32311 return thisMBB;
32312}
32313
32314void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
32315 MachineBasicBlock *MBB,
32316 MachineBasicBlock *DispatchBB,
32317 int FI) const {
32318 DebugLoc DL = MI.getDebugLoc();
32319 MachineFunction *MF = MBB->getParent();
32320 MachineRegisterInfo *MRI = &MF->getRegInfo();
32321 const X86InstrInfo *TII = Subtarget.getInstrInfo();
32322
32323 MVT PVT = getPointerTy(MF->getDataLayout());
32324 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
32325
32326 unsigned Op = 0;
32327 unsigned VR = 0;
32328
32329 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
32330 !isPositionIndependent();
32331
32332 if (UseImmLabel) {
32333 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
32334 } else {
32335 const TargetRegisterClass *TRC =
32336 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
32337 VR = MRI->createVirtualRegister(TRC);
32338 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
32339
32340 if (Subtarget.is64Bit())
32341 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
32342 .addReg(X86::RIP)
32343 .addImm(1)
32344 .addReg(0)
32345 .addMBB(DispatchBB)
32346 .addReg(0);
32347 else
32348 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
32349 .addReg(0) /* TII->getGlobalBaseReg(MF) */
32350 .addImm(1)
32351 .addReg(0)
32352 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
32353 .addReg(0);
32354 }
32355
32356 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
32357 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
32358 if (UseImmLabel)
32359 MIB.addMBB(DispatchBB);
32360 else
32361 MIB.addReg(VR);
32362}
32363
32364MachineBasicBlock *
32365X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
32366 MachineBasicBlock *BB) const {
32367 DebugLoc DL = MI.getDebugLoc();
32368 MachineFunction *MF = BB->getParent();
32369 MachineRegisterInfo *MRI = &MF->getRegInfo();
32370 const X86InstrInfo *TII = Subtarget.getInstrInfo();
32371 int FI = MF->getFrameInfo().getFunctionContextIndex();
32372
32373 // Get a mapping of the call site numbers to all of the landing pads they're
32374 // associated with.
32375 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
32376 unsigned MaxCSNum = 0;
32377 for (auto &MBB : *MF) {
32378 if (!MBB.isEHPad())
32379 continue;
32380
32381 MCSymbol *Sym = nullptr;
32382 for (const auto &MI : MBB) {
32383 if (MI.isDebugInstr())
32384 continue;
32385
32386 assert(MI.isEHLabel() && "expected EH_LABEL");
32387 Sym = MI.getOperand(0).getMCSymbol();
32388 break;
32389 }
32390
32391 if (!MF->hasCallSiteLandingPad(Sym))
32392 continue;
32393
32394 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
32395 CallSiteNumToLPad[CSI].push_back(&MBB);
32396 MaxCSNum = std::max(MaxCSNum, CSI);
32397 }
32398 }
32399
32400 // Get an ordered list of the machine basic blocks for the jump table.
32401 std::vector<MachineBasicBlock *> LPadList;
32402 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
32403 LPadList.reserve(CallSiteNumToLPad.size());
32404
32405 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
32406 for (auto &LP : CallSiteNumToLPad[CSI]) {
32407 LPadList.push_back(LP);
32408 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
32409 }
32410 }
32411
32412 assert(!LPadList.empty() &&
32413        "No landing pad destinations for the dispatch jump table!");
32414
32415 // Create the MBBs for the dispatch code.
32416
32417 // Shove the dispatch's address into the return slot in the function context.
32418 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
32419 DispatchBB->setIsEHPad(true);
32420
32421 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
32422 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
32423 DispatchBB->addSuccessor(TrapBB);
32424
32425 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
32426 DispatchBB->addSuccessor(DispContBB);
32427
32428 // Insert MBBs.
32429 MF->push_back(DispatchBB);
32430 MF->push_back(DispContBB);
32431 MF->push_back(TrapBB);
32432
32433 // Insert code into the entry block that creates and registers the function
32434 // context.
32435 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
32436
32437 // Create the jump table and associated information
32438 unsigned JTE = getJumpTableEncoding();
32439 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
32440 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
32441
32442 const X86RegisterInfo &RI = TII->getRegisterInfo();
32443 // Add a register mask with no preserved registers. This results in all
32444 // registers being marked as clobbered.
32445 if (RI.hasBasePointer(*MF)) {
32446 const bool FPIs64Bit =
32447 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
32448 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
32449 MFI->setRestoreBasePointer(MF);
32450
32451 Register FP = RI.getFrameRegister(*MF);
32452 Register BP = RI.getBaseRegister();
32453 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
32454 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
32455 MFI->getRestoreBasePointerOffset())
32456 .addRegMask(RI.getNoPreservedMask());
32457 } else {
32458 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
32459 .addRegMask(RI.getNoPreservedMask());
32460 }
32461
32462 // IReg is used as an index in a memory operand and therefore can't be SP
32463 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
32464 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
32465 Subtarget.is64Bit() ? 8 : 4);
32466 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
32467 .addReg(IReg)
32468 .addImm(LPadList.size());
32469 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
32470
32471 if (Subtarget.is64Bit()) {
32472 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
32473 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
32474
32475 // leaq .LJTI0_0(%rip), BReg
32476 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
32477 .addReg(X86::RIP)
32478 .addImm(1)
32479 .addReg(0)
32480 .addJumpTableIndex(MJTI)
32481 .addReg(0);
32482 // movzx IReg64, IReg
32483 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
32484 .addImm(0)
32485 .addReg(IReg)
32486 .addImm(X86::sub_32bit);
32487
32488 switch (JTE) {
32489 case MachineJumpTableInfo::EK_BlockAddress:
32490 // jmpq *(BReg,IReg64,8)
32491 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
32492 .addReg(BReg)
32493 .addImm(8)
32494 .addReg(IReg64)
32495 .addImm(0)
32496 .addReg(0);
32497 break;
32498 case MachineJumpTableInfo::EK_LabelDifference32: {
32499 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
32500 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
32501 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
32502
32503 // movl (BReg,IReg64,4), OReg
32504 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
32505 .addReg(BReg)
32506 .addImm(4)
32507 .addReg(IReg64)
32508 .addImm(0)
32509 .addReg(0);
32510 // movsx OReg64, OReg
32511 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
32512 // addq BReg, OReg64, TReg
32513 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
32514 .addReg(OReg64)
32515 .addReg(BReg);
32516 // jmpq *TReg
32517 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
32518 break;
32519 }
32520 default:
32521 llvm_unreachable("Unexpected jump table encoding")::llvm::llvm_unreachable_internal("Unexpected jump table encoding"
, "/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32521)
;
32522 }
32523 } else {
32524 // jmpl *.LJTI0_0(,IReg,4)
32525 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
32526 .addReg(0)
32527 .addImm(4)
32528 .addReg(IReg)
32529 .addJumpTableIndex(MJTI)
32530 .addReg(0);
32531 }
32532
32533 // Add the jump table entries as successors to the MBB.
32534 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
32535 for (auto &LP : LPadList)
32536 if (SeenMBBs.insert(LP).second)
32537 DispContBB->addSuccessor(LP);
32538
32539 // N.B. the order the invoke BBs are processed in doesn't matter here.
32540 SmallVector<MachineBasicBlock *, 64> MBBLPads;
32541 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
32542 for (MachineBasicBlock *MBB : InvokeBBs) {
32543 // Remove the landing pad successor from the invoke block and replace it
32544 // with the new dispatch block.
32545 // Keep a copy of Successors since it's modified inside the loop.
32546 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
32547 MBB->succ_rend());
32548 // FIXME: Avoid quadratic complexity.
32549 for (auto MBBS : Successors) {
32550 if (MBBS->isEHPad()) {
32551 MBB->removeSuccessor(MBBS);
32552 MBBLPads.push_back(MBBS);
32553 }
32554 }
32555
32556 MBB->addSuccessor(DispatchBB);
32557
32558 // Find the invoke call and mark all of the callee-saved registers as
32559 // 'implicit defined' so that they're spilled. This prevents instructions
32560 // from being moved to before the EH block, where they would never be
32561 // executed.
32562 for (auto &II : reverse(*MBB)) {
32563 if (!II.isCall())
32564 continue;
32565
32566 DenseMap<unsigned, bool> DefRegs;
32567 for (auto &MOp : II.operands())
32568 if (MOp.isReg())
32569 DefRegs[MOp.getReg()] = true;
32570
32571 MachineInstrBuilder MIB(*MF, &II);
32572 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
32573 unsigned Reg = SavedRegs[RegIdx];
32574 if (!DefRegs[Reg])
32575 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
32576 }
32577
32578 break;
32579 }
32580 }
32581
32582 // Mark all former landing pads as non-landing pads. The dispatch is the only
32583 // landing pad now.
32584 for (auto &LP : MBBLPads)
32585 LP->setIsEHPad(false);
32586
32587 // The instruction is gone now.
32588 MI.eraseFromParent();
32589 return BB;
32590}
32591
32592MachineBasicBlock *
32593X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
32594 MachineBasicBlock *BB) const {
32595 MachineFunction *MF = BB->getParent();
32596 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32597 DebugLoc DL = MI.getDebugLoc();
32598
32599 switch (MI.getOpcode()) {
32600 default: llvm_unreachable("Unexpected instr type to insert");
32601 case X86::TLS_addr32:
32602 case X86::TLS_addr64:
32603 case X86::TLS_base_addr32:
32604 case X86::TLS_base_addr64:
32605 return EmitLoweredTLSAddr(MI, BB);
32606 case X86::RETPOLINE_CALL32:
32607 case X86::RETPOLINE_CALL64:
32608 case X86::RETPOLINE_TCRETURN32:
32609 case X86::RETPOLINE_TCRETURN64:
32610 return EmitLoweredRetpoline(MI, BB);
32611 case X86::CATCHRET:
32612 return EmitLoweredCatchRet(MI, BB);
32613 case X86::SEG_ALLOCA_32:
32614 case X86::SEG_ALLOCA_64:
32615 return EmitLoweredSegAlloca(MI, BB);
32616 case X86::PROBED_ALLOCA_32:
32617 case X86::PROBED_ALLOCA_64:
32618 return EmitLoweredProbedAlloca(MI, BB);
32619 case X86::TLSCall_32:
32620 case X86::TLSCall_64:
32621 return EmitLoweredTLSCall(MI, BB);
32622 case X86::CMOV_FR32:
32623 case X86::CMOV_FR32X:
32624 case X86::CMOV_FR64:
32625 case X86::CMOV_FR64X:
32626 case X86::CMOV_GR8:
32627 case X86::CMOV_GR16:
32628 case X86::CMOV_GR32:
32629 case X86::CMOV_RFP32:
32630 case X86::CMOV_RFP64:
32631 case X86::CMOV_RFP80:
32632 case X86::CMOV_VR64:
32633 case X86::CMOV_VR128:
32634 case X86::CMOV_VR128X:
32635 case X86::CMOV_VR256:
32636 case X86::CMOV_VR256X:
32637 case X86::CMOV_VR512:
32638 case X86::CMOV_VK1:
32639 case X86::CMOV_VK2:
32640 case X86::CMOV_VK4:
32641 case X86::CMOV_VK8:
32642 case X86::CMOV_VK16:
32643 case X86::CMOV_VK32:
32644 case X86::CMOV_VK64:
32645 return EmitLoweredSelect(MI, BB);
32646
32647 case X86::RDFLAGS32:
32648 case X86::RDFLAGS64: {
32649 unsigned PushF =
32650 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
32651 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
32652 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
32653 // Permit reads of the EFLAGS and DF registers without them being defined.
32654 // This intrinsic exists to read external processor state in flags, such as
32655 // the trap flag, interrupt flag, and direction flag, none of which are
32656 // modeled by the backend.
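// The pseudo therefore expands to roughly:
//   pushfq                        // PUSHF32 for RDFLAGS32
//   popq  %dst                    // read EFLAGS (and DF) into the result
// and WRFLAGS below is the mirror image: push %src ; popf.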
32657 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
32658        "Unexpected register in operand!");
32659 Push->getOperand(2).setIsUndef();
32660 assert(Push->getOperand(3).getReg() == X86::DF &&
32661        "Unexpected register in operand!");
32662 Push->getOperand(3).setIsUndef();
32663 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
32664
32665 MI.eraseFromParent(); // The pseudo is gone now.
32666 return BB;
32667 }
32668
32669 case X86::WRFLAGS32:
32670 case X86::WRFLAGS64: {
32671 unsigned Push =
32672 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
32673 unsigned PopF =
32674 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
32675 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
32676 BuildMI(*BB, MI, DL, TII->get(PopF));
32677
32678 MI.eraseFromParent(); // The pseudo is gone now.
32679 return BB;
32680 }
32681
32682 case X86::FP32_TO_INT16_IN_MEM:
32683 case X86::FP32_TO_INT32_IN_MEM:
32684 case X86::FP32_TO_INT64_IN_MEM:
32685 case X86::FP64_TO_INT16_IN_MEM:
32686 case X86::FP64_TO_INT32_IN_MEM:
32687 case X86::FP64_TO_INT64_IN_MEM:
32688 case X86::FP80_TO_INT16_IN_MEM:
32689 case X86::FP80_TO_INT32_IN_MEM:
32690 case X86::FP80_TO_INT64_IN_MEM: {
32691 // Change the floating point control register to use "round towards zero"
32692 // mode when truncating to an integer value.
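// Illustrative emitted sequence for the x87 rounding-mode dance, roughly:
//   fnstcw  OrigCW(%esp)          // save the current control word
//   movzwl  OrigCW(%esp), %eax
//   orl     $0xC00, %eax          // RC field (bits 10-11) = 0b11: truncate
//   movw    %ax, NewCW(%esp)
//   fldcw   NewCW(%esp)           // switch to round-toward-zero
//   fistp   dst                   // store the truncated integer
//   fldcw   OrigCW(%esp)          // restore the original rounding mode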
32693 int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
32694 addFrameReference(BuildMI(*BB, MI, DL,
32695 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
32696
32697 // Load the old value of the control word...
32698 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
32699 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
32700 OrigCWFrameIdx);
32701
32702 // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
32703 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
32704 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
32705 .addReg(OldCW, RegState::Kill).addImm(0xC00);
32706
32707 // Extract to 16 bits.
32708 Register NewCW16 =
32709 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
32710 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
32711 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
32712
32713 // Prepare memory for FLDCW.
32714 int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
32715 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
32716 NewCWFrameIdx)
32717 .addReg(NewCW16, RegState::Kill);
32718
32719 // Reload the modified control word now...
32720 addFrameReference(BuildMI(*BB, MI, DL,
32721 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
32722
32723 // Get the X86 opcode to use.
32724 unsigned Opc;
32725 switch (MI.getOpcode()) {
32726 default: llvm_unreachable("illegal opcode!");
32727 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
32728 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
32729 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
32730 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
32731 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
32732 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
32733 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
32734 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
32735 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
32736 }
32737
32738 X86AddressMode AM = getAddressFromInstr(&MI, 0);
32739 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
32740 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
32741
32742 // Reload the original control word now.
32743 addFrameReference(BuildMI(*BB, MI, DL,
32744 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
32745
32746 MI.eraseFromParent(); // The pseudo instruction is gone now.
32747 return BB;
32748 }
32749
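// A small self-contained sketch (not from this file) of the control-word
// arithmetic performed by the FP*_TO_INT*_IN_MEM expansion above: bits 10-11
// of the x87 control word select the rounding mode, and OR-ing in 0xC00
// forces "round toward zero" while leaving every other field intact. The
// values below are illustrative.
#include <cassert>
#include <cstdint>

int main() {
  uint16_t OrigCW = 0x037F;         // a typical default x87 control word
  uint16_t NewCW = OrigCW | 0xC00;  // set the RC field (bits 10-11) to 0b11
  assert((NewCW & 0xC00) == 0xC00);               // rounding is now truncate
  assert((NewCW & ~0xC00) == (OrigCW & ~0xC00));  // other fields unchanged
  return 0;
}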
32750 // xbegin
32751 case X86::XBEGIN:
32752 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
32753
32754 case X86::VASTART_SAVE_XMM_REGS:
32755 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
32756
32757 case X86::VAARG_64:
32758 return EmitVAARG64WithCustomInserter(MI, BB);
32759
32760 case X86::EH_SjLj_SetJmp32:
32761 case X86::EH_SjLj_SetJmp64:
32762 return emitEHSjLjSetJmp(MI, BB);
32763
32764 case X86::EH_SjLj_LongJmp32:
32765 case X86::EH_SjLj_LongJmp64:
32766 return emitEHSjLjLongJmp(MI, BB);
32767
32768 case X86::Int_eh_sjlj_setup_dispatch:
32769 return EmitSjLjDispatchBlock(MI, BB);
32770
32771 case TargetOpcode::STATEPOINT:
32772 // As an implementation detail, STATEPOINT shares the STACKMAP format at
32773 // this point in the process. We diverge later.
32774 return emitPatchPoint(MI, BB);
32775
32776 case TargetOpcode::STACKMAP:
32777 case TargetOpcode::PATCHPOINT:
32778 return emitPatchPoint(MI, BB);
32779
32780 case TargetOpcode::PATCHABLE_EVENT_CALL:
32781 return emitXRayCustomEvent(MI, BB);
32782
32783 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
32784 return emitXRayTypedEvent(MI, BB);
32785
32786 case X86::LCMPXCHG8B: {
32787 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
32788 // In addition to the four E[ABCD] registers implied by its encoding,
32789 // CMPXCHG8B requires a memory operand. If the current architecture is
32790 // i686 and the current function needs a base pointer - which is ESI on
32791 // i686 - the register allocator would not be able to allocate registers
32792 // for an address of the form X(%reg, %reg, Y): there would never be
32793 // enough unreserved registers during regalloc (without the base pointer
32794 // the only option would be X(%edi, %esi, Y)).
32795 // We give the register allocator a hand by precomputing the address in
32796 // a new vreg using LEA.
32797
32798 // If it is not i686 or there is no base pointer - nothing to do here.
32799 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
32800 return BB;
32801
32802 // Even though this code does not necessarily need the base pointer to
32803 // be ESI, we check for that. The reason: if this assert fails, something
32804 // has changed in the compiler's base pointer handling, which most
32805 // probably has to be addressed here as well.
32806 assert(TRI->getBaseRegister() == X86::ESI &&
32807 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
32808 "base pointer in mind");
32809
32810 MachineRegisterInfo &MRI = MF->getRegInfo();
32811 MVT SPTy = getPointerTy(MF->getDataLayout());
32812 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
32813 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
32814
32815 X86AddressMode AM = getAddressFromInstr(&MI, 0);
32816 // Regalloc does not need any help when the memory operand of CMPXCHG8B
32817 // does not use an index register.
32818 if (AM.IndexReg == X86::NoRegister)
32819 return BB;
32820
32821 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
32822 // four operand definitions that are E[ABCD] registers. We skip them and
32823 // then insert the LEA.
32824 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
32825 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
32826 RMBBI->definesRegister(X86::EBX) ||
32827 RMBBI->definesRegister(X86::ECX) ||
32828 RMBBI->definesRegister(X86::EDX))) {
32829 ++RMBBI;
32830 }
32831 MachineBasicBlock::iterator MBBI(RMBBI);
32832 addFullAddress(
32833 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
32834
32835 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
32836
32837 return BB;
32838 }
32839 case X86::LCMPXCHG16B:
32840 return BB;
32841 case X86::LCMPXCHG8B_SAVE_EBX:
32842 case X86::LCMPXCHG16B_SAVE_RBX: {
32843 unsigned BasePtr =
32844 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
32845 if (!BB->isLiveIn(BasePtr))
32846 BB->addLiveIn(BasePtr);
32847 return BB;
32848 }
32849 }
32850}
32851
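// A minimal standalone sketch (not part of this file) of what the RDFLAGS /
// WRFLAGS pseudo expansions in the custom inserter above boil down to on
// x86-64: a PUSHF/POP pair reads the flags register into a GPR, and a
// PUSH/POPF pair writes it back. Function names are illustrative assumptions.
#include <cstdint>

static inline uint64_t readFlagsSketch() {
  uint64_t Flags;
  asm volatile("pushfq\n\tpopq %0" : "=r"(Flags) : : "memory");
  return Flags;
}

static inline void writeFlagsSketch(uint64_t Flags) {
  asm volatile("pushq %0\n\tpopfq" : : "r"(Flags) : "memory", "cc");
}

int main() {
  uint64_t F = readFlagsSketch();
  writeFlagsSketch(F); // restore exactly what was read
  return 0;
}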
32852//===----------------------------------------------------------------------===//
32853// X86 Optimization Hooks
32854//===----------------------------------------------------------------------===//
32855
32856bool
32857X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
32858 const APInt &Demanded,
32859 TargetLoweringOpt &TLO) const {
32860 // Only optimize Ands to prevent shrinking a constant that could be
32861 // matched by movzx.
32862 if (Op.getOpcode() != ISD::AND)
32863 return false;
32864
32865 EVT VT = Op.getValueType();
32866
32867 // Ignore vectors.
32868 if (VT.isVector())
32869 return false;
32870
32871 unsigned Size = VT.getSizeInBits();
32872
32873 // Make sure the RHS really is a constant.
32874 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
32875 if (!C)
32876 return false;
32877
32878 const APInt &Mask = C->getAPIntValue();
32879
32880 // Clear all non-demanded bits initially.
32881 APInt ShrunkMask = Mask & Demanded;
32882
32883 // Find the width of the shrunk mask.
32884 unsigned Width = ShrunkMask.getActiveBits();
32885
32886 // If the mask is all 0s there's nothing to do here.
32887 if (Width == 0)
32888 return false;
32889
32890 // Find the next power of 2 width, rounding up to a byte.
32891 Width = PowerOf2Ceil(std::max(Width, 8U));
32892 // Truncate the width to size to handle illegal types.
32893 Width = std::min(Width, Size);
32894
32895 // Calculate a possible zero extend mask for this constant.
32896 APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
32897
32898 // If we aren't changing the mask, just return true to keep it and prevent
32899 // the caller from optimizing.
32900 if (ZeroExtendMask == Mask)
32901 return true;
32902
32903 // Make sure the new mask can be represented by a combination of mask bits
32904 // and non-demanded bits.
32905 if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
32906 return false;
32907
32908 // Replace the constant with the zero extend mask.
32909 SDLoc DL(Op);
32910 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
32911 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
32912 return TLO.CombineTo(Op, NewOp);
32913}
32914
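// A minimal standalone sketch (not from this file) of the decision made in
// targetShrinkDemandedConstant above, using plain 64-bit integers instead of
// APInt: round the active width of the demanded mask bits up to a power of
// two (at least a byte) and check whether that movzx-friendly mask can
// replace the original without changing any demanded bit. The helper names
// are illustrative assumptions.
#include <algorithm>
#include <cassert>
#include <cstdint>

static unsigned activeBits(uint64_t V) {
  unsigned N = 0;
  for (; V != 0; V >>= 1)
    ++N;
  return N;
}

static bool canUseZeroExtendMask(uint64_t Mask, uint64_t Demanded,
                                 unsigned Size, uint64_t &ZExtMask) {
  uint64_t Shrunk = Mask & Demanded;   // clear the non-demanded bits
  unsigned Width = activeBits(Shrunk);
  if (Width == 0)
    return false;
  unsigned Pow2 = 8;
  while (Pow2 < Width)
    Pow2 *= 2;                         // next power of two, at least 8
  Width = std::min(Pow2, Size);
  ZExtMask = Width >= 64 ? ~0ULL : ((1ULL << Width) - 1);
  // Usable only if every extra bit it sets is already in the mask or is not
  // demanded by the user of the AND.
  return (ZExtMask & ~(Mask | ~Demanded)) == 0;
}

int main() {
  uint64_t Z;
  // 0x7F can be widened to 0xFF because bit 7 is not demanded.
  assert(canUseZeroExtendMask(0x7F, 0x3F, 32, Z) && Z == 0xFF);
  return 0;
}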
32915void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
32916 KnownBits &Known,
32917 const APInt &DemandedElts,
32918 const SelectionDAG &DAG,
32919 unsigned Depth) const {
32920 unsigned BitWidth = Known.getBitWidth();
32921 unsigned Opc = Op.getOpcode();
32922 EVT VT = Op.getValueType();
32923 assert((Opc >= ISD::BUILTIN_OP_END ||
32924 Opc == ISD::INTRINSIC_WO_CHAIN ||
32925 Opc == ISD::INTRINSIC_W_CHAIN ||
32926 Opc == ISD::INTRINSIC_VOID) &&
32927 "Should use MaskedValueIsZero if you don't know whether Op"
32928 " is a target node!");
32929
32930 Known.resetAll();
32931 switch (Opc) {
32932 default: break;
32933 case X86ISD::SETCC:
32934 Known.Zero.setBitsFrom(1);
32935 break;
32936 case X86ISD::MOVMSK: {
32937 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
32938 Known.Zero.setBitsFrom(NumLoBits);
32939 break;
32940 }
32941 case X86ISD::PEXTRB:
32942 case X86ISD::PEXTRW: {
32943 SDValue Src = Op.getOperand(0);
32944 EVT SrcVT = Src.getValueType();
32945 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
32946 Op.getConstantOperandVal(1));
32947 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
32948 Known = Known.anyextOrTrunc(BitWidth);
32949 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
32950 break;
32951 }
32952 case X86ISD::VSRAI:
32953 case X86ISD::VSHLI:
32954 case X86ISD::VSRLI: {
32955 unsigned ShAmt = Op.getConstantOperandVal(1);
32956 if (ShAmt >= VT.getScalarSizeInBits()) {
32957 Known.setAllZero();
32958 break;
32959 }
32960
32961 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
32962 if (Opc == X86ISD::VSHLI) {
32963 Known.Zero <<= ShAmt;
32964 Known.One <<= ShAmt;
32965 // Low bits are known zero.
32966 Known.Zero.setLowBits(ShAmt);
32967 } else if (Opc == X86ISD::VSRLI) {
32968 Known.Zero.lshrInPlace(ShAmt);
32969 Known.One.lshrInPlace(ShAmt);
32970 // High bits are known zero.
32971 Known.Zero.setHighBits(ShAmt);
32972 } else {
32973 Known.Zero.ashrInPlace(ShAmt);
32974 Known.One.ashrInPlace(ShAmt);
32975 }
32976 break;
32977 }
32978 case X86ISD::PACKUS: {
32979 // PACKUS is just a truncation if the upper half is zero.
32980 APInt DemandedLHS, DemandedRHS;
32981 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
32982
32983 Known.One = APInt::getAllOnesValue(BitWidth * 2);
32984 Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
32985
32986 KnownBits Known2;
32987 if (!!DemandedLHS) {
32988 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
32989 Known.One &= Known2.One;
32990 Known.Zero &= Known2.Zero;
32991 }
32992 if (!!DemandedRHS) {
32993 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
32994 Known.One &= Known2.One;
32995 Known.Zero &= Known2.Zero;
32996 }
32997
32998 if (Known.countMinLeadingZeros() < BitWidth)
32999 Known.resetAll();
33000 Known = Known.trunc(BitWidth);
33001 break;
33002 }
33003 case X86ISD::ANDNP: {
33004 KnownBits Known2;
33005 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
33006 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
33007
33008 // ANDNP = (~X & Y);
33009 Known.One &= Known2.Zero;
33010 Known.Zero |= Known2.One;
33011 break;
33012 }
33013 case X86ISD::FOR: {
33014 KnownBits Known2;
33015 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
33016 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
33017
33018 // Output known-0 bits are only known if clear in both the LHS & RHS.
33019 Known.Zero &= Known2.Zero;
33020 // Output known-1 are known to be set if set in either the LHS | RHS.
33021 Known.One |= Known2.One;
33022 break;
33023 }
33024 case X86ISD::PSADBW: {
33025 assert(VT.getScalarType() == MVT::i64 &&
33026 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
33027 "Unexpected PSADBW types");
33028
33029 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
33030 Known.Zero.setBitsFrom(16);
33031 break;
33032 }
33033 case X86ISD::CMOV: {
33034 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
33035 // If we don't know any bits, early out.
33036 if (Known.isUnknown())
33037 break;
33038 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
33039
33040 // Only known if known in both the LHS and RHS.
33041 Known.One &= Known2.One;
33042 Known.Zero &= Known2.Zero;
33043 break;
33044 }
33045 case X86ISD::BEXTR: {
33046 SDValue Op0 = Op.getOperand(0);
33047 SDValue Op1 = Op.getOperand(1);
33048
33049 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
33050 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
33051 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
33052
33053 // If the length is 0, the result is 0.
33054 if (Length == 0) {
33055 Known.setAllZero();
33056 break;
33057 }
33058
33059 if ((Shift + Length) <= BitWidth) {
33060 Known = DAG.computeKnownBits(Op0, Depth + 1);
33061 Known = Known.extractBits(Length, Shift);
33062 Known = Known.zextOrTrunc(BitWidth);
33063 }
33064 }
33065 break;
33066 }
33067 }
33068
33069 // Handle target shuffles.
33070 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
33071 if (isTargetShuffle(Opc)) {
33072 bool IsUnary;
33073 SmallVector<int, 64> Mask;
33074 SmallVector<SDValue, 2> Ops;
33075 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
33076 IsUnary)) {
33077 unsigned NumOps = Ops.size();
33078 unsigned NumElts = VT.getVectorNumElements();
33079 if (Mask.size() == NumElts) {
33080 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
33081 Known.Zero.setAllBits(); Known.One.setAllBits();
33082 for (unsigned i = 0; i != NumElts; ++i) {
33083 if (!DemandedElts[i])
33084 continue;
33085 int M = Mask[i];
33086 if (M == SM_SentinelUndef) {
33087 // For UNDEF elements, we don't know anything about the common state
33088 // of the shuffle result.
33089 Known.resetAll();
33090 break;
33091 } else if (M == SM_SentinelZero) {
33092 Known.One.clearAllBits();
33093 continue;
33094 }
33095 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
33096 "Shuffle index out of range");
33097
33098 unsigned OpIdx = (unsigned)M / NumElts;
33099 unsigned EltIdx = (unsigned)M % NumElts;
33100 if (Ops[OpIdx].getValueType() != VT) {
33101 // TODO - handle target shuffle ops with different value types.
33102 Known.resetAll();
33103 break;
33104 }
33105 DemandedOps[OpIdx].setBit(EltIdx);
33106 }
33107 // Known bits are the values that are shared by every demanded element.
33108 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
33109 if (!DemandedOps[i])
33110 continue;
33111 KnownBits Known2 =
33112 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
33113 Known.One &= Known2.One;
33114 Known.Zero &= Known2.Zero;
33115 }
33116 }
33117 }
33118 }
33119}
33120
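// A minimal illustration (not part of this file) of the known-bits
// bookkeeping done for X86ISD::VSHLI above, on a single 16-bit lane using
// two plain masks instead of the KnownBits class. The struct and names are
// illustrative assumptions.
#include <cassert>
#include <cstdint>

struct KnownSketch {
  uint16_t Zero; // bits known to be 0
  uint16_t One;  // bits known to be 1
};

static KnownSketch shiftLeftKnown(KnownSketch K, unsigned ShAmt) {
  K.Zero = static_cast<uint16_t>(K.Zero << ShAmt);
  K.One = static_cast<uint16_t>(K.One << ShAmt);
  K.Zero |= static_cast<uint16_t>((1u << ShAmt) - 1); // shifted-in low bits are 0
  return K;
}

int main() {
  KnownSketch K{0xFF00, 0x0001}; // high byte known 0, bit 0 known 1
  KnownSketch R = shiftLeftKnown(K, 4);
  assert(R.One == 0x0010);       // the known 1 moved up by 4
  assert((R.Zero & 0xF) == 0xF); // the low 4 bits are now known 0
  return 0;
}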
33121unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
33122 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
33123 unsigned Depth) const {
33124 EVT VT = Op.getValueType();
33125 unsigned VTBits = VT.getScalarSizeInBits();
33126 unsigned Opcode = Op.getOpcode();
33127 switch (Opcode) {
33128 case X86ISD::SETCC_CARRY:
33129 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
33130 return VTBits;
33131
33132 case X86ISD::VTRUNC: {
33133 // TODO: Add DemandedElts support.
33134 SDValue Src = Op.getOperand(0);
33135 unsigned NumSrcBits = Src.getScalarValueSizeInBits();
33136 assert(VTBits < NumSrcBits && "Illegal truncation input type");
33137 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
33138 if (Tmp > (NumSrcBits - VTBits))
33139 return Tmp - (NumSrcBits - VTBits);
33140 return 1;
33141 }
33142
33143 case X86ISD::PACKSS: {
33144 // PACKSS is just a truncation if the sign bits extend to the packed size.
33145 APInt DemandedLHS, DemandedRHS;
33146 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
33147 DemandedRHS);
33148
33149 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
33150 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
33151 if (!!DemandedLHS)
33152 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
33153 if (!!DemandedRHS)
33154 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
33155 unsigned Tmp = std::min(Tmp0, Tmp1);
33156 if (Tmp > (SrcBits - VTBits))
33157 return Tmp - (SrcBits - VTBits);
33158 return 1;
33159 }
33160
33161 case X86ISD::VSHLI: {
33162 SDValue Src = Op.getOperand(0);
33163 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
33164 if (ShiftVal.uge(VTBits))
33165 return VTBits; // Shifted all bits out --> zero.
33166 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
33167 if (ShiftVal.uge(Tmp))
33168 return 1; // Shifted all sign bits out --> unknown.
33169 return Tmp - ShiftVal.getZExtValue();
33170 }
33171
33172 case X86ISD::VSRAI: {
33173 SDValue Src = Op.getOperand(0);
33174 APInt ShiftVal = Op.getConstantOperandAPInt(1);
33175 if (ShiftVal.uge(VTBits - 1))
33176 return VTBits; // Sign splat.
33177 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
33178 ShiftVal += Tmp;
33179 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
33180 }
33181
33182 case X86ISD::PCMPGT:
33183 case X86ISD::PCMPEQ:
33184 case X86ISD::CMPP:
33185 case X86ISD::VPCOM:
33186 case X86ISD::VPCOMU:
33187 // Vector compares return zero/all-bits result values.
33188 return VTBits;
33189
33190 case X86ISD::ANDNP: {
33191 unsigned Tmp0 =
33192 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
33193 if (Tmp0 == 1) return 1; // Early out.
33194 unsigned Tmp1 =
33195 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
33196 return std::min(Tmp0, Tmp1);
33197 }
33198
33199 case X86ISD::CMOV: {
33200 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
33201 if (Tmp0 == 1) return 1; // Early out.
33202 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
33203 return std::min(Tmp0, Tmp1);
33204 }
33205 }
33206
33207 // Handle target shuffles.
33208 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
33209 if (isTargetShuffle(Opcode)) {
33210 bool IsUnary;
33211 SmallVector<int, 64> Mask;
33212 SmallVector<SDValue, 2> Ops;
33213 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
33214 IsUnary)) {
33215 unsigned NumOps = Ops.size();
33216 unsigned NumElts = VT.getVectorNumElements();
33217 if (Mask.size() == NumElts) {
33218 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
33219 for (unsigned i = 0; i != NumElts; ++i) {
33220 if (!DemandedElts[i])
33221 continue;
33222 int M = Mask[i];
33223 if (M == SM_SentinelUndef) {
33224 // For UNDEF elements, we don't know anything about the common state
33225 // of the shuffle result.
33226 return 1;
33227 } else if (M == SM_SentinelZero) {
33228 // Zero = all sign bits.
33229 continue;
33230 }
33231 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
33232 "Shuffle index out of range");
33233
33234 unsigned OpIdx = (unsigned)M / NumElts;
33235 unsigned EltIdx = (unsigned)M % NumElts;
33236 if (Ops[OpIdx].getValueType() != VT) {
33237 // TODO - handle target shuffle ops with different value types.
33238 return 1;
33239 }
33240 DemandedOps[OpIdx].setBit(EltIdx);
33241 }
33242 unsigned Tmp0 = VTBits;
33243 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
33244 if (!DemandedOps[i])
33245 continue;
33246 unsigned Tmp1 =
33247 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
33248 Tmp0 = std::min(Tmp0, Tmp1);
33249 }
33250 return Tmp0;
33251 }
33252 }
33253 }
33254
33255 // Fallback case.
33256 return 1;
33257}
33258
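// A small self-contained sketch (not from this file) of the sign-bit
// arithmetic used for X86ISD::VSRAI above: an arithmetic shift right by N
// adds N copies of the sign bit, saturating at the element width. Names are
// illustrative assumptions.
#include <algorithm>
#include <cassert>
#include <cstdint>

static unsigned numSignBits16(int16_t V) {
  unsigned N = 1; // the sign bit itself
  while (N < 16 && ((V >> (15 - N)) & 1) == ((V >> 15) & 1))
    ++N;
  return N;
}

int main() {
  int16_t V = 0x0F00;                           // 4 leading sign (zero) bits
  unsigned Before = numSignBits16(V);
  int16_t After = static_cast<int16_t>(V >> 3); // arithmetic shift right by 3
  assert(numSignBits16(After) == std::min(Before + 3, 16u));
  return 0;
}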
33259SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
33260 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
33261 return N->getOperand(0);
33262 return N;
33263}
33264
33265// Attempt to match a combined shuffle mask against supported unary shuffle
33266// instructions.
33267// TODO: Investigate sharing more of this with shuffle lowering.
33268static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
33269 bool AllowFloatDomain, bool AllowIntDomain,
33270 SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
33271 const X86Subtarget &Subtarget, unsigned &Shuffle,
33272 MVT &SrcVT, MVT &DstVT) {
33273 unsigned NumMaskElts = Mask.size();
33274 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
33275
33276 // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
33277 if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
33278 isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
33279 Shuffle = X86ISD::VZEXT_MOVL;
33280 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
33281 return true;
33282 }
33283
33284 // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
33285 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
33286 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
33287 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
33288 unsigned MaxScale = 64 / MaskEltSize;
33289 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
33290 bool MatchAny = true;
33291 bool MatchZero = true;
33292 unsigned NumDstElts = NumMaskElts / Scale;
33293 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
33294 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
33295 MatchAny = MatchZero = false;
33296 break;
33297 }
33298 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
33299 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
33300 }
33301 if (MatchAny || MatchZero) {
33302 assert(MatchZero && "Failed to match zext but matched aext?");
33303 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
33304 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
33305 MVT::getIntegerVT(MaskEltSize);
33306 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
33307
33308 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
33309 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
33310
33311 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
33312 if (SrcVT.getVectorNumElements() != NumDstElts)
33313 Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
33314
33315 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
33316 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
33317 return true;
33318 }
33319 }
33320 }
33321
33322 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32 bits (MOVSS).
33323 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
33324 isUndefOrEqual(Mask[0], 0) &&
33325 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
33326 Shuffle = X86ISD::VZEXT_MOVL;
33327 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
33328 return true;
33329 }
33330
33331 // Check if we have SSE3, which will let us use MOVDDUP etc. The
33332 // instructions are no slower than UNPCKLPD but have the option to
33333 // fold the input operand into even an unaligned memory load.
33334 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
33335 if (isTargetShuffleEquivalent(Mask, {0, 0})) {
33336 Shuffle = X86ISD::MOVDDUP;
33337 SrcVT = DstVT = MVT::v2f64;
33338 return true;
33339 }
33340 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
33341 Shuffle = X86ISD::MOVSLDUP;
33342 SrcVT = DstVT = MVT::v4f32;
33343 return true;
33344 }
33345 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
33346 Shuffle = X86ISD::MOVSHDUP;
33347 SrcVT = DstVT = MVT::v4f32;
33348 return true;
33349 }
33350 }
33351
33352 if (MaskVT.is256BitVector() && AllowFloatDomain) {
33353 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
33354 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
33355 Shuffle = X86ISD::MOVDDUP;
33356 SrcVT = DstVT = MVT::v4f64;
33357 return true;
33358 }
33359 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
33360 Shuffle = X86ISD::MOVSLDUP;
33361 SrcVT = DstVT = MVT::v8f32;
33362 return true;
33363 }
33364 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
33365 Shuffle = X86ISD::MOVSHDUP;
33366 SrcVT = DstVT = MVT::v8f32;
33367 return true;
33368 }
33369 }
33370
33371 if (MaskVT.is512BitVector() && AllowFloatDomain) {
33372 assert(Subtarget.hasAVX512() &&
33373 "AVX512 required for 512-bit vector shuffles");
33374 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
33375 Shuffle = X86ISD::MOVDDUP;
33376 SrcVT = DstVT = MVT::v8f64;
33377 return true;
33378 }
33379 if (isTargetShuffleEquivalent(
33380 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
33381 Shuffle = X86ISD::MOVSLDUP;
33382 SrcVT = DstVT = MVT::v16f32;
33383 return true;
33384 }
33385 if (isTargetShuffleEquivalent(
33386 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
33387 Shuffle = X86ISD::MOVSHDUP;
33388 SrcVT = DstVT = MVT::v16f32;
33389 return true;
33390 }
33391 }
33392
33393 return false;
33394}
33395
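// A minimal standalone sketch (not from this file) of the mask matching done
// above for MOVDDUP/MOVSLDUP/MOVSHDUP: an element-wise comparison where a
// negative entry (undef) matches anything, in the spirit of
// isTargetShuffleEquivalent. The helper name is an illustrative assumption.
#include <cassert>
#include <vector>

static bool masksEquivalentSketch(const std::vector<int> &Mask,
                                  const std::vector<int> &Expected) {
  if (Mask.size() != Expected.size())
    return false;
  for (size_t i = 0; i != Mask.size(); ++i)
    if (Mask[i] >= 0 && Mask[i] != Expected[i])
      return false; // a real index must match; undef (< 0) always matches
  return true;
}

int main() {
  assert(masksEquivalentSketch({0, -1, 2, 2}, {0, 0, 2, 2})); // MOVSLDUP-able
  assert(!masksEquivalentSketch({1, 0, 2, 2}, {0, 0, 2, 2}));
  return 0;
}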
33396// Attempt to match a combined shuffle mask against supported unary immediate
33397// permute instructions.
33398// TODO: Investigate sharing more of this with shuffle lowering.
33399static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
33400 const APInt &Zeroable,
33401 bool AllowFloatDomain, bool AllowIntDomain,
33402 const X86Subtarget &Subtarget,
33403 unsigned &Shuffle, MVT &ShuffleVT,
33404 unsigned &PermuteImm) {
33405 unsigned NumMaskElts = Mask.size();
33406 unsigned InputSizeInBits = MaskVT.getSizeInBits();
33407 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
33408 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
33409
33410 bool ContainsZeros =
33411 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
33412
33413 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
33414 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
33415 // Check for lane crossing permutes.
33416 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
33417 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
33418 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
33419 Shuffle = X86ISD::VPERMI;
33420 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
33421 PermuteImm = getV4X86ShuffleImm(Mask);
33422 return true;
33423 }
33424 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
33425 SmallVector<int, 4> RepeatedMask;
33426 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
33427 Shuffle = X86ISD::VPERMI;
33428 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
33429 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
33430 return true;
33431 }
33432 }
33433 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
33434 // VPERMILPD can permute with a non-repeating shuffle.
33435 Shuffle = X86ISD::VPERMILPI;
33436 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
33437 PermuteImm = 0;
33438 for (int i = 0, e = Mask.size(); i != e; ++i) {
33439 int M = Mask[i];
33440 if (M == SM_SentinelUndef)
33441 continue;
33442 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
33443 PermuteImm |= (M & 1) << i;
33444 }
33445 return true;
33446 }
33447 }
33448
33449 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
33450 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
33451 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
33452 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
33453 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
33454 SmallVector<int, 4> RepeatedMask;
33455 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
33456 // Narrow the repeated mask to create 32-bit element permutes.
33457 SmallVector<int, 4> WordMask = RepeatedMask;
33458 if (MaskScalarSizeInBits == 64)
33459 scaleShuffleMask<int>(2, RepeatedMask, WordMask);
33460
33461 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
33462 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
33463 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
33464 PermuteImm = getV4X86ShuffleImm(WordMask);
33465 return true;
33466 }
33467 }
33468
33469 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
33470 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
33471 SmallVector<int, 4> RepeatedMask;
33472 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
33473 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
33474 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
33475
33476 // PSHUFLW: permute lower 4 elements only.
33477 if (isUndefOrInRange(LoMask, 0, 4) &&
33478 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
33479 Shuffle = X86ISD::PSHUFLW;
33480 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
33481 PermuteImm = getV4X86ShuffleImm(LoMask);
33482 return true;
33483 }
33484
33485 // PSHUFHW: permute upper 4 elements only.
33486 if (isUndefOrInRange(HiMask, 4, 8) &&
33487 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
33488 // Offset the HiMask so that we can create the shuffle immediate.
33489 int OffsetHiMask[4];
33490 for (int i = 0; i != 4; ++i)
33491 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
33492
33493 Shuffle = X86ISD::PSHUFHW;
33494 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
33495 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
33496 return true;
33497 }
33498 }
33499 }
33500
33501 // Attempt to match against byte/bit shifts.
33502 if (AllowIntDomain &&
33503 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
33504 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
33505 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
33506 int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
33507 Mask, 0, Zeroable, Subtarget);
33508 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
33509 32 <= ShuffleVT.getScalarSizeInBits())) {
33510 PermuteImm = (unsigned)ShiftAmt;
33511 return true;
33512 }
33513 }
33514
33515 // Attempt to match against bit rotates.
33516 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
33517 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
33518 Subtarget.hasAVX512())) {
33519 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
33520 Subtarget, Mask);
33521 if (0 < RotateAmt) {
33522 Shuffle = X86ISD::VROTLI;
33523 PermuteImm = (unsigned)RotateAmt;
33524 return true;
33525 }
33526 }
33527
33528 return false;
33529}
33530
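// A minimal standalone sketch (not from this file) of how a 4-element
// repeated mask is packed into the 8-bit immediate used by PSHUFD/PSHUFLW/
// VPERMILPS above: two bits per destination element, element 0 in the low
// bits. The function name is an illustrative assumption.
#include <cassert>
#include <cstdint>

static uint8_t packShuffleImmSketch(const int (&Mask)[4]) {
  uint8_t Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i]; // treat undef as index 0
    Imm |= static_cast<uint8_t>((M & 3) << (i * 2));
  }
  return Imm;
}

int main() {
  const int Identity[4] = {0, 1, 2, 3};
  const int Reverse[4] = {3, 2, 1, 0};
  assert(packShuffleImmSketch(Identity) == 0xE4); // 11'10'01'00
  assert(packShuffleImmSketch(Reverse) == 0x1B);  // 00'01'10'11
  return 0;
}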
33531// Attempt to match a combined unary shuffle mask against supported binary
33532// shuffle instructions.
33533// TODO: Investigate sharing more of this with shuffle lowering.
33534static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
33535 bool AllowFloatDomain, bool AllowIntDomain,
33536 SDValue &V1, SDValue &V2, const SDLoc &DL,
33537 SelectionDAG &DAG, const X86Subtarget &Subtarget,
33538 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
33539 bool IsUnary) {
33540 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
33541
33542 if (MaskVT.is128BitVector()) {
33543 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
33544 V2 = V1;
33545 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
33546 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
33547 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
33548 return true;
33549 }
33550 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
33551 V2 = V1;
33552 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
33553 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
33554 return true;
33555 }
33556 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
33557 (AllowFloatDomain || !Subtarget.hasSSE41())) {
33558 std::swap(V1, V2);
33559 Shuffle = X86ISD::MOVSD;
33560 SrcVT = DstVT = MVT::v2f64;
33561 return true;
33562 }
33563 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
33564 (AllowFloatDomain || !Subtarget.hasSSE41())) {
33565 Shuffle = X86ISD::MOVSS;
33566 SrcVT = DstVT = MVT::v4f32;
33567 return true;
33568 }
33569 }
33570
33571 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
33572 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
33573 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
33574 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
33575 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
33576 Subtarget)) {
33577 DstVT = MaskVT;
33578 return true;
33579 }
33580 }
33581
33582 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
33583 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
33584 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
33585 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
33586 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
33587 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
33588 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
33589 Subtarget)) {
33590 SrcVT = DstVT = MaskVT;
33591 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
33592 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
33593 return true;
33594 }
33595 }
33596
33597 return false;
33598}
33599
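// A small self-contained sketch (not part of this file) of the element
// interleaving that the UNPCKL/UNPCKH shuffles matched above perform,
// modeled on a 4-element vector of 32-bit lanes. Names are illustrative.
#include <array>
#include <cassert>
#include <cstdint>

using V4 = std::array<uint32_t, 4>;

static V4 unpackLoSketch(const V4 &A, const V4 &B) {
  return {A[0], B[0], A[1], B[1]}; // interleave the low halves
}

static V4 unpackHiSketch(const V4 &A, const V4 &B) {
  return {A[2], B[2], A[3], B[3]}; // interleave the high halves
}

int main() {
  V4 A = {0, 1, 2, 3}, B = {4, 5, 6, 7};
  assert((unpackLoSketch(A, B) == V4{0, 4, 1, 5}));
  assert((unpackHiSketch(A, B) == V4{2, 6, 3, 7}));
  return 0;
}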
33600static bool matchBinaryPermuteShuffle(
33601 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
33602 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
33603 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
33604 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
33605 unsigned NumMaskElts = Mask.size();
33606 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
33607
33608 // Attempt to match against PALIGNR byte rotate.
33609 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
33610 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
33611 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
33612 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
33613 if (0 < ByteRotation) {
33614 Shuffle = X86ISD::PALIGNR;
33615 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
33616 PermuteImm = ByteRotation;
33617 return true;
33618 }
33619 }
33620
33621 // Attempt to combine to X86ISD::BLENDI.
33622 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
33623 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
33624 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
33625 uint64_t BlendMask = 0;
33626 bool ForceV1Zero = false, ForceV2Zero = false;
33627 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
33628 if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
33629 ForceV2Zero, BlendMask)) {
33630 if (MaskVT == MVT::v16i16) {
33631 // We can only use v16i16 PBLENDW if the lanes are repeated.
33632 SmallVector<int, 8> RepeatedMask;
33633 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
33634 RepeatedMask)) {
33635 assert(RepeatedMask.size() == 8 &&
33636 "Repeated mask size doesn't match!");
33637 PermuteImm = 0;
33638 for (int i = 0; i < 8; ++i)
33639 if (RepeatedMask[i] >= 8)
33640 PermuteImm |= 1 << i;
33641 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
33642 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
33643 Shuffle = X86ISD::BLENDI;
33644 ShuffleVT = MaskVT;
33645 return true;
33646 }
33647 } else {
33648 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
33649 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
33650 PermuteImm = (unsigned)BlendMask;
33651 Shuffle = X86ISD::BLENDI;
33652 ShuffleVT = MaskVT;
33653 return true;
33654 }
33655 }
33656 }
33657
33658 // Attempt to combine to INSERTPS, but only if it has elements that need to
33659 // be set to zero.
33660 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
33661 MaskVT.is128BitVector() &&
33662 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }) &&
33663 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
33664 Shuffle = X86ISD::INSERTPS;
33665 ShuffleVT = MVT::v4f32;
33666 return true;
33667 }
33668
33669 // Attempt to combine to SHUFPD.
33670 if (AllowFloatDomain && EltSizeInBits == 64 &&
33671 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
33672 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
33673 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
33674 bool ForceV1Zero = false, ForceV2Zero = false;
33675 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
33676 PermuteImm, Mask, Zeroable)) {
33677 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
33678 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
33679 Shuffle = X86ISD::SHUFP;
33680 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
33681 return true;
33682 }
33683 }
33684
33685 // Attempt to combine to SHUFPS.
33686 if (AllowFloatDomain && EltSizeInBits == 32 &&
33687 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
33688 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
33689 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
33690 SmallVector<int, 4> RepeatedMask;
33691 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
33692 // Match each half of the repeated mask to determine if it's just
33693 // referencing one of the vectors, is zeroable or entirely undef.
33694 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
33695 int M0 = RepeatedMask[Offset];
33696 int M1 = RepeatedMask[Offset + 1];
33697
33698 if (isUndefInRange(RepeatedMask, Offset, 2)) {
33699 return DAG.getUNDEF(MaskVT);
33700 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
33701 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
33702 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
33703 return getZeroVector(MaskVT, Subtarget, DAG, DL);
33704 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
33705 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
33706 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
33707 return V1;
33708 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
33709 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
33710 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
33711 return V2;
33712 }
33713
33714 return SDValue();
33715 };
33716
33717 int ShufMask[4] = {-1, -1, -1, -1};
33718 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
33719 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
33720
33721 if (Lo && Hi) {
33722 V1 = Lo;
33723 V2 = Hi;
33724 Shuffle = X86ISD::SHUFP;
33725 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
33726 PermuteImm = getV4X86ShuffleImm(ShufMask);
33727 return true;
33728 }
33729 }
33730 }
33731
33732 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
33733 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
33734 MaskVT.is128BitVector() &&
33735 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
33736 Shuffle = X86ISD::INSERTPS;
33737 ShuffleVT = MVT::v4f32;
33738 return true;
33739 }
33740
33741 return false;
33742}
33743
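// A minimal standalone sketch (not from this file) of how the BLENDI
// immediate computed above encodes a two-input blend: bit i selects whether
// destination element i comes from the second source. Names are illustrative
// assumptions.
#include <array>
#include <cassert>
#include <cstdint>

static uint8_t blendImmFromMaskSketch(const std::array<int, 8> &Mask) {
  uint8_t Imm = 0;
  for (int i = 0; i != 8; ++i)
    if (Mask[i] >= 8) // indices 8..15 reference the second vector
      Imm |= static_cast<uint8_t>(1u << i);
  return Imm;
}

int main() {
  // Take even elements from V1 and odd elements from V2.
  std::array<int, 8> Mask = {0, 9, 2, 11, 4, 13, 6, 15};
  assert(blendImmFromMaskSketch(Mask) == 0xAA);
  return 0;
}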
33744static SDValue combineX86ShuffleChainWithExtract(
33745 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
33746 bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
33747 const X86Subtarget &Subtarget);
33748
33749/// Combine an arbitrary chain of shuffles into a single instruction if
33750/// possible.
33751///
33752/// This is the leaf of the recursive combine below. When we have found some
33753/// chain of single-use x86 shuffle instructions and accumulated the combined
33754/// shuffle mask represented by them, this will try to pattern match that mask
33755/// into either a single instruction if there is a special purpose instruction
33756/// for this operation, or into a PSHUFB instruction which is a fully general
33757/// instruction but should only be used to replace chains over a certain depth.
33758static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
33759 ArrayRef<int> BaseMask, int Depth,
33760 bool HasVariableMask,
33761 bool AllowVariableMask, SelectionDAG &DAG,
33762 const X86Subtarget &Subtarget) {
33763 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
33764 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
33765 "Unexpected number of shuffle inputs!");
33766
33767 // Find the inputs that enter the chain. Note that multiple uses are OK
33768 // here; we're not going to remove the operands we find.
33769 bool UnaryShuffle = (Inputs.size() == 1);
33770 SDValue V1 = peekThroughBitcasts(Inputs[0]);
33771 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
33772 : peekThroughBitcasts(Inputs[1]));
33773
33774 MVT VT1 = V1.getSimpleValueType();
33775 MVT VT2 = V2.getSimpleValueType();
33776 MVT RootVT = Root.getSimpleValueType();
33777 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
33778 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
33779 "Vector size mismatch");
33780
33781 SDLoc DL(Root);
33782 SDValue Res;
33783
33784 unsigned NumBaseMaskElts = BaseMask.size();
33785 if (NumBaseMaskElts == 1) {
33786 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
33787 return DAG.getBitcast(RootVT, V1);
33788 }
33789
33790 unsigned RootSizeInBits = RootVT.getSizeInBits();
33791 unsigned NumRootElts = RootVT.getVectorNumElements();
33792 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
33793 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
33794 (RootVT.isFloatingPoint() && Depth >= 1) ||
33795 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
33796
33797 // Don't combine if we are an AVX512/EVEX target and the mask element size
33798 // is different from the root element size - this would prevent writemasks
33799 // from being reused.
33800 // TODO - this currently prevents all lane shuffles from occurring.
33801 // TODO - check for writemasks usage instead of always preventing combining.
33802 // TODO - attempt to narrow Mask back to writemask size.
33803 bool IsEVEXShuffle =
33804 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
33805
33806 // Attempt to match a subvector broadcast.
33807 // shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)
33808 if (UnaryShuffle &&
33809 (BaseMaskEltSizeInBits == 128 || BaseMaskEltSizeInBits == 256)) {
33810 SmallVector<int, 64> BroadcastMask(NumBaseMaskElts, 0);
33811 if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) {
33812 SDValue Src = Inputs[0];
33813 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
33814 Src.getOperand(0).isUndef() &&
33815 Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits &&
33816 MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) {
33817 return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL,
33818 Src.getValueType(),
33819 Src.getOperand(1)));
33820 }
33821 }
33822 }
33823
33824 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
33825
33826 // Handle 128-bit lane shuffles of 256-bit vectors.
33827 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
33828 // we need to use the zeroing feature.
33829 // TODO - this should support binary shuffles.
33830 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
33831 !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
33832 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
33833 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
33834 return SDValue(); // Nothing to do!
33835 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
33836 unsigned PermMask = 0;
33837 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
33838 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
33839
33840 Res = DAG.getBitcast(ShuffleVT, V1);
33841 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
33842 DAG.getUNDEF(ShuffleVT),
33843 DAG.getTargetConstant(PermMask, DL, MVT::i8));
33844 return DAG.getBitcast(RootVT, Res);
33845 }
33846
33847 // For masks that have been widened to 128-bit elements or more,
33848 // narrow back down to 64-bit elements.
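// Editor's note (illustrative sketch, not part of the LLVM source): e.g. a
// 2-element mask of 128-bit elements <1, 0> on a 256-bit root is rescaled by
// MaskScale = 2 into the 64-bit-element mask <2, 3, 0, 1>.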
33849 SmallVector<int, 64> Mask;
33850 if (BaseMaskEltSizeInBits > 64) {
33851 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
33852 int MaskScale = BaseMaskEltSizeInBits / 64;
33853 scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
33854 } else {
33855 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
33856 }
33857
33858 unsigned NumMaskElts = Mask.size();
33859 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
33860
33861 // Determine the effective mask value type.
33862 FloatDomain &= (32 <= MaskEltSizeInBits);
33863 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
33864 : MVT::getIntegerVT(MaskEltSizeInBits);
33865 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
33866
33867 // Only allow legal mask types.
33868 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
33869 return SDValue();
33870
33871 // Attempt to match the mask against known shuffle patterns.
33872 MVT ShuffleSrcVT, ShuffleVT;
33873 unsigned Shuffle, PermuteImm;
33874
33875 // Which shuffle domains are permitted?
33876 // Permit domain crossing at higher combine depths.
33877 // TODO: Should we indicate which domain is preferred if both are allowed?
33878 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
33879 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
33880 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
33881
33882 // Determine zeroable mask elements.
33883 APInt KnownUndef, KnownZero;
33884 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
33885 APInt Zeroable = KnownUndef | KnownZero;
33886
33887 if (UnaryShuffle) {
33888 // If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
33889 // directly if we don't shuffle the lower element and we shuffle the upper
33890 // (zero) elements within themselves.
33891 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
33892 (cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() %
33893 MaskEltSizeInBits) == 0) {
33894 unsigned Scale =
33895 cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() /
33896 MaskEltSizeInBits;
33897 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
33898 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
33899 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
33900 return DAG.getBitcast(RootVT, V1);
33901 }
33902 }
33903
33904 // Attempt to match against broadcast-from-vector.
33905 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
33906 if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits))
33907 && (!IsEVEXShuffle || NumRootElts == NumMaskElts)) {
33908 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
33909 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
33910 if (V1.getValueType() == MaskVT &&
33911 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
33912 MayFoldLoad(V1.getOperand(0))) {
33913 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
33914 return SDValue(); // Nothing to do!
33915 Res = V1.getOperand(0);
33916 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
33917 return DAG.getBitcast(RootVT, Res);
33918 }
33919 if (Subtarget.hasAVX2()) {
33920 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
33921 return SDValue(); // Nothing to do!
33922 Res = DAG.getBitcast(MaskVT, V1);
33923 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
33924 return DAG.getBitcast(RootVT, Res);
33925 }
33926 }
33927 }
33928
33929 SDValue NewV1 = V1; // Save operand in case early exit happens.
33930 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
33931 DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
33932 ShuffleVT) &&
33933 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
33934 if (Depth == 0 && Root.getOpcode() == Shuffle)
33935 return SDValue(); // Nothing to do!
33936 Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
33937 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
33938 return DAG.getBitcast(RootVT, Res);
33939 }
33940
33941 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
33942 AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
33943 PermuteImm) &&
33944 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
33945 if (Depth == 0 && Root.getOpcode() == Shuffle)
33946 return SDValue(); // Nothing to do!
33947 Res = DAG.getBitcast(ShuffleVT, V1);
33948 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
33949 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
33950 return DAG.getBitcast(RootVT, Res);
33951 }
33952 }
33953
33954 SDValue NewV1 = V1; // Save operands in case early exit happens.
33955 SDValue NewV2 = V2;
33956 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
33957 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
33958 ShuffleVT, UnaryShuffle) &&
33959 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
33960 if (Depth == 0 && Root.getOpcode() == Shuffle)
33961 return SDValue(); // Nothing to do!
33962 NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
33963 NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
33964 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
33965 return DAG.getBitcast(RootVT, Res);
33966 }
33967
33968 NewV1 = V1; // Save operands in case early exit happens.
33969 NewV2 = V2;
33970 if (matchBinaryPermuteShuffle(
33971 MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
33972 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
33973 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
33974 if (Depth == 0 && Root.getOpcode() == Shuffle)
33975 return SDValue(); // Nothing to do!
33976 NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
33977 NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
33978 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
33979 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
33980 return DAG.getBitcast(RootVT, Res);
33981 }
33982
33983 // Typically from here on, we need an integer version of MaskVT.
33984 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
33985 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
33986
33987 // Annoyingly, SSE4A instructions don't map into the above match helpers.
33988 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
33989 uint64_t BitLen, BitIdx;
33990 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
33991 Zeroable)) {
33992 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
33993 return SDValue(); // Nothing to do!
33994 V1 = DAG.getBitcast(IntMaskVT, V1);
33995 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
33996 DAG.getTargetConstant(BitLen, DL, MVT::i8),
33997 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
33998 return DAG.getBitcast(RootVT, Res);
33999 }
34000
34001 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
34002 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
34003 return SDValue(); // Nothing to do!
34004 V1 = DAG.getBitcast(IntMaskVT, V1);
34005 V2 = DAG.getBitcast(IntMaskVT, V2);
34006 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
34007 DAG.getTargetConstant(BitLen, DL, MVT::i8),
34008 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
34009 return DAG.getBitcast(RootVT, Res);
34010 }
34011 }
34012
34013 // Don't try to re-form single instruction chains under any circumstances now
34014 // that we've done encoding canonicalization for them.
34015 if (Depth < 1)
34016 return SDValue();
34017
34018 // Depth threshold above which we can efficiently use variable mask shuffles.
34019 int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
34020 AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
34021
34022 bool MaskContainsZeros =
34023 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
34024
34025 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
34026 // If we have a single input lane-crossing shuffle then lower to VPERMV.
34027 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
34028 ((Subtarget.hasAVX2() &&
34029 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
34030 (Subtarget.hasAVX512() &&
34031 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
34032 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
34033 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
34034 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
34035 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
34036 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
34037 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
34038 Res = DAG.getBitcast(MaskVT, V1);
34039 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
34040 return DAG.getBitcast(RootVT, Res);
34041 }
34042
34043 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
34044 // vector as the second source.
34045 if (UnaryShuffle && AllowVariableMask &&
34046 ((Subtarget.hasAVX512() &&
34047 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
34048 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
34049 (Subtarget.hasVLX() &&
34050 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
34051 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
34052 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
34053 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
34054 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
34055 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
34056 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
34057 for (unsigned i = 0; i != NumMaskElts; ++i)
34058 if (Mask[i] == SM_SentinelZero)
34059 Mask[i] = NumMaskElts + i;
34060
34061 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
34062 Res = DAG.getBitcast(MaskVT, V1);
34063 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
34064 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
34065 return DAG.getBitcast(RootVT, Res);
34066 }
34067
34068 // If that failed and either input is extracted then try to combine as a
34069 // shuffle with the larger type.
34070 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
34071 Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
34072 DAG, Subtarget))
34073 return WideShuffle;
34074
34075 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
34076 if (AllowVariableMask && !MaskContainsZeros &&
34077 ((Subtarget.hasAVX512() &&
34078 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
34079 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
34080 (Subtarget.hasVLX() &&
34081 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
34082 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
34083 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
34084 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
34085 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
34086 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
34087 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
34088 V1 = DAG.getBitcast(MaskVT, V1);
34089 V2 = DAG.getBitcast(MaskVT, V2);
34090 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
34091 return DAG.getBitcast(RootVT, Res);
34092 }
34093 return SDValue();
34094 }
34095
34096 // See if we can combine a single input shuffle with zeros to a bit-mask,
34097 // which is much simpler than any shuffle.
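// Editor's note (illustrative sketch, not part of the LLVM source): e.g. a
// v4i32 mask <0, SM_SentinelZero, 2, SM_SentinelZero> keeps lanes 0 and 2 in
// place and zeros the rest, so it folds to an AND with the constant vector
// <-1, 0, -1, 0>.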
34098 if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
34099 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
34100 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
34101 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
34102 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
34103 APInt UndefElts(NumMaskElts, 0);
34104 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
34105 for (unsigned i = 0; i != NumMaskElts; ++i) {
34106 int M = Mask[i];
34107 if (M == SM_SentinelUndef) {
34108 UndefElts.setBit(i);
34109 continue;
34110 }
34111 if (M == SM_SentinelZero)
34112 continue;
34113 EltBits[i] = AllOnes;
34114 }
34115 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
34116 Res = DAG.getBitcast(MaskVT, V1);
34117 unsigned AndOpcode =
34118 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
34119 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
34120 return DAG.getBitcast(RootVT, Res);
34121 }
34122
34123 // If we have a single input shuffle with different shuffle patterns in
34124 // the 128-bit lanes, use the variable mask to VPERMILPS.
34125 // TODO Combine other mask types at higher depths.
34126 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
34127 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
34128 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
34129 SmallVector<SDValue, 16> VPermIdx;
34130 for (int M : Mask) {
34131 SDValue Idx =
34132 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
34133 VPermIdx.push_back(Idx);
34134 }
34135 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
34136 Res = DAG.getBitcast(MaskVT, V1);
34137 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
34138 return DAG.getBitcast(RootVT, Res);
34139 }
34140
34141 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
34142 // to VPERMIL2PD/VPERMIL2PS.
34143 if (AllowVariableMask && Subtarget.hasXOP() &&
34144 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
34145 MaskVT == MVT::v8f32)) {
34146 // VPERMIL2 Operation.
34147 // Bits[3] - Match Bit.
34148 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
34149 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
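// Editor's note (illustrative sketch, not part of the LLVM source): for a
// v4f32 mask, a source element M = 5 (element 1 of the second source) encodes
// below as index 5 = 0b101 (bit 2 selects V2, bits [1:0] pick element 1),
// while a zeroable lane is emitted as 8 with M2ZImm = 2 so the match bit
// zeros it.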
34150 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
34151 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
34152 SmallVector<int, 8> VPerm2Idx;
34153 unsigned M2ZImm = 0;
34154 for (int M : Mask) {
34155 if (M == SM_SentinelUndef) {
34156 VPerm2Idx.push_back(-1);
34157 continue;
34158 }
34159 if (M == SM_SentinelZero) {
34160 M2ZImm = 2;
34161 VPerm2Idx.push_back(8);
34162 continue;
34163 }
34164 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
34165 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
34166 VPerm2Idx.push_back(Index);
34167 }
34168 V1 = DAG.getBitcast(MaskVT, V1);
34169 V2 = DAG.getBitcast(MaskVT, V2);
34170 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
34171 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
34172 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
34173 return DAG.getBitcast(RootVT, Res);
34174 }
34175
34176 // If we have 3 or more shuffle instructions or a chain involving a variable
34177 // mask, we can replace them with a single PSHUFB instruction profitably.
34178 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
34179 // instructions, but in practice PSHUFB tends to be *very* fast so we're
34180 // more aggressive.
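// Editor's note (illustrative sketch, not part of the LLVM source): the
// per-byte PSHUFB control built below uses 255 (high bit set) to zero an
// output byte and otherwise an in-lane byte index; e.g. a 32-bit mask value
// of 2 expands to the four control bytes 8, 9, 10, 11 for that element.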
34181 if (UnaryShuffle && AllowVariableMask &&
34182 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
34183 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
34184 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
34185 SmallVector<SDValue, 16> PSHUFBMask;
34186 int NumBytes = RootVT.getSizeInBits() / 8;
34187 int Ratio = NumBytes / NumMaskElts;
34188 for (int i = 0; i < NumBytes; ++i) {
34189 int M = Mask[i / Ratio];
34190 if (M == SM_SentinelUndef) {
34191 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
34192 continue;
34193 }
34194 if (M == SM_SentinelZero) {
34195 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
34196 continue;
34197 }
34198 M = Ratio * M + i % Ratio;
34199 assert((M / 16) == (i / 16) && "Lane crossing detected");
34200 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
34201 }
34202 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
34203 Res = DAG.getBitcast(ByteVT, V1);
34204 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
34205 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
34206 return DAG.getBitcast(RootVT, Res);
34207 }
34208
34209 // With XOP, if we have a 128-bit binary input shuffle we can always combine
34210 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
34211 // slower than PSHUFB on targets that support both.
34212 if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
34213 // VPPERM Mask Operation
34214 // Bits[4:0] - Byte Index (0 - 31)
34215 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
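// Editor's note (illustrative sketch, not part of the LLVM source): e.g. a
// control byte of 18 selects byte 2 of V2 (bytes 16-31 come from the second
// source), while 128 (0x80, permute op = 4) produces a zero byte, matching
// the SM_SentinelZero handling below.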
34216 SmallVector<SDValue, 16> VPPERMMask;
34217 int NumBytes = 16;
34218 int Ratio = NumBytes / NumMaskElts;
34219 for (int i = 0; i < NumBytes; ++i) {
34220 int M = Mask[i / Ratio];
34221 if (M == SM_SentinelUndef) {
34222 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
34223 continue;
34224 }
34225 if (M == SM_SentinelZero) {
34226 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
34227 continue;
34228 }
34229 M = Ratio * M + i % Ratio;
34230 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
34231 }
34232 MVT ByteVT = MVT::v16i8;
34233 V1 = DAG.getBitcast(ByteVT, V1);
34234 V2 = DAG.getBitcast(ByteVT, V2);
34235 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
34236 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
34237 return DAG.getBitcast(RootVT, Res);
34238 }
34239
34240 // If that failed and either input is extracted then try to combine as a
34241 // shuffle with the larger type.
34242 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
34243 Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
34244 DAG, Subtarget))
34245 return WideShuffle;
34246
34247 // If we have a dual input shuffle then lower to VPERMV3.
34248 if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
34249 ((Subtarget.hasAVX512() &&
34250 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
34251 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
34252 (Subtarget.hasVLX() &&
34253 (MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 ||
34254 MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 ||
34255 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
34256 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
34257 (Subtarget.hasBWI() && Subtarget.hasVLX() &&
34258 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) ||
34259 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
34260 (Subtarget.hasVBMI() && Subtarget.hasVLX() &&
34261 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) {
34262 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
34263 V1 = DAG.getBitcast(MaskVT, V1);
34264 V2 = DAG.getBitcast(MaskVT, V2);
34265 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
34266 return DAG.getBitcast(RootVT, Res);
34267 }
34268
34269 // Failed to find any combines.
34270 return SDValue();
34271}
34272
34273// Combine an arbitrary chain of shuffles + extract_subvectors into a single
34274// instruction if possible.
34275//
34276// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
34277// type size to attempt to combine:
34278// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
34279// -->
34280// extract_subvector(shuffle(x,y,m2),0)
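// Editor's note (illustrative sketch, not part of the LLVM source): e.g.
// with x,y : v8i32,
//   shuffle(extract_subvector(x,4), extract_subvector(y,4), <0,1,4,5>)
// widens to a v8i32 shuffle of {x, y} with mask <4,5,12,13,u,u,u,u>, and the
// low v4i32 of that wider shuffle is then extracted.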
34281static SDValue combineX86ShuffleChainWithExtract(
34282 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
34283 bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
34284 const X86Subtarget &Subtarget) {
34285 unsigned NumMaskElts = BaseMask.size();
34286 unsigned NumInputs = Inputs.size();
34287 if (NumInputs == 0)
34288 return SDValue();
34289
34290 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
34291 SmallVector<unsigned, 4> Offsets(NumInputs, 0);
34292
34293 // Peek through subvectors.
34294 // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
34295 unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits();
34296 for (unsigned i = 0; i != NumInputs; ++i) {
34297 SDValue &Src = WideInputs[i];
34298 unsigned &Offset = Offsets[i];
34299 Src = peekThroughBitcasts(Src);
34300 EVT BaseVT = Src.getValueType();
34301 while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
34302 isa<ConstantSDNode>(Src.getOperand(1))) {
34303 Offset += Src.getConstantOperandVal(1);
34304 Src = Src.getOperand(0);
34305 }
34306 WideSizeInBits = std::max(WideSizeInBits,
34307 (unsigned)Src.getValueSizeInBits());
34308 assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
34309 "Unexpected subvector extraction");
34310 Offset /= BaseVT.getVectorNumElements();
34311 Offset *= NumMaskElts;
34312 }
34313
34314 // Bail if we're always extracting from the lowest subvectors;
34315 // combineX86ShuffleChain should match this for the current width.
34316 if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
34317 return SDValue();
34318
34319 EVT RootVT = Root.getValueType();
34320 unsigned RootSizeInBits = RootVT.getSizeInBits();
34321 unsigned Scale = WideSizeInBits / RootSizeInBits;
34322 assert((WideSizeInBits % RootSizeInBits) == 0 &&
34323 "Unexpected subvector extraction");
34324
34325 // If the src vector types aren't the same, see if we can extend
34326 // them to match each other.
34327 // TODO: Support different scalar types?
34328 EVT WideSVT = WideInputs[0].getValueType().getScalarType();
34329 if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
34330 return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
34331 Op.getValueType().getScalarType() != WideSVT;
34332 }))
34333 return SDValue();
34334
34335 for (SDValue &NewInput : WideInputs) {
34336 assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
34337 "Shuffle vector size mismatch");
34338 if (WideSizeInBits > NewInput.getValueSizeInBits())
34339 NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
34340 SDLoc(NewInput), WideSizeInBits);
34341 assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
34342 "Unexpected subvector extraction");
34343 }
34344
34345 // Create new mask for larger type.
34346 for (unsigned i = 1; i != NumInputs; ++i)
34347 Offsets[i] += i * Scale * NumMaskElts;
34348
34349 SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
34350 for (int &M : WideMask) {
34351 if (M < 0)
34352 continue;
34353 M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
34354 }
34355 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
34356
34357 // Remove unused/repeated shuffle source ops.
34358 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
34359 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
34360
34361 if (WideInputs.size() > 2)
34362 return SDValue();
34363
34364 // Increase depth for every upper subvector we've peeked through.
34365 Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
34366
34367 // Attempt to combine wider chain.
34368 // TODO: Can we use a better Root?
34369 SDValue WideRoot = WideInputs[0];
34370 if (SDValue WideShuffle = combineX86ShuffleChain(
34371 WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
34372 AllowVariableMask, DAG, Subtarget)) {
34373 WideShuffle =
34374 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
34375 return DAG.getBitcast(RootVT, WideShuffle);
34376 }
34377 return SDValue();
34378}
34379
34380// Attempt to constant fold all of the constant source ops.
34381// Returns true if the entire shuffle is folded to a constant.
34382// TODO: Extend this to merge multiple constant Ops and update the mask.
34383static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
34384 ArrayRef<int> Mask, SDValue Root,
34385 bool HasVariableMask,
34386 SelectionDAG &DAG,
34387 const X86Subtarget &Subtarget) {
34388 MVT VT = Root.getSimpleValueType();
34389
34390 unsigned SizeInBits = VT.getSizeInBits();
34391 unsigned NumMaskElts = Mask.size();
34392 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
34393 unsigned NumOps = Ops.size();
34394
34395 // Extract constant bits from each source op.
34396 bool OneUseConstantOp = false;
34397 SmallVector<APInt, 16> UndefEltsOps(NumOps);
34398 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
34399 for (unsigned i = 0; i != NumOps; ++i) {
34400 SDValue SrcOp = Ops[i];
34401 OneUseConstantOp |= SrcOp.hasOneUse();
34402 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
34403 RawBitsOps[i]))
34404 return SDValue();
34405 }
34406
34407 // Only fold if at least one of the constants is only used once or
34408 // the combined shuffle has included a variable mask shuffle; this
34409 // is to avoid constant pool bloat.
34410 if (!OneUseConstantOp && !HasVariableMask)
34411 return SDValue();
34412
34413 // Shuffle the constant bits according to the mask.
34414 APInt UndefElts(NumMaskElts, 0);
34415 APInt ZeroElts(NumMaskElts, 0);
34416 APInt ConstantElts(NumMaskElts, 0);
34417 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
34418 APInt::getNullValue(MaskSizeInBits));
34419 for (unsigned i = 0; i != NumMaskElts; ++i) {
34420 int M = Mask[i];
34421 if (M == SM_SentinelUndef) {
34422 UndefElts.setBit(i);
34423 continue;
34424 } else if (M == SM_SentinelZero) {
34425 ZeroElts.setBit(i);
34426 continue;
34427 }
34428 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
34429
34430 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
34431 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
34432
34433 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
34434 if (SrcUndefElts[SrcMaskIdx]) {
34435 UndefElts.setBit(i);
34436 continue;
34437 }
34438
34439 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
34440 APInt &Bits = SrcEltBits[SrcMaskIdx];
34441 if (!Bits) {
34442 ZeroElts.setBit(i);
34443 continue;
34444 }
34445
34446 ConstantElts.setBit(i);
34447 ConstantBitData[i] = Bits;
34448 }
34449 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
34450
34451 // Create the constant data.
34452 MVT MaskSVT;
34453 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
34454 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
34455 else
34456 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
34457
34458 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
34459
34460 SDLoc DL(Root);
34461 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
34462 return DAG.getBitcast(VT, CstOp);
34463}
34464
34465/// Fully generic combining of x86 shuffle instructions.
34466///
34467/// This should be the last combine run over the x86 shuffle instructions. Once
34468/// they have been fully optimized, this will recursively consider all chains
34469/// of single-use shuffle instructions, build a generic model of the cumulative
34470/// shuffle operation, and check for simpler instructions which implement this
34471/// operation. We use this primarily for two purposes:
34472///
34473/// 1) Collapse generic shuffles to specialized single instructions when
34474/// equivalent. In most cases, this is just an encoding size win, but
34475/// sometimes we will collapse multiple generic shuffles into a single
34476/// special-purpose shuffle.
34477/// 2) Look for sequences of shuffle instructions with 3 or more total
34478/// instructions, and replace them with the slightly more expensive SSSE3
34479/// PSHUFB instruction if available. We do this as the last combining step
34480/// to ensure we avoid using PSHUFB if we can implement the shuffle with
34481/// a suitable short sequence of other instructions. The PSHUFB will either
34482/// use a register or have to read from memory and so is slightly (but only
34483/// slightly) more expensive than the other shuffle instructions.
34484///
34485/// Because this is inherently a quadratic operation (for each shuffle in
34486/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
34487/// This should never be an issue in practice as the shuffle lowering doesn't
34488/// produce sequences of more than 8 instructions.
34489///
34490/// FIXME: We will currently miss some cases where the redundant shuffling
34491/// would simplify under the threshold for PSHUFB formation because of
34492/// combine-ordering. To fix this, we should do the redundant instruction
34493/// combining in this recursive walk.
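// Editor's note (illustrative sketch, not part of the LLVM source): e.g. two
// chained PSHUFD nodes with masks <2,3,0,1> and <1,0,3,2> compose to the
// single mask <3,2,1,0>, which the chain combine below re-lowers as one
// PSHUFD.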
34494static SDValue combineX86ShufflesRecursively(
34495 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
34496 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
34497 bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
34498 const X86Subtarget &Subtarget) {
34499 assert(RootMask.size() > 0 &&
1. Assuming the condition is true
2. Assuming the condition is true
3. '?' condition is true
34500 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
34501 "Illegal shuffle root mask");
34502
34503 // Bound the depth of our recursive combine because this is ultimately
34504 // quadratic in nature.
34505 const unsigned MaxRecursionDepth = 8;
34506 if (Depth >= MaxRecursionDepth)
4. Assuming 'Depth' is < 'MaxRecursionDepth'
5. Taking false branch
34507 return SDValue();
34508
34509 // Directly rip through bitcasts to find the underlying operand.
34510 SDValue Op = SrcOps[SrcOpIndex];
34511 Op = peekThroughOneUseBitcasts(Op);
34512
34513 MVT VT = Op.getSimpleValueType();
34514 if (!VT.isVector())
6. Calling 'MVT::isVector'
10. Returning from 'MVT::isVector'
11. Taking false branch
34515 return SDValue(); // Bail if we hit a non-vector.
34516
34517 assert(Root.getSimpleValueType().isVector() &&
12. '?' condition is true
34518 "Shuffles operate on vector types!");
34519 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
13. '?' condition is true
34520 "Can only combine shuffles of the same vector register size.");
34521
34522 // Extract target shuffle mask and resolve sentinels and inputs.
34523 // TODO - determine Op's demanded elts from RootMask.
34524 SmallVector<int, 64> OpMask;
34525 SmallVector<SDValue, 2> OpInputs;
34526 APInt OpUndef, OpZero;
34527 APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
34528 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
34529 if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
14. Calling 'getTargetShuffleInputs'
31. Returning from 'getTargetShuffleInputs'
32. Taking false branch
34530 OpZero, DAG, Depth, false))
34531 return SDValue();
34532
34533 SmallVector<int, 64> Mask;
34534 SmallVector<SDValue, 16> Ops;
34535
34536 // We don't need to merge masks if the root is empty.
34537 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
33. Assuming 'Depth' is not equal to 0
34538 if (EmptyRoot) {
33.1 'EmptyRoot' is false
34. Taking false branch
34539 // Only resolve zeros if it will remove an input, otherwise we might end
34540 // up in an infinite loop.
34541 bool ResolveKnownZeros = true;
34542 if (!OpZero.isNullValue()) {
34543 APInt UsedInputs = APInt::getNullValue(OpInputs.size());
34544 for (int i = 0, e = OpMask.size(); i != e; ++i) {
34545 int M = OpMask[i];
34546 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
34547 continue;
34548 UsedInputs.setBit(M / OpMask.size());
34549 if (UsedInputs.isAllOnesValue()) {
34550 ResolveKnownZeros = false;
34551 break;
34552 }
34553 }
34554 }
34555 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
34556 ResolveKnownZeros);
34557
34558 Mask = OpMask;
34559 Ops.append(OpInputs.begin(), OpInputs.end());
34560 } else {
34561 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
35. Calling 'resolveTargetShuffleFromZeroables'
42. Returning from 'resolveTargetShuffleFromZeroables'
34562
34563 // Add the inputs to the Ops list, avoiding duplicates.
34564 Ops.append(SrcOps.begin(), SrcOps.end());
34565
34566 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
34567 // Attempt to find an existing match.
34568 SDValue InputBC = peekThroughBitcasts(Input);
34569 for (int i = 0, e = Ops.size(); i < e; ++i)
34570 if (InputBC == peekThroughBitcasts(Ops[i]))
34571 return i;
34572 // Match failed - should we replace an existing Op?
34573 if (InsertionPoint >= 0) {
34574 Ops[InsertionPoint] = Input;
34575 return InsertionPoint;
34576 }
34577 // Add to the end of the Ops list.
34578 Ops.push_back(Input);
34579 return Ops.size() - 1;
34580 };
34581
34582 SmallVector<int, 2> OpInputIdx;
34583 for (SDValue OpInput : OpInputs)
43. Assuming '__begin2' is equal to '__end2'
34584 OpInputIdx.push_back(
34585 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
34586
34587 assert(((RootMask.size() > OpMask.size() &&
44. Assuming the condition is true
45. Calling 'SmallVectorBase::size'
47. Returning from 'SmallVectorBase::size'
48. Division by zero
34588 RootMask.size() % OpMask.size() == 0) ||
34589 (OpMask.size() > RootMask.size() &&
34590 OpMask.size() % RootMask.size() == 0) ||
34591 OpMask.size() == RootMask.size()) &&
34592 "The smaller number of elements must divide the larger.");
34593
34594 // This function can be performance-critical, so we rely on the power-of-2
34595 // knowledge that we have about the mask sizes to replace div/rem ops with
34596 // bit-masks and shifts.
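// Editor's note (illustrative sketch, not part of the LLVM source): for a
// power-of-2 size N, x % N == x & (N - 1) and x / N == x >> log2(N), which
// is what the countTrailingZeros/shift/mask arithmetic below relies on.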
34597 assert(isPowerOf2_32(RootMask.size()) &&
34598 "Non-power-of-2 shuffle mask sizes");
34599 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
34600 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
34601 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
34602
34603 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
34604 unsigned RootRatio =
34605 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
34606 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
34607 assert((RootRatio == 1 || OpRatio == 1) &&
34608 "Must not have a ratio for both incoming and op masks!");
34609
34610 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
34611 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
34612 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
34613 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
34614 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
34615
34616 Mask.resize(MaskWidth, SM_SentinelUndef);
34617
34618 // Merge this shuffle operation's mask into our accumulated mask. Note that
34619 // this shuffle's mask will be the first applied to the input, followed by
34620 // the root mask to get us all the way to the root value arrangement. The
34621 // reason for this order is that we are recursing up the operation chain.
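// Editor's note (illustrative sketch, not part of the LLVM source): e.g. if
// RootMask has 4 elements and OpMask has 8, then MaskWidth = 8 and
// RootRatio = 2, so each root index is rescaled by 2 before being looked up
// through OpMask; OpRatio plays the symmetric role when OpMask is narrower.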
34622 for (unsigned i = 0; i < MaskWidth; ++i) {
34623 unsigned RootIdx = i >> RootRatioLog2;
34624 if (RootMask[RootIdx] < 0) {
34625 // This is a zero or undef lane, we're done.
34626 Mask[i] = RootMask[RootIdx];
34627 continue;
34628 }
34629
34630 unsigned RootMaskedIdx =
34631 RootRatio == 1
34632 ? RootMask[RootIdx]
34633 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
34634
34635 // Just insert the scaled root mask value if it references an input other
34636 // than the SrcOp we're currently inserting.
34637 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
34638 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
34639 Mask[i] = RootMaskedIdx;
34640 continue;
34641 }
34642
34643 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
34644 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
34645 if (OpMask[OpIdx] < 0) {
34646 // The incoming lanes are zero or undef, it doesn't matter which ones we
34647 // are using.
34648 Mask[i] = OpMask[OpIdx];
34649 continue;
34650 }
34651
34652 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
34653 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
34654 : (OpMask[OpIdx] << OpRatioLog2) +
34655 (RootMaskedIdx & (OpRatio - 1));
34656
34657 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
34658 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
34659 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
34660 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
34661
34662 Mask[i] = OpMaskedIdx;
34663 }
34664 }
34665
34666 // Remove unused/repeated shuffle source ops.
34667 resolveTargetShuffleInputsAndMask(Ops, Mask);
34668
34669 // Handle the all undef/zero cases early.
34670 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
34671 return DAG.getUNDEF(Root.getValueType());
34672
34673 // TODO - should we handle the mixed zero/undef case as well? Just returning
34674 // a zero mask will lose information on undef elements possibly reducing
34675 // future combine possibilities.
34676 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
34677 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
34678 SDLoc(Root));
34679
34680 assert(!Ops.empty() && "Shuffle with no inputs detected");
34681 HasVariableMask |= IsOpVariableMask;
34682
34683 // Update the list of shuffle nodes that have been combined so far.
34684 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
34685 SrcNodes.end());
34686 CombinedNodes.push_back(Op.getNode());
34687
34688 // See if we can recurse into each shuffle source op (if it's a target
34689 // shuffle). The source op should only be generally combined if it either has
34690 // a single use (i.e. current Op) or all its users have already been combined;
34691 // if not, then we can still combine but should prevent generation of variable
34692 // shuffles to avoid constant pool bloat.
34693 // Don't recurse if we already have more source ops than we can combine in
34694 // the remaining recursion depth.
34695 if (Ops.size() < (MaxRecursionDepth - Depth)) {
34696 for (int i = 0, e = Ops.size(); i < e; ++i) {
34697 // For empty roots, we need to resolve zeroable elements before combining
34698 // them with other shuffles.
34699 SmallVector<int, 64> ResolvedMask = Mask;
34700 if (EmptyRoot)
34701 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
34702 bool AllowVar = false;
34703 if (Ops[i].getNode()->hasOneUse() ||
34704 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
34705 AllowVar = AllowVariableMask;
34706 if (SDValue Res = combineX86ShufflesRecursively(
34707 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1,
34708 HasVariableMask, AllowVar, DAG, Subtarget))
34709 return Res;
34710 }
34711 }
34712
34713 // Attempt to constant fold all of the constant source ops.
34714 if (SDValue Cst = combineX86ShufflesConstants(
34715 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
34716 return Cst;
34717
34718 // We can only combine unary and binary shuffle mask cases.
34719 if (Ops.size() <= 2) {
34720 // Minor canonicalization of the accumulated shuffle mask to make it easier
34721 // to match below. All this does is detect masks with sequential pairs of
34722 // elements, and shrink them to the half-width mask. It does this in a loop
34723 // so it will reduce the size of the mask to the minimal width mask which
34724 // performs an equivalent shuffle.
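// Editor's note (illustrative sketch, not part of the LLVM source): e.g. the
// 4-element mask <0,1,6,7> has sequential pairs and shrinks to the 2-element
// mask <0,3>; the loop repeats until no further widening is possible.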
34725 SmallVector<int, 64> WidenedMask;
34726 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
34727 Mask = std::move(WidenedMask);
34728 }
34729
34730 // Canonicalization of binary shuffle masks to improve pattern matching by
34731 // commuting the inputs.
34732 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
34733 ShuffleVectorSDNode::commuteMask(Mask);
34734 std::swap(Ops[0], Ops[1]);
34735 }
34736
34737 // Finally, try to combine into a single shuffle instruction.
34738 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
34739 AllowVariableMask, DAG, Subtarget);
34740 }
34741
34742 // If that failed and any input is extracted then try to combine as a
34743 // shuffle with the larger type.
34744 return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
34745 HasVariableMask, AllowVariableMask,
34746 DAG, Subtarget);
34747}
34748
34749/// Helper entry wrapper to combineX86ShufflesRecursively.
34750static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
34751 const X86Subtarget &Subtarget) {
34752 return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
34753 /*HasVarMask*/ false,
34754 /*AllowVarMask*/ true, DAG, Subtarget);
34755}
34756
34757/// Get the PSHUF-style mask from PSHUF node.
34758///
34759 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
34760/// PSHUF-style masks that can be reused with such instructions.
34761static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
34762 MVT VT = N.getSimpleValueType();
34763 SmallVector<int, 4> Mask;
34764 SmallVector<SDValue, 2> Ops;
34765 bool IsUnary;
34766 bool HaveMask =
34767 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
34768 (void)HaveMask;
34769 assert(HaveMask);
34770
34771 // If we have more than 128-bits, only the low 128-bits of shuffle mask
34772 // matter. Check that the upper masks are repeats and remove them.
34773 if (VT.getSizeInBits() > 128) {
34774 int LaneElts = 128 / VT.getScalarSizeInBits();
34775#ifndef NDEBUG
34776 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
34777 for (int j = 0; j < LaneElts; ++j)
34778 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
34779 "Mask doesn't repeat in high 128-bit lanes!");
34780#endif
34781 Mask.resize(LaneElts);
34782 }
34783
34784 switch (N.getOpcode()) {
34785 case X86ISD::PSHUFD:
34786 return Mask;
34787 case X86ISD::PSHUFLW:
34788 Mask.resize(4);
34789 return Mask;
34790 case X86ISD::PSHUFHW:
34791 Mask.erase(Mask.begin(), Mask.begin() + 4);
34792 for (int &M : Mask)
34793 M -= 4;
34794 return Mask;
34795 default:
34796 llvm_unreachable("No valid shuffle instruction found!");
34797 }
34798}
34799
34800/// Search for a combinable shuffle across a chain ending in pshufd.
34801///
34802/// We walk up the chain and look for a combinable shuffle, skipping over
34803/// shuffles that we could hoist this shuffle's transformation past without
34804/// altering anything.
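/// The found shuffle's mask is composed with the incoming mask
/// (NewMask[i] = VMask[Mask[i]]); for example, two back-to-back PSHUFDs with
/// mask <2,3,0,1> compose to the identity mask <0,1,2,3>, which later combines
/// can then remove.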
34805static SDValue
34806combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
34807 SelectionDAG &DAG) {
34808 assert(N.getOpcode() == X86ISD::PSHUFD &&
34809 "Called with something other than an x86 128-bit half shuffle!");
34810 SDLoc DL(N);
34811
34812 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
34813 // of the shuffles in the chain so that we can form a fresh chain to replace
34814 // this one.
34815 SmallVector<SDValue, 8> Chain;
34816 SDValue V = N.getOperand(0);
34817 for (; V.hasOneUse(); V = V.getOperand(0)) {
34818 switch (V.getOpcode()) {
34819 default:
34820 return SDValue(); // Nothing combined!
34821
34822 case ISD::BITCAST:
34823 // Skip bitcasts as we always know the type for the target specific
34824 // instructions.
34825 continue;
34826
34827 case X86ISD::PSHUFD:
34828 // Found another dword shuffle.
34829 break;
34830
34831 case X86ISD::PSHUFLW:
34832 // Check that the low words (being shuffled) are the identity in the
34833 // dword shuffle, and the high words are self-contained.
34834 if (Mask[0] != 0 || Mask[1] != 1 ||
34835 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
34836 return SDValue();
34837
34838 Chain.push_back(V);
34839 continue;
34840
34841 case X86ISD::PSHUFHW:
34842 // Check that the high words (being shuffled) are the identity in the
34843 // dword shuffle, and the low words are self-contained.
34844 if (Mask[2] != 2 || Mask[3] != 3 ||
34845 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
34846 return SDValue();
34847
34848 Chain.push_back(V);
34849 continue;
34850
34851 case X86ISD::UNPCKL:
34852 case X86ISD::UNPCKH:
34853 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
34854 // shuffle into a preceding word shuffle.
34855 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
34856 V.getSimpleValueType().getVectorElementType() != MVT::i16)
34857 return SDValue();
34858
34859 // Search for a half-shuffle which we can combine with.
34860 unsigned CombineOp =
34861 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
34862 if (V.getOperand(0) != V.getOperand(1) ||
34863 !V->isOnlyUserOf(V.getOperand(0).getNode()))
34864 return SDValue();
34865 Chain.push_back(V);
34866 V = V.getOperand(0);
34867 do {
34868 switch (V.getOpcode()) {
34869 default:
34870 return SDValue(); // Nothing to combine.
34871
34872 case X86ISD::PSHUFLW:
34873 case X86ISD::PSHUFHW:
34874 if (V.getOpcode() == CombineOp)
34875 break;
34876
34877 Chain.push_back(V);
34878
34879 LLVM_FALLTHROUGH;
34880 case ISD::BITCAST:
34881 V = V.getOperand(0);
34882 continue;
34883 }
34884 break;
34885 } while (V.hasOneUse());
34886 break;
34887 }
34888 // Break out of the loop if we break out of the switch.
34889 break;
34890 }
34891
34892 if (!V.hasOneUse())
34893 // We fell out of the loop without finding a viable combining instruction.
34894 return SDValue();
34895
34896 // Merge this node's mask and our incoming mask.
34897 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
34898 for (int &M : Mask)
34899 M = VMask[M];
34900 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
34901 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
34902
34903 // Rebuild the chain around this new shuffle.
34904 while (!Chain.empty()) {
34905 SDValue W = Chain.pop_back_val();
34906
34907 if (V.getValueType() != W.getOperand(0).getValueType())
34908 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
34909
34910 switch (W.getOpcode()) {
34911 default:
34912 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
34913
34914 case X86ISD::UNPCKL:
34915 case X86ISD::UNPCKH:
34916 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
34917 break;
34918
34919 case X86ISD::PSHUFD:
34920 case X86ISD::PSHUFLW:
34921 case X86ISD::PSHUFHW:
34922 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
34923 break;
34924 }
34925 }
34926 if (V.getValueType() != N.getValueType())
34927 V = DAG.getBitcast(N.getValueType(), V);
34928
34929 // Return the new chain to replace N.
34930 return V;
34931}
34932
34933// Attempt to commute shufps LHS loads:
34934// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
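// Swapping the SHUFP operands swaps which source feeds the low and high halves
// of its result, so the SHUFP immediate's nibbles are swapped and the user's
// immediate is fixed up below (XORing a 2-bit selector with 0b10 redirects it
// from the old half to the new one).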
34935static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
34936 SelectionDAG &DAG) {
34937 // TODO: Add vXf64 support.
34938 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
34939 return SDValue();
34940
34941 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
34942 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
34943 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
34944 return SDValue();
34945 SDValue N0 = V.getOperand(0);
34946 SDValue N1 = V.getOperand(1);
34947 unsigned Imm = V.getConstantOperandVal(2);
34948 if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
34949 MayFoldLoad(peekThroughOneUseBitcasts(N1)))
34950 return SDValue();
34951 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
34952 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
34953 DAG.getTargetConstant(Imm, DL, MVT::i8));
34954 };
34955
34956 switch (N.getOpcode()) {
34957 case X86ISD::VPERMILPI:
34958 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
34959 unsigned Imm = N.getConstantOperandVal(1);
34960 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
34961 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
34962 }
34963 break;
34964 case X86ISD::SHUFP: {
34965 SDValue N0 = N.getOperand(0);
34966 SDValue N1 = N.getOperand(1);
34967 unsigned Imm = N.getConstantOperandVal(2);
34968 if (N0 == N1) {
34969 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
34970 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
34971 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
34972 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
34973 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
34974 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
34975 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
34976 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
34977 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
34978 }
34979 break;
34980 }
34981 }
34982
34983 return SDValue();
34984}
34985
34986/// Try to combine x86 target specific shuffles.
34987static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
34988 TargetLowering::DAGCombinerInfo &DCI,
34989 const X86Subtarget &Subtarget) {
34990 SDLoc DL(N);
34991 MVT VT = N.getSimpleValueType();
34992 SmallVector<int, 4> Mask;
34993 unsigned Opcode = N.getOpcode();
34994
34995 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
34996 // single instruction.
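// For example, with v2f64 ops: unpcklpd (fhadd A, B), (fhadd C, D) --> fhadd A, C,
// since both compute <A[0]+A[1], C[0]+C[1]>.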
34997 if (VT.getScalarSizeInBits() == 64 &&
34998 (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
34999 Opcode == X86ISD::UNPCKL)) {
35000 auto BC0 = peekThroughBitcasts(N.getOperand(0));
35001 auto BC1 = peekThroughBitcasts(N.getOperand(1));
35002 EVT VT0 = BC0.getValueType();
35003 EVT VT1 = BC1.getValueType();
35004 unsigned Opcode0 = BC0.getOpcode();
35005 unsigned Opcode1 = BC1.getOpcode();
35006 if (Opcode0 == Opcode1 && VT0 == VT1 &&
35007 (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
35008 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
35009 Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
35010 SDValue Lo, Hi;
35011 if (Opcode == X86ISD::MOVSD) {
35012 Lo = BC1.getOperand(0);
35013 Hi = BC0.getOperand(1);
35014 } else {
35015 Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
35016 Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
35017 }
35018 SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
35019 return DAG.getBitcast(VT, Horiz);
35020 }
35021 }
35022
35023 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
35024 return R;
35025
35026 switch (Opcode) {
35027 case X86ISD::VBROADCAST: {
35028 SDValue Src = N.getOperand(0);
35029 SDValue BC = peekThroughBitcasts(Src);
35030 EVT SrcVT = Src.getValueType();
35031 EVT BCVT = BC.getValueType();
35032
35033 // If broadcasting from another shuffle, attempt to simplify it.
35034 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
35035 if (isTargetShuffle(BC.getOpcode()) &&
35036 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
35037 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
35038 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
35039 SM_SentinelUndef);
35040 for (unsigned i = 0; i != Scale; ++i)
35041 DemandedMask[i] = i;
35042 if (SDValue Res = combineX86ShufflesRecursively(
35043 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
35044 /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
35045 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
35046 DAG.getBitcast(SrcVT, Res));
35047 }
35048
35049 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
35050 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
35051 if (Src.getOpcode() == ISD::BITCAST &&
35052 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) {
35053 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
35054 VT.getVectorNumElements());
35055 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
35056 }
35057
35058 // Reduce broadcast source vector to lowest 128-bits.
35059 if (SrcVT.getSizeInBits() > 128)
35060 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
35061 extract128BitVector(Src, 0, DAG, DL));
35062
35063 // broadcast(scalar_to_vector(x)) -> broadcast(x).
35064 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
35065 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
35066
35067 // Share broadcast with the longest vector and extract low subvector (free).
35068 for (SDNode *User : Src->uses())
35069 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
35070 User->getValueSizeInBits(0) > VT.getSizeInBits()) {
35071 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
35072 VT.getSizeInBits());
35073 }
35074
35075 // vbroadcast(scalarload X) -> vbroadcast_load X
35076 // For float loads, extract other uses of the scalar from the broadcast.
35077 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
35078 ISD::isNormalLoad(Src.getNode())) {
35079 LoadSDNode *LN = cast<LoadSDNode>(Src);
35080 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
35081 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
35082 SDValue BcastLd =
35083 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
35084 LN->getMemoryVT(), LN->getMemOperand());
35085 // If the load value is used only by N, replace it via CombineTo N.
35086 bool NoReplaceExtract = Src.hasOneUse();
35087 DCI.CombineTo(N.getNode(), BcastLd);
35088 if (NoReplaceExtract) {
35089 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
35090 DCI.recursivelyDeleteUnusedNodes(LN);
35091 } else {
35092 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
35093 DAG.getIntPtrConstant(0, DL));
35094 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
35095 }
35096 return N; // Return N so it doesn't get rechecked!
35097 }
35098
35099 return SDValue();
35100 }
35101 case X86ISD::BLENDI: {
35102 SDValue N0 = N.getOperand(0);
35103 SDValue N1 = N.getOperand(1);
35104
35105 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
35106 // TODO: Handle MVT::v16i16 repeated blend mask.
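// e.g. scaling a 4-element blend mask 0b0101 by 2 gives the 8-element blend
// mask 0b00110011.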
35107 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
35108 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
35109 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
35110 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
35111 SrcVT.getScalarSizeInBits() >= 32) {
35112 unsigned BlendMask = N.getConstantOperandVal(2);
35113 unsigned Size = VT.getVectorNumElements();
35114 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
35115 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
35116 return DAG.getBitcast(
35117 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
35118 N1.getOperand(0),
35119 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
35120 }
35121 }
35122 return SDValue();
35123 }
35124 case X86ISD::VPERMI: {
35125 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
35126 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
35127 SDValue N0 = N.getOperand(0);
35128 SDValue N1 = N.getOperand(1);
35129 unsigned EltSizeInBits = VT.getScalarSizeInBits();
35130 if (N0.getOpcode() == ISD::BITCAST &&
35131 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
35132 SDValue Src = N0.getOperand(0);
35133 EVT SrcVT = Src.getValueType();
35134 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
35135 return DAG.getBitcast(VT, Res);
35136 }
35137 return SDValue();
35138 }
35139 case X86ISD::VPERM2X128: {
35140 // If both 128-bit values were inserted into high halves of 256-bit values,
35141 // the shuffle can be reduced to a concatenation of subvectors:
35142 // vperm2x128 (ins ?, X, C1), (ins ?, Y, C2), 0x31 --> concat X, Y
35143 // Note: We are only looking for the exact high/high shuffle mask because we
35144 // expect to fold other similar patterns before creating this opcode.
35145 SDValue Ins0 = peekThroughBitcasts(N.getOperand(0));
35146 SDValue Ins1 = peekThroughBitcasts(N.getOperand(1));
35147 unsigned Imm = N.getConstantOperandVal(2);
35148 if (!(Imm == 0x31 &&
35149 Ins0.getOpcode() == ISD::INSERT_SUBVECTOR &&
35150 Ins1.getOpcode() == ISD::INSERT_SUBVECTOR &&
35151 Ins0.getValueType() == Ins1.getValueType() &&
35152 isa<ConstantSDNode>(Ins0.getOperand(2)) &&
35153 isa<ConstantSDNode>(Ins1.getOperand(2))))
35154 return SDValue();
35155
35156 SDValue X = Ins0.getOperand(1);
35157 SDValue Y = Ins1.getOperand(1);
35158 unsigned C1 = Ins0.getConstantOperandVal(2);
35159 unsigned C2 = Ins1.getConstantOperandVal(2);
35160 MVT SrcVT = X.getSimpleValueType();
35161 unsigned SrcElts = SrcVT.getVectorNumElements();
35162 if (SrcVT != Y.getSimpleValueType() || SrcVT.getSizeInBits() != 128 ||
35163 C1 != SrcElts || C2 != SrcElts)
35164 return SDValue();
35165
35166 return DAG.getBitcast(VT, DAG.getNode(ISD::CONCAT_VECTORS, DL,
35167 Ins1.getValueType(), X, Y));
35168 }
35169 case X86ISD::PSHUFD:
35170 case X86ISD::PSHUFLW:
35171 case X86ISD::PSHUFHW:
35172 Mask = getPSHUFShuffleMask(N);
35173 assert(Mask.size() == 4);
35174 break;
35175 case X86ISD::MOVSD:
35176 case X86ISD::MOVSS: {
35177 SDValue N0 = N.getOperand(0);
35178 SDValue N1 = N.getOperand(1);
35179
35180 // Canonicalize scalar FPOps:
35181 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
35182 // If commutable, allow OP(N1[0], N0[0]).
35183 unsigned Opcode1 = N1.getOpcode();
35184 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
35185 Opcode1 == ISD::FDIV) {
35186 SDValue N10 = N1.getOperand(0);
35187 SDValue N11 = N1.getOperand(1);
35188 if (N10 == N0 ||
35189 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
35190 if (N10 != N0)
35191 std::swap(N10, N11);
35192 MVT SVT = VT.getVectorElementType();
35193 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
35194 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
35195 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
35196 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
35197 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
35198 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
35199 }
35200 }
35201
35202 return SDValue();
35203 }
35204 case X86ISD::INSERTPS: {
35205 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
35206 SDValue Op0 = N.getOperand(0);
35207 SDValue Op1 = N.getOperand(1);
35208 SDValue Op2 = N.getOperand(2);
35209 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
35210 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
35211 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
35212 unsigned ZeroMask = InsertPSMask & 0xF;
35213
35214 // If we zero out all elements from Op0 then we don't need to reference it.
35215 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
35216 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
35217 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
35218
35219 // If we zero out the element from Op1 then we don't need to reference it.
35220 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
35221 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
35222 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
35223
35224 // Attempt to merge insertps Op1 with an inner target shuffle node.
35225 SmallVector<int, 8> TargetMask1;
35226 SmallVector<SDValue, 2> Ops1;
35227 APInt KnownUndef1, KnownZero1;
35228 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
35229 KnownZero1)) {
35230 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
35231 // Zero/UNDEF insertion - zero out element and remove dependency.
35232 InsertPSMask |= (1u << DstIdx);
35233 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
35234 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
35235 }
35236 // Update insertps mask srcidx and reference the source input directly.
35237 int M = TargetMask1[SrcIdx];
35238 assert(0 <= M && M < 8 && "Shuffle index out of range");
35239 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
35240 Op1 = Ops1[M < 4 ? 0 : 1];
35241 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
35242 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
35243 }
35244
35245 // Attempt to merge insertps Op0 with an inner target shuffle node.
35246 SmallVector<int, 8> TargetMask0;
35247 SmallVector<SDValue, 2> Ops0;
35248 APInt KnownUndef0, KnownZero0;
35249 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
35250 KnownZero0)) {
35251 bool Updated = false;
35252 bool UseInput00 = false;
35253 bool UseInput01 = false;
35254 for (int i = 0; i != 4; ++i) {
35255 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
35256 // No change if element is already zero or the inserted element.
35257 continue;
35258 } else if (KnownUndef0[i] || KnownZero0[i]) {
35259 // If the target mask is undef/zero then we must zero the element.
35260 InsertPSMask |= (1u << i);
35261 Updated = true;
35262 continue;
35263 }
35264
35265 // The input vector element must be inline.
35266 int M = TargetMask0[i];
35267 if (M != i && M != (i + 4))
35268 return SDValue();
35269
35270 // Determine which inputs of the target shuffle we're using.
35271 UseInput00 |= (0 <= M && M < 4);
35272 UseInput01 |= (4 <= M);
35273 }
35274
35275 // If we're not using both inputs of the target shuffle then use the
35276 // referenced input directly.
35277 if (UseInput00 && !UseInput01) {
35278 Updated = true;
35279 Op0 = Ops0[0];
35280 } else if (!UseInput00 && UseInput01) {
35281 Updated = true;
35282 Op0 = Ops0[1];
35283 }
35284
35285 if (Updated)
35286 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
35287 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
35288 }
35289
35290 // If we're inserting an element from a vbroadcast load, fold the
35291 // load into the X86insertps instruction. We need to convert the scalar
35292 // load to a vector and clear the source lane of the INSERTPS control.
35293 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
35294 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
35295 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
35296 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
35297 MemIntr->getBasePtr(),
35298 MemIntr->getMemOperand());
35299 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
35300 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
35301 Load),
35302 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
35303 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
35304 return Insert;
35305 }
35306 }
35307
35308 return SDValue();
35309 }
35310 default:
35311 return SDValue();
35312 }
35313
35314 // Nuke no-op shuffles that show up after combining.
35315 if (isNoopShuffleMask(Mask))
35316 return N.getOperand(0);
35317
35318 // Look for simplifications involving one or two shuffle instructions.
35319 SDValue V = N.getOperand(0);
35320 switch (N.getOpcode()) {
35321 default:
35322 break;
35323 case X86ISD::PSHUFLW:
35324 case X86ISD::PSHUFHW:
35325 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
35326
35327 // See if this reduces to a PSHUFD which is no more expensive and can
35328 // combine with more operations. Note that it has to at least flip the
35329 // dwords as otherwise it would have been removed as a no-op.
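// For example, PSHUFLW <2,3,0,1> swaps the two low dwords, which is the same
// as PSHUFD <1,0,2,3> on the v4i32 bitcast.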
35330 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
35331 int DMask[] = {0, 1, 2, 3};
35332 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
35333 DMask[DOffset + 0] = DOffset + 1;
35334 DMask[DOffset + 1] = DOffset + 0;
35335 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
35336 V = DAG.getBitcast(DVT, V);
35337 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
35338 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
35339 return DAG.getBitcast(VT, V);
35340 }
35341
35342 // Look for shuffle patterns which can be implemented as a single unpack.
35343 // FIXME: This doesn't handle the location of the PSHUFD generically, and
35344 // only works when we have a PSHUFD followed by two half-shuffles.
35345 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
35346 (V.getOpcode() == X86ISD::PSHUFLW ||
35347 V.getOpcode() == X86ISD::PSHUFHW) &&
35348 V.getOpcode() != N.getOpcode() &&
35349 V.hasOneUse()) {
35350 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
35351 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
35352 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
35353 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
35354 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
35355 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
35356 int WordMask[8];
35357 for (int i = 0; i < 4; ++i) {
35358 WordMask[i + NOffset] = Mask[i] + NOffset;
35359 WordMask[i + VOffset] = VMask[i] + VOffset;
35360 }
35361 // Map the word mask through the DWord mask.
35362 int MappedMask[8];
35363 for (int i = 0; i < 8; ++i)
35364 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
35365 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
35366 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
35367 // We can replace all three shuffles with an unpack.
35368 V = DAG.getBitcast(VT, D.getOperand(0));
35369 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
35370 : X86ISD::UNPCKH,
35371 DL, VT, V, V);
35372 }
35373 }
35374 }
35375
35376 break;
35377
35378 case X86ISD::PSHUFD:
35379 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
35380 return NewN;
35381
35382 break;
35383 }
35384
35385 return SDValue();
35386}
35387
35388/// Checks if the shuffle mask takes subsequent elements
35389/// alternately from two vectors.
35390/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
35391static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
35392
35393 int ParitySrc[2] = {-1, -1};
35394 unsigned Size = Mask.size();
35395 for (unsigned i = 0; i != Size; ++i) {
35396 int M = Mask[i];
35397 if (M < 0)
35398 continue;
35399
35400 // Make sure we are using the matching element from the input.
35401 if ((M % Size) != i)
35402 return false;
35403
35404 // Make sure we use the same input for all elements of the same parity.
35405 int Src = M / Size;
35406 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
35407 return false;
35408 ParitySrc[i % 2] = Src;
35409 }
35410
35411 // Make sure each input is used.
35412 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
35413 return false;
35414
35415 Op0Even = ParitySrc[0] == 0;
35416 return true;
35417}
35418
35419/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
35420/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
35421/// are written to the parameters \p Opnd0 and \p Opnd1.
35422///
35423 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
35424 /// shuffle nodes so they are easier to match generically. We also insert
35425 /// dummy vector shuffle nodes for the operands which explicitly discard the
35426 /// lanes which are unused by this operation, so that the fact that they're
35427 /// unused can flow through the rest of the combiner.
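/// For example, (shuffle (fsub A, B), (fadd A, B), <0,5,2,7>) takes the
/// subtraction results in the even lanes and the addition results in the odd
/// lanes, which is exactly the ADDSUB of A and B.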
35428static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
35429 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
35430 bool &IsSubAdd) {
35431
35432 EVT VT = N->getValueType(0);
35433 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35434 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
35435 !VT.getSimpleVT().isFloatingPoint())
35436 return false;
35437
35438 // We only handle target-independent shuffles.
35439 // FIXME: It would be easy and harmless to use the target shuffle mask
35440 // extraction tool to support more.
35441 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
35442 return false;
35443
35444 SDValue V1 = N->getOperand(0);
35445 SDValue V2 = N->getOperand(1);
35446
35447 // Make sure we have an FADD and an FSUB.
35448 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
35449 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
35450 V1.getOpcode() == V2.getOpcode())
35451 return false;
35452
35453 // If there are other uses of these operations we can't fold them.
35454 if (!V1->hasOneUse() || !V2->hasOneUse())
35455 return false;
35456
35457 // Ensure that both operations have the same operands. Note that we can
35458 // commute the FADD operands.
35459 SDValue LHS, RHS;
35460 if (V1.getOpcode() == ISD::FSUB) {
35461 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
35462 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
35463 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
35464 return false;
35465 } else {
35466 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
35467 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
35468 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
35469 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
35470 return false;
35471 }
35472
35473 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
35474 bool Op0Even;
35475 if (!isAddSubOrSubAddMask(Mask, Op0Even))
35476 return false;
35477
35478 // It's a subadd if the vector in the even parity is an FADD.
35479 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
35480 : V2->getOpcode() == ISD::FADD;
35481
35482 Opnd0 = LHS;
35483 Opnd1 = RHS;
35484 return true;
35485}
35486
35487/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
35488static SDValue combineShuffleToFMAddSub(SDNode *N,
35489 const X86Subtarget &Subtarget,
35490 SelectionDAG &DAG) {
35491 // We only handle target-independent shuffles.
35492 // FIXME: It would be easy and harmless to use the target shuffle mask
35493 // extraction tool to support more.
35494 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
35495 return SDValue();
35496
35497 MVT VT = N->getSimpleValueType(0);
35498 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35499 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
35500 return SDValue();
35501
35502 // We're trying to match (shuffle (fma a, b, c), (X86Fmsub a, b, c)).
35503 SDValue Op0 = N->getOperand(0);
35504 SDValue Op1 = N->getOperand(1);
35505 SDValue FMAdd = Op0, FMSub = Op1;
35506 if (FMSub.getOpcode() != X86ISD::FMSUB)
35507 std::swap(FMAdd, FMSub);
35508
35509 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
35510 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
35511 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
35512 FMAdd.getOperand(2) != FMSub.getOperand(2))
35513 return SDValue();
35514
35515 // Check for correct shuffle mask.
35516 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
35517 bool Op0Even;
35518 if (!isAddSubOrSubAddMask(Mask, Op0Even))
35519 return SDValue();
35520
35521 // FMAddSub takes zeroth operand from FMSub node.
35522 SDLoc DL(N);
35523 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
35524 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
35525 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
35526 FMAdd.getOperand(2));
35527}
35528
35529/// Try to combine a shuffle into a target-specific add-sub or
35530/// mul-add-sub node.
35531static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
35532 const X86Subtarget &Subtarget,
35533 SelectionDAG &DAG) {
35534 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
35535 return V;
35536
35537 SDValue Opnd0, Opnd1;
35538 bool IsSubAdd;
35539 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
35540 return SDValue();
35541
35542 MVT VT = N->getSimpleValueType(0);
35543 SDLoc DL(N);
35544
35545 // Try to generate X86ISD::FMADDSUB node here.
35546 SDValue Opnd2;
35547 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
35548 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
35549 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
35550 }
35551
35552 if (IsSubAdd)
35553 return SDValue();
35554
35555 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
35556 // the ADDSUB idiom has been successfully recognized. There are no known
35557 // X86 targets with 512-bit ADDSUB instructions!
35558 if (VT.is512BitVector())
35559 return SDValue();
35560
35561 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
35562}
35563
35564// We are looking for a shuffle where both sources are concatenated with undef
35565// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
35566// if we can express this as a single-source shuffle, that's preferable.
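// For example (v4i32):
//   shuffle (concat X, undef), (concat Y, undef), <0,4,1,5>
//     --> shuffle (concat X, Y), undef, <0,2,1,3>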
35567static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
35568 const X86Subtarget &Subtarget) {
35569 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
35570 return SDValue();
35571
35572 EVT VT = N->getValueType(0);
35573
35574 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
35575 if (!VT.is128BitVector() && !VT.is256BitVector())
35576 return SDValue();
35577
35578 if (VT.getVectorElementType() != MVT::i32 &&
35579 VT.getVectorElementType() != MVT::i64 &&
35580 VT.getVectorElementType() != MVT::f32 &&
35581 VT.getVectorElementType() != MVT::f64)
35582 return SDValue();
35583
35584 SDValue N0 = N->getOperand(0);
35585 SDValue N1 = N->getOperand(1);
35586
35587 // Check that both sources are concats with undef.
35588 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
35589 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
35590 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
35591 !N1.getOperand(1).isUndef())
35592 return SDValue();
35593
35594 // Construct the new shuffle mask. Elements from the first source retain their
35595 // index, but elements from the second source no longer need to skip an undef.
35596 SmallVector<int, 8> Mask;
35597 int NumElts = VT.getVectorNumElements();
35598
35599 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
35600 for (int Elt : SVOp->getMask())
35601 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
35602
35603 SDLoc DL(N);
35604 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
35605 N1.getOperand(0));
35606 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
35607}
35608
35609/// Eliminate a redundant shuffle of a horizontal math op.
35610static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
35611 unsigned Opcode = N->getOpcode();
35612 if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
35613 if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
35614 return SDValue();
35615
35616 // For a broadcast, peek through an extract element of index 0 to find the
35617 // horizontal op: broadcast (ext_vec_elt HOp, 0)
35618 EVT VT = N->getValueType(0);
35619 if (Opcode == X86ISD::VBROADCAST) {
35620 SDValue SrcOp = N->getOperand(0);
35621 if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
35622 SrcOp.getValueType() == MVT::f64 &&
35623 SrcOp.getOperand(0).getValueType() == VT &&
35624 isNullConstant(SrcOp.getOperand(1)))
35625 N = SrcOp.getNode();
35626 }
35627
35628 SDValue HOp = N->getOperand(0);
35629 if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
35630 HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
35631 return SDValue();
35632
35633 // 128-bit horizontal math instructions are defined to operate on adjacent
35634 // lanes of each operand as:
35635 // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
35636 // ...similarly for v2f64 and v8i16.
35637 if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() &&
35638 HOp.getOperand(0) != HOp.getOperand(1))
35639 return SDValue();
35640
35641 // The shuffle that we are eliminating may have allowed the horizontal op to
35642 // have an undemanded (undefined) operand. Duplicate the other (defined)
35643 // operand to ensure that the results are defined across all lanes without the
35644 // shuffle.
35645 auto updateHOp = [](SDValue HorizOp, SelectionDAG &DAG) {
35646 SDValue X;
35647 if (HorizOp.getOperand(0).isUndef()) {
35648 assert(!HorizOp.getOperand(1).isUndef() && "Not expecting foldable h-op");
35649 X = HorizOp.getOperand(1);
35650 } else if (HorizOp.getOperand(1).isUndef()) {
35651 assert(!HorizOp.getOperand(0).isUndef() && "Not expecting foldable h-op");
35652 X = HorizOp.getOperand(0);
35653 } else {
35654 return HorizOp;
35655 }
35656 return DAG.getNode(HorizOp.getOpcode(), SDLoc(HorizOp),
35657 HorizOp.getValueType(), X, X);
35658 };
35659
35660 // When the operands of a horizontal math op are identical, the low half of
35661 // the result is the same as the high half. If a target shuffle is also
35662 // replicating low and high halves (and without changing the type/length of
35663 // the vector), we don't need the shuffle.
35664 if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {
35665 if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) {
35666 // movddup (hadd X, X) --> hadd X, X
35667 // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
35668 assert((HOp.getValueType() == MVT::v2f64 ||
35669 HOp.getValueType() == MVT::v4f64) && "Unexpected type for h-op");
35670 return updateHOp(HOp, DAG);
35671 }
35672 return SDValue();
35673 }
35674
35675 // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
35676 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
35677 // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
35678 // but this should be tied to whatever horizontal op matching and shuffle
35679 // canonicalization are producing.
35680 if (HOp.getValueSizeInBits() == 128 &&
35681 (isTargetShuffleEquivalent(Mask, {0, 0}) ||
35682 isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
35683 isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
35684 return updateHOp(HOp, DAG);
35685
35686 if (HOp.getValueSizeInBits() == 256 &&
35687 (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
35688 isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
35689 isTargetShuffleEquivalent(
35690 Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
35691 return updateHOp(HOp, DAG);
35692
35693 return SDValue();
35694}
35695
35696/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
35697/// low half of each source vector and does not set any high half elements in
35698/// the destination vector, narrow the shuffle to half its original size.
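/// For example, a v8i32 shuffle of A and B with mask <0,1,8,9,u,u,u,u> only
/// reads the low v4i32 halves of each source, so it can be rebuilt as a v4i32
/// shuffle whose result is (freely) inserted back into a wide undef vector.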
35699static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
35700 if (!Shuf->getValueType(0).isSimple())
35701 return SDValue();
35702 MVT VT = Shuf->getSimpleValueType(0);
35703 if (!VT.is256BitVector() && !VT.is512BitVector())
35704 return SDValue();
35705
35706 // See if we can ignore all of the high elements of the shuffle.
35707 ArrayRef<int> Mask = Shuf->getMask();
35708 if (!isUndefUpperHalf(Mask))
35709 return SDValue();
35710
35711 // Check if the shuffle mask accesses only the low half of each input vector
35712 // (half-index output is 0 or 2).
35713 int HalfIdx1, HalfIdx2;
35714 SmallVector<int, 8> HalfMask(Mask.size() / 2);
35715 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
35716 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
35717 return SDValue();
35718
35719 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
35720 // The trick is knowing that all of the insert/extract are actually free
35721 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
35722 // of narrow inputs into a narrow output, and that is always cheaper than
35723 // the wide shuffle that we started with.
35724 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
35725 Shuf->getOperand(1), HalfMask, HalfIdx1,
35726 HalfIdx2, false, DAG, /*UseConcat*/true);
35727}
35728
35729static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
35730 TargetLowering::DAGCombinerInfo &DCI,
35731 const X86Subtarget &Subtarget) {
35732 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
35733 if (SDValue V = narrowShuffle(Shuf, DAG))
35734 return V;
35735
35736 // If we have legalized the vector types, look for blends of FADD and FSUB
35737 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
35738 SDLoc dl(N);
35739 EVT VT = N->getValueType(0);
35740 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35741 if (TLI.isTypeLegal(VT)) {
35742 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
35743 return AddSub;
35744
35745 if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG))
35746 return HAddSub;
35747 }
35748
35749 // Attempt to combine into a vector load/broadcast.
35750 if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true))
35751 return LD;
35752
35753 // For AVX2, we sometimes want to combine
35754 // (vector_shuffle <mask> (concat_vectors t1, undef)
35755 // (concat_vectors t2, undef))
35756 // Into:
35757 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
35758 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
35759 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
35760 return ShufConcat;
35761
35762 if (isTargetShuffle(N->getOpcode())) {
35763 SDValue Op(N, 0);
35764 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
35765 return Shuffle;
35766
35767 // Try recursively combining arbitrary sequences of x86 shuffle
35768 // instructions into higher-order shuffles. We do this after combining
35769 // specific PSHUF instruction sequences into their minimal form so that we
35770 // can evaluate how many specialized shuffle instructions are involved in
35771 // a particular chain.
35772 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
35773 return Res;
35774
35775 // Simplify source operands based on shuffle mask.
35776 // TODO - merge this into combineX86ShufflesRecursively.
35777 APInt KnownUndef, KnownZero;
35778 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
35779 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI))
35780 return SDValue(N, 0);
35781 }
35782
35783 // Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros
35784 // in the upper 64 bits.
35785 // TODO: Can we generalize this using computeKnownBits.
35786 if (N->getOpcode() == X86ISD::VZEXT_MOVL &&
35787 (VT == MVT::v2f64 || VT == MVT::v2i64) &&
35788 N->getOperand(0).getOpcode() == ISD::BITCAST) {
35789 SDValue In = N->getOperand(0).getOperand(0);
35790 EVT InVT = In.getValueType();
35791 switch (In.getOpcode()) {
35792 default:
35793 break;
35794 case X86ISD::CVTP2SI: case X86ISD::CVTP2UI:
35795 case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI:
35796 case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI:
35797 case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI:
35798 case X86ISD::CVTSI2P: case X86ISD::CVTUI2P:
35799 case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P:
35800 case X86ISD::VFPROUND: case X86ISD::VMFPROUND:
35801 if ((InVT == MVT::v4f32 || InVT == MVT::v4i32) &&
35802 (In.getOperand(0).getValueType() == MVT::v2f64 ||
35803 In.getOperand(0).getValueType() == MVT::v2i64))
35804 return N->getOperand(0); // return the bitcast
35805 break;
35806 case X86ISD::STRICT_CVTTP2SI:
35807 case X86ISD::STRICT_CVTTP2UI:
35808 case X86ISD::STRICT_CVTSI2P:
35809 case X86ISD::STRICT_CVTUI2P:
35810 case X86ISD::STRICT_VFPROUND:
35811 if ((InVT == MVT::v4f32 || InVT == MVT::v4i32) &&
35812 (In.getOperand(1).getValueType() == MVT::v2f64 ||
35813 In.getOperand(1).getValueType() == MVT::v2i64))
35814 return N->getOperand(0); // return the bitcast
35815 break;
35816 case X86ISD::CVTPS2PH:
35817 case X86ISD::MCVTPS2PH:
35818 if (InVT == MVT::v8i16 && In.getOperand(0).getValueType() == MVT::v4f32)
35819 return N->getOperand(0); // return the bitcast
35820 break;
35821 case X86ISD::STRICT_CVTPS2PH:
35822 if (InVT == MVT::v8i16 && In.getOperand(1).getValueType() == MVT::v4f32)
35823 return N->getOperand(0); // return the bitcast
35824 break;
35825 }
35826 }
35827
35828 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
35829 // insert into a zero vector. This helps get VZEXT_MOVL closer to
35830 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
35831 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
35832 if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
35833 N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR &&
35834 N->getOperand(0).hasOneUse() &&
35835 N->getOperand(0).getOperand(0).isUndef() &&
35836 isNullConstant(N->getOperand(0).getOperand(2))) {
35837 SDValue In = N->getOperand(0).getOperand(1);
35838 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In);
35839 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
35840 getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
35841 Movl, N->getOperand(0).getOperand(2));
35842 }
35843
35844 // If this a vzmovl of a full vector load, replace it with a vzload, unless
35845 // the load is volatile.
35846 if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
35847 ISD::isNormalLoad(N->getOperand(0).getNode())) {
35848 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
35849 if (LN->isSimple()) {
35850 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
35851 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
35852 SDValue VZLoad =
35853 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
35854 VT.getVectorElementType(),
35855 LN->getPointerInfo(),
35856 LN->getAlignment(),
35857 MachineMemOperand::MOLoad);
35858 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
35859 return VZLoad;
35860 }
35861 }
35862
35863 return SDValue();
35864}
35865
35866bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
35867 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
35868 TargetLoweringOpt &TLO, unsigned Depth) const {
35869 int NumElts = DemandedElts.getBitWidth();
35870 unsigned Opc = Op.getOpcode();
35871 EVT VT = Op.getValueType();
35872
35873 // Handle special case opcodes.
35874 switch (Opc) {
35875 case X86ISD::PMULDQ:
35876 case X86ISD::PMULUDQ: {
35877 APInt LHSUndef, LHSZero;
35878 APInt RHSUndef, RHSZero;
35879 SDValue LHS = Op.getOperand(0);
35880 SDValue RHS = Op.getOperand(1);
35881 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
35882 Depth + 1))
35883 return true;
35884 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
35885 Depth + 1))
35886 return true;
35887 // Multiply by zero.
35888 KnownZero = LHSZero | RHSZero;
35889 break;
35890 }
35891 case X86ISD::VSHL:
35892 case X86ISD::VSRL:
35893 case X86ISD::VSRA: {
35894 // We only need the bottom 64-bits of the (128-bit) shift amount.
35895 SDValue Amt = Op.getOperand(1);
35896 MVT AmtVT = Amt.getSimpleValueType();
35897 assert(AmtVT.is128BitVector() && "Unexpected value type");
35898
35899 // If the shift amount is reused only as an SSE shift amount then we know
35900 // that only its bottom 64-bits are ever used.
35901 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
35902 unsigned UseOpc = Use->getOpcode();
35903 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
35904 UseOpc == X86ISD::VSRA) &&
35905 Use->getOperand(0) != Amt;
35906 });
35907
35908 APInt AmtUndef, AmtZero;
35909 unsigned NumAmtElts = AmtVT.getVectorNumElements();
35910 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
35911 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
35912 Depth + 1, AssumeSingleUse))
35913 return true;
35914 LLVM_FALLTHROUGH;
35915 }
35916 case X86ISD::VSHLI:
35917 case X86ISD::VSRLI:
35918 case X86ISD::VSRAI: {
35919 SDValue Src = Op.getOperand(0);
35920 APInt SrcUndef;
35921 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
35922 Depth + 1))
35923 return true;
35924 // TODO convert SrcUndef to KnownUndef.
35925 break;
35926 }
35927 case X86ISD::KSHIFTL: {
35928 SDValue Src = Op.getOperand(0);
35929 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
35930 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
35931 unsigned ShiftAmt = Amt->getZExtValue();
35932
35933 if (ShiftAmt == 0)
35934 return TLO.CombineTo(Op, Src);
35935
35936 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
35937 // single shift. We can do this if the bottom bits (which are shifted
35938 // out) are never demanded.
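// For example, with v8i1: kshiftl (kshiftr X, 2), 3 becomes kshiftl X, 1 when
// none of the low 3 elements are demanded.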
35939 if (Src.getOpcode() == X86ISD::KSHIFTR) {
35940 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
35941 unsigned C1 = Src.getConstantOperandVal(1);
35942 unsigned NewOpc = X86ISD::KSHIFTL;
35943 int Diff = ShiftAmt - C1;
35944 if (Diff < 0) {
35945 Diff = -Diff;
35946 NewOpc = X86ISD::KSHIFTR;
35947 }
35948
35949 SDLoc dl(Op);
35950 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
35951 return TLO.CombineTo(
35952 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
35953 }
35954 }
35955
35956 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
35957 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
35958 Depth + 1))
35959 return true;
35960
35961 KnownUndef <<= ShiftAmt;
35962 KnownZero <<= ShiftAmt;
35963 KnownZero.setLowBits(ShiftAmt);
35964 break;
35965 }
35966 case X86ISD::KSHIFTR: {
35967 SDValue Src = Op.getOperand(0);
35968 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
35969 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
35970 unsigned ShiftAmt = Amt->getZExtValue();
35971
35972 if (ShiftAmt == 0)
35973 return TLO.CombineTo(Op, Src);
35974
35975 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
35976 // single shift. We can do this if the top bits (which are shifted
35977 // out) are never demanded.
35978 if (Src.getOpcode() == X86ISD::KSHIFTL) {
35979 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
35980 unsigned C1 = Src.getConstantOperandVal(1);
35981 unsigned NewOpc = X86ISD::KSHIFTR;
35982 int Diff = ShiftAmt - C1;
35983 if (Diff < 0) {
35984 Diff = -Diff;
35985 NewOpc = X86ISD::KSHIFTL;
35986 }
35987
35988 SDLoc dl(Op);
35989 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
35990 return TLO.CombineTo(
35991 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
35992 }
35993 }
35994
35995 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
35996 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
35997 Depth + 1))
35998 return true;
35999
36000 KnownUndef.lshrInPlace(ShiftAmt);
36001 KnownZero.lshrInPlace(ShiftAmt);
36002 KnownZero.setHighBits(ShiftAmt);
36003 break;
36004 }
36005 case X86ISD::CVTSI2P:
36006 case X86ISD::CVTUI2P: {
36007 SDValue Src = Op.getOperand(0);
36008 MVT SrcVT = Src.getSimpleValueType();
36009 APInt SrcUndef, SrcZero;
36010 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
36011 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
36012 Depth + 1))
36013 return true;
36014 break;
36015 }
36016 case X86ISD::PACKSS:
36017 case X86ISD::PACKUS: {
36018 SDValue N0 = Op.getOperand(0);
36019 SDValue N1 = Op.getOperand(1);
36020
36021 APInt DemandedLHS, DemandedRHS;
36022 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
36023
36024 APInt SrcUndef, SrcZero;
36025 if (SimplifyDemandedVectorElts(N0, DemandedLHS, SrcUndef, SrcZero, TLO,
36026 Depth + 1))
36027 return true;
36028 if (SimplifyDemandedVectorElts(N1, DemandedRHS, SrcUndef, SrcZero, TLO,
36029 Depth + 1))
36030 return true;
36031
36032 // Aggressively peek through ops to get at the demanded elts.
36033 // TODO - we should do this for all target/faux shuffles ops.
36034 if (!DemandedElts.isAllOnesValue()) {
36035 APInt DemandedSrcBits =
36036 APInt::getAllOnesValue(N0.getScalarValueSizeInBits());
36037 SDValue NewN0 = SimplifyMultipleUseDemandedBits(
36038 N0, DemandedSrcBits, DemandedLHS, TLO.DAG, Depth + 1);
36039 SDValue NewN1 = SimplifyMultipleUseDemandedBits(
36040 N1, DemandedSrcBits, DemandedRHS, TLO.DAG, Depth + 1);
36041 if (NewN0 || NewN1) {
36042 NewN0 = NewN0 ? NewN0 : N0;
36043 NewN1 = NewN1 ? NewN1 : N1;
36044 return TLO.CombineTo(Op,
36045 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
36046 }
36047 }
36048 break;
36049 }
36050 case X86ISD::HADD:
36051 case X86ISD::HSUB:
36052 case X86ISD::FHADD:
36053 case X86ISD::FHSUB: {
36054 APInt DemandedLHS, DemandedRHS;
36055 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
36056
36057 APInt LHSUndef, LHSZero;
36058 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef,
36059 LHSZero, TLO, Depth + 1))
36060 return true;
36061 APInt RHSUndef, RHSZero;
36062 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef,
36063 RHSZero, TLO, Depth + 1))
36064 return true;
36065 break;
36066 }
36067 case X86ISD::VTRUNC:
36068 case X86ISD::VTRUNCS:
36069 case X86ISD::VTRUNCUS: {
36070 SDValue Src = Op.getOperand(0);
36071 MVT SrcVT = Src.getSimpleValueType();
36072 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
36073 APInt SrcUndef, SrcZero;
36074 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
36075 Depth + 1))
36076 return true;
36077 KnownZero = SrcZero.zextOrTrunc(NumElts);
36078 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
36079 break;
36080 }
36081 case X86ISD::BLENDV: {
36082 APInt SelUndef, SelZero;
36083 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
36084 SelZero, TLO, Depth + 1))
36085 return true;
36086
36087 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
36088 APInt LHSUndef, LHSZero;
36089 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
36090 LHSZero, TLO, Depth + 1))
36091 return true;
36092
36093 APInt RHSUndef, RHSZero;
36094 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
36095 RHSZero, TLO, Depth + 1))
36096 return true;
36097
36098 KnownZero = LHSZero & RHSZero;
36099 KnownUndef = LHSUndef & RHSUndef;
36100 break;
36101 }
36102 case X86ISD::VBROADCAST: {
36103 SDValue Src = Op.getOperand(0);
36104 MVT SrcVT = Src.getSimpleValueType();
36105 if (!SrcVT.isVector())
36106 return false;
36107 // Don't bother broadcasting if we just need the 0'th element.
36108 if (DemandedElts == 1) {
36109 if (Src.getValueType() != VT)
36110 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
36111 SDLoc(Op));
36112 return TLO.CombineTo(Op, Src);
36113 }
36114 APInt SrcUndef, SrcZero;
36115 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
36116 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
36117 Depth + 1))
36118 return true;
36119 break;
36120 }
36121 case X86ISD::VPERMV: {
36122 SDValue Mask = Op.getOperand(0);
36123 APInt MaskUndef, MaskZero;
36124 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
36125 Depth + 1))
36126 return true;
36127 break;
36128 }
36129 case X86ISD::PSHUFB:
36130 case X86ISD::VPERMV3:
36131 case X86ISD::VPERMILPV: {
36132 SDValue Mask = Op.getOperand(1);
36133 APInt MaskUndef, MaskZero;
36134 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
36135 Depth + 1))
36136 return true;
36137 break;
36138 }
36139 case X86ISD::VPPERM:
36140 case X86ISD::VPERMIL2: {
36141 SDValue Mask = Op.getOperand(2);
36142 APInt MaskUndef, MaskZero;
36143 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
36144 Depth + 1))
36145 return true;
36146 break;
36147 }
36148 }
36149
36150 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
36151 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
36152 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
36153 if ((VT.is256BitVector() || VT.is512BitVector()) &&
36154 DemandedElts.lshr(NumElts / 2) == 0) {
36155 unsigned SizeInBits = VT.getSizeInBits();
36156 unsigned ExtSizeInBits = SizeInBits / 2;
36157
36158 // See if 512-bit ops only use the bottom 128-bits.
36159 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
36160 ExtSizeInBits = SizeInBits / 4;
36161
36162 switch (Opc) {
36163 // Zero upper elements.
36164 case X86ISD::VZEXT_MOVL: {
36165 SDLoc DL(Op);
36166 SDValue Ext0 =
36167 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
36168 SDValue ExtOp =
36169 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0);
36170 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
36171 SDValue Insert =
36172 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
36173 return TLO.CombineTo(Op, Insert);
36174 }
36175 // Subvector broadcast.
36176 case X86ISD::SUBV_BROADCAST: {
36177 SDLoc DL(Op);
36178 SDValue Src = Op.getOperand(0);
36179 if (Src.getValueSizeInBits() > ExtSizeInBits)
36180 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
36181 else if (Src.getValueSizeInBits() < ExtSizeInBits) {
36182 MVT SrcSVT = Src.getSimpleValueType().getScalarType();
36183 MVT SrcVT =
36184 MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits());
36185 Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src);
36186 }
36187 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
36188 TLO.DAG, DL, ExtSizeInBits));
36189 }
36190 // Byte shifts by immediate.
36191 case X86ISD::VSHLDQ:
36192 case X86ISD::VSRLDQ:
36193 // Shift by uniform.
36194 case X86ISD::VSHL:
36195 case X86ISD::VSRL:
36196 case X86ISD::VSRA:
36197 // Shift by immediate.
36198 case X86ISD::VSHLI:
36199 case X86ISD::VSRLI:
36200 case X86ISD::VSRAI: {
36201 SDLoc DL(Op);
36202 SDValue Ext0 =
36203 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
36204 SDValue ExtOp =
36205 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
36206 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
36207 SDValue Insert =
36208 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
36209 return TLO.CombineTo(Op, Insert);
36210 }
36211 case X86ISD::VPERMI: {
36212 // Simplify PERMPD/PERMQ to extract_subvector.
36213 // TODO: This should be done in shuffle combining.
36214 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
36215 SmallVector<int, 4> Mask;
36216 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
36217 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
36218 SDLoc DL(Op);
36219 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
36220 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
36221 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
36222 return TLO.CombineTo(Op, Insert);
36223 }
36224 }
36225 break;
36226 }
36227 // Target Shuffles.
36228 case X86ISD::PSHUFB:
36229 case X86ISD::UNPCKL:
36230 case X86ISD::UNPCKH:
36231 // Saturated Packs.
36232 case X86ISD::PACKSS:
36233 case X86ISD::PACKUS:
36234 // Horizontal Ops.
36235 case X86ISD::HADD:
36236 case X86ISD::HSUB:
36237 case X86ISD::FHADD:
36238 case X86ISD::FHSUB: {
36239 SDLoc DL(Op);
36240 MVT ExtVT = VT.getSimpleVT();
36241 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
36242 ExtSizeInBits / ExtVT.getScalarSizeInBits());
36243 SDValue Ext0 =
36244 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
36245 SDValue Ext1 =
36246 extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits);
36247 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1);
36248 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
36249 SDValue Insert =
36250 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
36251 return TLO.CombineTo(Op, Insert);
36252 }
36253 }
36254 }
36255
36256 // Get target/faux shuffle mask.
36257 APInt OpUndef, OpZero;
36258 SmallVector<int, 64> OpMask;
36259 SmallVector<SDValue, 2> OpInputs;
36260 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
36261 OpZero, TLO.DAG, Depth, false))
36262 return false;
36263
36264 // Shuffle inputs must be the same size as the result.
36265 if (OpMask.size() != (unsigned)NumElts ||
36266 llvm::any_of(OpInputs, [VT](SDValue V) {
36267 return VT.getSizeInBits() != V.getValueSizeInBits() ||
36268 !V.getValueType().isVector();
36269 }))
36270 return false;
36271
36272 KnownZero = OpZero;
36273 KnownUndef = OpUndef;
36274
36275 // Check if shuffle mask can be simplified to undef/zero/identity.
36276 int NumSrcs = OpInputs.size();
36277 for (int i = 0; i != NumElts; ++i)
36278 if (!DemandedElts[i])
36279 OpMask[i] = SM_SentinelUndef;
36280
36281 if (isUndefInRange(OpMask, 0, NumElts)) {
36282 KnownUndef.setAllBits();
36283 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
36284 }
36285 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
36286 KnownZero.setAllBits();
36287 return TLO.CombineTo(
36288 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
36289 }
36290 for (int Src = 0; Src != NumSrcs; ++Src)
36291 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
36292 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
36293
36294 // Attempt to simplify inputs.
36295 for (int Src = 0; Src != NumSrcs; ++Src) {
36296 // TODO: Support inputs of different types.
36297 if (OpInputs[Src].getValueType() != VT)
36298 continue;
36299
36300 int Lo = Src * NumElts;
36301 APInt SrcElts = APInt::getNullValue(NumElts);
36302 for (int i = 0; i != NumElts; ++i)
36303 if (DemandedElts[i]) {
36304 int M = OpMask[i] - Lo;
36305 if (0 <= M && M < NumElts)
36306 SrcElts.setBit(M);
36307 }
36308
36309 // TODO - Propagate input undef/zero elts.
36310 APInt SrcUndef, SrcZero;
36311 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
36312 TLO, Depth + 1))
36313 return true;
36314 }
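// Illustrative sketch only (not part of X86ISelLowering.cpp): the loop above
// maps each demanded *output* element back through the shuffle mask to the
// *input* element it reads, one source operand at a time. The helper below is
// a standalone scalar version of that bookkeeping, using a plain 64-bit mask
// in place of APInt and -1 for undef/zero sentinels; the name is ours.
#include <cstdint>
#include <vector>

static std::vector<uint64_t>
demandedSrcElts(const std::vector<int> &Mask, uint64_t DemandedOut,
                int NumSrcs, int NumElts) {
  std::vector<uint64_t> Demanded(NumSrcs, 0);
  for (int I = 0; I != NumElts; ++I) {
    if (!(DemandedOut & (1ULL << I)))
      continue; // output element not demanded -> no input element needed
    int M = Mask[I];
    if (M < 0)
      continue; // undef/zero sentinel reads no input element
    Demanded[M / NumElts] |= 1ULL << (M % NumElts);
  }
  return Demanded;
}

// E.g. a 4-element mask {4, 1, 5, 0} over two sources with DemandedOut = 0b0101
// demands element 0 of source 1 (via lane 0) and element 1 of source 1 (via
// lane 2), and nothing at all from source 0.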
36315
36316 // If we don't demand all elements, then attempt to combine to a simpler
36317 // shuffle.
36318 // TODO: Handle other depths, but first we need to handle the fact that
36319 // it might combine to the same shuffle.
36320 if (!DemandedElts.isAllOnesValue() && Depth == 0) {
36321 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
36322 for (int i = 0; i != NumElts; ++i)
36323 if (DemandedElts[i])
36324 DemandedMask[i] = i;
36325
36326 SDValue NewShuffle = combineX86ShufflesRecursively(
36327 {Op}, 0, Op, DemandedMask, {}, Depth, /*HasVarMask*/ false,
36328 /*AllowVarMask*/ true, TLO.DAG, Subtarget);
36329 if (NewShuffle)
36330 return TLO.CombineTo(Op, NewShuffle);
36331 }
36332
36333 return false;
36334}
36335
36336bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
36337 SDValue Op, const APInt &OriginalDemandedBits,
36338 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
36339 unsigned Depth) const {
36340 EVT VT = Op.getValueType();
36341 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
36342 unsigned Opc = Op.getOpcode();
36343 switch(Opc) {
36344 case X86ISD::PMULDQ:
36345 case X86ISD::PMULUDQ: {
36346 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
36347 KnownBits KnownOp;
36348 SDValue LHS = Op.getOperand(0);
36349 SDValue RHS = Op.getOperand(1);
36350 // FIXME: Can we bound this better?
36351 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
36352 if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
36353 TLO, Depth + 1))
36354 return true;
36355 if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
36356 TLO, Depth + 1))
36357 return true;
36358
36359 // Aggressively peek through ops to get at the demanded low bits.
36360 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
36361 LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
36362 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
36363 RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
36364 if (DemandedLHS || DemandedRHS) {
36365 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
36366 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
36367 return TLO.CombineTo(
36368 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
36369 }
36370 break;
36371 }
36372 case X86ISD::VSHLI: {
36373 SDValue Op0 = Op.getOperand(0);
36374
36375 unsigned ShAmt = Op.getConstantOperandVal(1);
36376 if (ShAmt >= BitWidth)
36377 break;
36378
36379 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
36380
36381 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
36382 // single shift. We can do this if the bottom bits (which are shifted
36383 // out) are never demanded.
36384 if (Op0.getOpcode() == X86ISD::VSRLI &&
36385 OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
36386 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
36387 if (Shift2Amt < BitWidth) {
36388 int Diff = ShAmt - Shift2Amt;
36389 if (Diff == 0)
36390 return TLO.CombineTo(Op, Op0.getOperand(0));
36391
36392 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
36393 SDValue NewShift = TLO.DAG.getNode(
36394 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
36395 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
36396 return TLO.CombineTo(Op, NewShift);
36397 }
36398 }
36399
36400 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
36401 TLO, Depth + 1))
36402 return true;
36403
36404 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
36405 Known.Zero <<= ShAmt;
36406 Known.One <<= ShAmt;
36407
36408 // Low bits known zero.
36409 Known.Zero.setLowBits(ShAmt);
36410 break;
36411 }
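// Illustrative sketch only (not part of X86ISelLowering.cpp): a scalar model
// of the ((X >>u C1) << C2) fold above. When every demanded bit lies at or
// above bit C2, the two-shift form and the merged single shift can only differ
// in the low C2 bits, so they agree on everything that is demanded. The helper
// names are invented; the check brute-forces all 8-bit values and shift pairs.
#include <cassert>
#include <cstdint>

static uint8_t mergedShift(uint8_t X, unsigned C1, unsigned C2) {
  // Single-shift replacement chosen exactly as in the code above.
  if (C1 == C2)
    return X;
  return C2 > C1 ? (uint8_t)(X << (C2 - C1)) : (uint8_t)(X >> (C1 - C2));
}

static void checkShiftMergeModel() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned C1 = 0; C1 < 8; ++C1)
      for (unsigned C2 = 0; C2 < 8; ++C2) {
        uint8_t TwoShifts = (uint8_t)(((uint8_t)X >> C1) << C2);
        uint8_t DemandedMask = (uint8_t)(0xFFu << C2); // >= C2 trailing zeros
        assert((TwoShifts & DemandedMask) ==
               (mergedShift((uint8_t)X, C1, C2) & DemandedMask));
      }
}
// (Call checkShiftMergeModel() from a test main() to exercise the model.)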
36412 case X86ISD::VSRLI: {
36413 unsigned ShAmt = Op.getConstantOperandVal(1);
36414 if (ShAmt >= BitWidth)
36415 break;
36416
36417 APInt DemandedMask = OriginalDemandedBits << ShAmt;
36418
36419 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
36420 OriginalDemandedElts, Known, TLO, Depth + 1))
36421 return true;
36422
36423 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
36424 Known.Zero.lshrInPlace(ShAmt);
36425 Known.One.lshrInPlace(ShAmt);
36426
36427 // High bits known zero.
36428 Known.Zero.setHighBits(ShAmt);
36429 break;
36430 }
36431 case X86ISD::VSRAI: {
36432 SDValue Op0 = Op.getOperand(0);
36433 SDValue Op1 = Op.getOperand(1);
36434
36435 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
36436 if (ShAmt >= BitWidth)
36437 break;
36438
36439 APInt DemandedMask = OriginalDemandedBits << ShAmt;
36440
36441 // If we just want the sign bit then we don't need to shift it.
36442 if (OriginalDemandedBits.isSignMask())
36443 return TLO.CombineTo(Op, Op0);
36444
36445 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
36446 if (Op0.getOpcode() == X86ISD::VSHLI &&
36447 Op.getOperand(1) == Op0.getOperand(1)) {
36448 SDValue Op00 = Op0.getOperand(0);
36449 unsigned NumSignBits =
36450 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
36451 if (ShAmt < NumSignBits)
36452 return TLO.CombineTo(Op, Op00);
36453 }
36454
36455 // If any of the demanded bits are produced by the sign extension, we also
36456 // demand the input sign bit.
36457 if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
36458 DemandedMask.setSignBit();
36459
36460 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
36461 TLO, Depth + 1))
36462 return true;
36463
36464 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
36465 Known.Zero.lshrInPlace(ShAmt);
36466 Known.One.lshrInPlace(ShAmt);
36467
36468 // If the input sign bit is known to be zero, or if none of the top bits
36469 // are demanded, turn this into an unsigned shift right.
36470 if (Known.Zero[BitWidth - ShAmt - 1] ||
36471 OriginalDemandedBits.countLeadingZeros() >= ShAmt)
36472 return TLO.CombineTo(
36473 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
36474
36475 // High bits are known one.
36476 if (Known.One[BitWidth - ShAmt - 1])
36477 Known.One.setHighBits(ShAmt);
36478 break;
36479 }
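// Illustrative sketch only (not part of X86ISelLowering.cpp): scalar checks of
// the two VSRAI simplifications above on 8-bit lanes, assuming the usual
// two's-complement arithmetic right shift (as on x86). (1) An arithmetic shift
// never changes the sign bit, so if only the sign bit is demanded the shift
// can be dropped. (2) Outside the top ShAmt bits that the sign extension
// fills, an arithmetic and a logical shift agree, so the node can become VSRLI
// when none of those top bits are demanded. Helper name is ours.
#include <cassert>
#include <cstdint>

static void checkVsraiModel() {
  for (int X = -128; X < 128; ++X)
    for (unsigned ShAmt = 0; ShAmt < 8; ++ShAmt) {
      int8_t Ashr = (int8_t)((int8_t)X >> ShAmt);     // arithmetic shift
      uint8_t Lshr = (uint8_t)((uint8_t)X >> ShAmt);  // logical shift
      // (1) The sign bit is preserved by the arithmetic shift.
      assert(((uint8_t)Ashr & 0x80u) == ((uint8_t)X & 0x80u));
      // (2) The two shifts agree on every bit outside the top ShAmt bits.
      uint8_t LowMask = (uint8_t)(0xFFu >> ShAmt);
      assert(((uint8_t)Ashr & LowMask) == (Lshr & LowMask));
    }
}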
36480 case X86ISD::PEXTRB:
36481 case X86ISD::PEXTRW: {
36482 SDValue Vec = Op.getOperand(0);
36483 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
36484 MVT VecVT = Vec.getSimpleValueType();
36485 unsigned NumVecElts = VecVT.getVectorNumElements();
36486
36487 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
36488 unsigned Idx = CIdx->getZExtValue();
36489 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
36490
36491 // If we demand no bits from the vector then we must have demanded
36492 // bits from the implicit zext - simplify to zero.
36493 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
36494 if (DemandedVecBits == 0)
36495 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
36496
36497 APInt KnownUndef, KnownZero;
36498 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
36499 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
36500 KnownZero, TLO, Depth + 1))
36501 return true;
36502
36503 KnownBits KnownVec;
36504 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
36505 KnownVec, TLO, Depth + 1))
36506 return true;
36507
36508 if (SDValue V = SimplifyMultipleUseDemandedBits(
36509 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
36510 return TLO.CombineTo(
36511 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
36512
36513 Known = KnownVec.zext(BitWidth);
36514 return false;
36515 }
36516 break;
36517 }
36518 case X86ISD::PINSRB:
36519 case X86ISD::PINSRW: {
36520 SDValue Vec = Op.getOperand(0);
36521 SDValue Scl = Op.getOperand(1);
36522 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
36523 MVT VecVT = Vec.getSimpleValueType();
36524
36525 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
36526 unsigned Idx = CIdx->getZExtValue();
36527 if (!OriginalDemandedElts[Idx])
36528 return TLO.CombineTo(Op, Vec);
36529
36530 KnownBits KnownVec;
36531 APInt DemandedVecElts(OriginalDemandedElts);
36532 DemandedVecElts.clearBit(Idx);
36533 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
36534 KnownVec, TLO, Depth + 1))
36535 return true;
36536
36537 KnownBits KnownScl;
36538 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
36539 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
36540 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
36541 return true;
36542
36543 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
36544 Known.One = KnownVec.One & KnownScl.One;
36545 Known.Zero = KnownVec.Zero & KnownScl.Zero;
36546 return false;
36547 }
36548 break;
36549 }
36550 case X86ISD::PACKSS:
36551 // PACKSS saturates to MIN/MAX integer values. So if we just want the
36552 // sign bit then we can just ask for the source operands' sign bit.
36553 // TODO - add known bits handling.
36554 if (OriginalDemandedBits.isSignMask()) {
36555 APInt DemandedLHS, DemandedRHS;
36556 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
36557
36558 KnownBits KnownLHS, KnownRHS;
36559 APInt SignMask = APInt::getSignMask(BitWidth * 2);
36560 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
36561 KnownLHS, TLO, Depth + 1))
36562 return true;
36563 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
36564 KnownRHS, TLO, Depth + 1))
36565 return true;
36566
36567 // Attempt to avoid multi-use ops if we don't need anything from them.
36568 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
36569 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
36570 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
36571 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
36572 if (DemandedOp0 || DemandedOp1) {
36573 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
36574 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
36575 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
36576 }
36577 }
36578 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
36579 break;
36580 case X86ISD::PCMPGT:
36581 // icmp sgt(0, R) == ashr(R, BitWidth-1).
36582 // iff we only need the sign bit then we can use R directly.
36583 if (OriginalDemandedBits.isSignMask() &&
36584 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
36585 return TLO.CombineTo(Op, Op.getOperand(1));
36586 break;
36587 case X86ISD::MOVMSK: {
36588 SDValue Src = Op.getOperand(0);
36589 MVT SrcVT = Src.getSimpleValueType();
36590 unsigned SrcBits = SrcVT.getScalarSizeInBits();
36591 unsigned NumElts = SrcVT.getVectorNumElements();
36592
36593 // If we don't need the sign bits at all just return zero.
36594 if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
36595 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
36596
36597 // Only demand the vector elements of the sign bits we need.
36598 APInt KnownUndef, KnownZero;
36599 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
36600 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
36601 TLO, Depth + 1))
36602 return true;
36603
36604 Known.Zero = KnownZero.zextOrSelf(BitWidth);
36605 Known.Zero.setHighBits(BitWidth - NumElts);
36606
36607 // MOVMSK only uses the MSB from each vector element.
36608 KnownBits KnownSrc;
36609 if (SimplifyDemandedBits(Src, APInt::getSignMask(SrcBits), DemandedElts,
36610 KnownSrc, TLO, Depth + 1))
36611 return true;
36612
36613 if (KnownSrc.One[SrcBits - 1])
36614 Known.One.setLowBits(NumElts);
36615 else if (KnownSrc.Zero[SrcBits - 1])
36616 Known.Zero.setLowBits(NumElts);
36617 return false;
36618 }
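// Illustrative sketch only (not part of X86ISelLowering.cpp): a scalar model
// of MOVMSK over byte elements (as in PMOVMSKB), showing the two facts the
// demanded-bits code above relies on: only the sign bit of each source element
// matters, and every result bit at or above NumElts is always zero. The helper
// names are invented for the sketch.
#include <cassert>
#include <cstddef>
#include <cstdint>

static uint32_t movmskModel(const uint8_t *Elts, size_t NumElts) {
  uint32_t Mask = 0;
  for (size_t I = 0; I != NumElts; ++I)
    Mask |= (uint32_t)(Elts[I] >> 7) << I; // MSB of element I -> result bit I
  return Mask;
}

static void checkMovmskModel() {
  const uint8_t Elts[4] = {0x80, 0x7F, 0xFF, 0x00};
  assert(movmskModel(Elts, 4) == 0b0101u);
  // Bits 4..31 of the result are always zero, so demanding only those bits
  // lets the whole node fold to the constant 0, as done above.
}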
36619 case X86ISD::BEXTR: {
36620 SDValue Op0 = Op.getOperand(0);
36621 SDValue Op1 = Op.getOperand(1);
36622
36623 // Only bottom 16-bits of the control bits are required.
36624 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
36625 // NOTE: SimplifyDemandedBits won't do this for constants.
36626 const APInt &Val1 = Cst1->getAPIntValue();
36627 APInt MaskedVal1 = Val1 & 0xFFFF;
36628 if (MaskedVal1 != Val1) {
36629 SDLoc DL(Op);
36630 return TLO.CombineTo(
36631 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
36632 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
36633 }
36634 }
36635
36636 KnownBits Known1;
36637 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
36638 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
36639 return true;
36640
36641 // If the length is 0, replace with 0.
36642 KnownBits LengthBits = Known1.extractBits(8, 8);
36643 if (LengthBits.isZero())
36644 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
36645
36646 break;
36647 }
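// Illustrative sketch only (not part of X86ISelLowering.cpp): a scalar model
// of BEXTR's control operand. Bits [7:0] give the start position and bits
// [15:8] the length, so control bits 16 and above never affect the result, and
// a zero length always yields zero -- the two facts exploited above. The
// helper names are invented for the sketch.
#include <cassert>
#include <cstdint>

static uint32_t bextrModel(uint32_t Src, uint32_t Control) {
  unsigned Start = Control & 0xFF;
  unsigned Length = (Control >> 8) & 0xFF;
  if (Start >= 32 || Length == 0)
    return 0;
  uint64_t Shifted = (uint64_t)Src >> Start;
  uint64_t Mask = Length >= 32 ? ~0ULL : ((1ULL << Length) - 1);
  return (uint32_t)(Shifted & Mask);
}

static void checkBextrModel() {
  assert(bextrModel(0xDEADBEEFu, 0x0804u) == 0xEEu);     // 8 bits from bit 4
  assert(bextrModel(0xDEADBEEFu, 0xABCD0804u) == 0xEEu); // high bits ignored
  assert(bextrModel(0xDEADBEEFu, 0x0004u) == 0);         // length 0 -> 0
}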
36648 }
36649
36650 return TargetLowering::SimplifyDemandedBitsForTargetNode(
36651 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
36652}
36653
36654SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
36655 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
36656 SelectionDAG &DAG, unsigned Depth) const {
36657 int NumElts = DemandedElts.getBitWidth();
36658 unsigned Opc = Op.getOpcode();
36659 EVT VT = Op.getValueType();
36660
36661 switch (Opc) {
36662 case X86ISD::PINSRB:
36663 case X86ISD::PINSRW: {
36664 // If we don't demand the inserted element, return the base vector.
36665 SDValue Vec = Op.getOperand(0);
36666 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
36667 MVT VecVT = Vec.getSimpleValueType();
36668 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
36669 !DemandedElts[CIdx->getZExtValue()])
36670 return Vec;
36671 break;
36672 }
36673 case X86ISD::PCMPGT:
36674 // icmp sgt(0, R) == ashr(R, BitWidth-1).
36675 // iff we only need the sign bit then we can use R directly.
36676 if (DemandedBits.isSignMask() &&
36677 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
36678 return Op.getOperand(1);
36679 break;
36680 }
36681
36682 APInt ShuffleUndef, ShuffleZero;
36683 SmallVector<int, 16> ShuffleMask;
36684 SmallVector<SDValue, 2> ShuffleOps;
36685 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
36686 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
36687 // If all the demanded elts are from one operand and are inline,
36688 // then we can use the operand directly.
36689 int NumOps = ShuffleOps.size();
36690 if (ShuffleMask.size() == (unsigned)NumElts &&
36691 llvm::all_of(ShuffleOps, [VT](SDValue V) {
36692 return VT.getSizeInBits() == V.getValueSizeInBits();
36693 })) {
36694
36695 if (DemandedElts.isSubsetOf(ShuffleUndef))
36696 return DAG.getUNDEF(VT);
36697 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
36698 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
36699
36700 // Bitmask that indicates which ops have only been accessed 'inline'.
36701 APInt IdentityOp = APInt::getAllOnesValue(NumOps);
36702 for (int i = 0; i != NumElts; ++i) {
36703 int M = ShuffleMask[i];
36704 if (!DemandedElts[i] || ShuffleUndef[i])
36705 continue;
36706 int Op = M / NumElts;
36707 int Index = M % NumElts;
36708 if (M < 0 || Index != i) {
36709 IdentityOp.clearAllBits();
36710 break;
36711 }
36712 IdentityOp &= APInt::getOneBitSet(NumOps, Op);
36713 if (IdentityOp == 0)
36714 break;
36715 }
36716 assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
36717 "Multiple identity shuffles detected");
36718
36719 if (IdentityOp != 0)
36720 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
36721 }
36722 }
36723
36724 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
36725 Op, DemandedBits, DemandedElts, DAG, Depth);
36726}
36727
36728// Helper to peek through bitops/setcc to determine size of source vector.
36729// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
36730static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
36731 switch (Src.getOpcode()) {
36732 case ISD::SETCC:
36733 return Src.getOperand(0).getValueSizeInBits() == Size;
36734 case ISD::AND:
36735 case ISD::XOR:
36736 case ISD::OR:
36737 return checkBitcastSrcVectorSize(Src.getOperand(0), Size) &&
36738 checkBitcastSrcVectorSize(Src.getOperand(1), Size);
36739 }
36740 return false;
36741}
36742
36743// Helper to push sign extension of vXi1 SETCC result through bitops.
36744static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
36745 SDValue Src, const SDLoc &DL) {
36746 switch (Src.getOpcode()) {
36747 case ISD::SETCC:
36748 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
36749 case ISD::AND:
36750 case ISD::XOR:
36751 case ISD::OR:
36752 return DAG.getNode(
36753 Src.getOpcode(), DL, SExtVT,
36754 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
36755 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
36756 }
36757 llvm_unreachable("Unexpected node type for vXi1 sign extension");
36758}
36759
36760// Try to match patterns such as
36761// (i16 bitcast (v16i1 x))
36762// ->
36763// (i16 movmsk (16i8 sext (v16i1 x)))
36764// before the illegal vector is scalarized on subtargets that don't have legal
36765// vxi1 types.
36766static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
36767 const SDLoc &DL,
36768 const X86Subtarget &Subtarget) {
36769 EVT SrcVT = Src.getValueType();
36770 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
36771 return SDValue();
36772
36773 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
36774 // movmskb even with avx512. This will be better than truncating to vXi1 and
36775 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
36776 // vpcmpeqb/vpcmpgtb.
36777 bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
36778 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
36779 Src.getOperand(0).getValueType() == MVT::v32i8 ||
36780 Src.getOperand(0).getValueType() == MVT::v64i8);
36781
36782 // With AVX512 vxi1 types are legal and we prefer using k-regs.
36783 // MOVMSK is supported in SSE2 or later.
36784 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !IsTruncated))
36785 return SDValue();
36786
36787 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
36788 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
36789 // v8i16 and v16i16.
36790 // For these two cases, we can shuffle the upper element bytes to a
36791 // consecutive sequence at the start of the vector and treat the results as
36792 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
36793 // for v16i16 this is not the case, because the shuffle is expensive, so we
36794 // avoid sign-extending to this type entirely.
36795 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
36796 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
36797 MVT SExtVT;
36798 bool PropagateSExt = false;
36799 switch (SrcVT.getSimpleVT().SimpleTy) {
36800 default:
36801 return SDValue();
36802 case MVT::v2i1:
36803 SExtVT = MVT::v2i64;
36804 break;
36805 case MVT::v4i1:
36806 SExtVT = MVT::v4i32;
36807 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
36808 // sign-extend to a 256-bit operation to avoid truncation.
36809 if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) {
36810 SExtVT = MVT::v4i64;
36811 PropagateSExt = true;
36812 }
36813 break;
36814 case MVT::v8i1:
36815 SExtVT = MVT::v8i16;
36816 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
36817 // sign-extend to a 256-bit operation to match the compare.
36818 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
36819 // 256-bit because the shuffle is cheaper than sign extending the result of
36820 // the compare.
36821 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) ||
36822 checkBitcastSrcVectorSize(Src, 512))) {
36823 SExtVT = MVT::v8i32;
36824 PropagateSExt = true;
36825 }
36826 break;
36827 case MVT::v16i1:
36828 SExtVT = MVT::v16i8;
36829 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
36830 // it is not profitable to sign-extend to 256-bit because this will
36831 // require an extra cross-lane shuffle which is more expensive than
36832 // truncating the result of the compare to 128-bits.
36833 break;
36834 case MVT::v32i1:
36835 SExtVT = MVT::v32i8;
36836 break;
36837 case MVT::v64i1:
36838 // If we have AVX512F, but not AVX512BW, and the input is a truncate from
36839 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
36840 if (Subtarget.hasAVX512()) {
36841 if (Subtarget.hasBWI())
36842 return SDValue();
36843 SExtVT = MVT::v64i8;
36844 break;
36845 }
36846 // Split if this is a <64 x i8> comparison result.
36847 if (checkBitcastSrcVectorSize(Src, 512)) {
36848 SExtVT = MVT::v64i8;
36849 break;
36850 }
36851 return SDValue();
36852 };
36853
36854 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
36855 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
36856
36857 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
36858 V = getPMOVMSKB(DL, V, DAG, Subtarget);
36859 } else {
36860 if (SExtVT == MVT::v8i16)
36861 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
36862 DAG.getUNDEF(MVT::v8i16));
36863 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
36864 }
36865
36866 EVT IntVT =
36867 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
36868 V = DAG.getZExtOrTrunc(V, DL, IntVT);
36869 return DAG.getBitcast(VT, V);
36870}
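// Illustrative sketch only (not part of X86ISelLowering.cpp): why the rewrite
// above is sound. On a little-endian target such as x86, bitcasting <N x i1>
// to an N-bit integer packs lane I into bit I; sign-extending each i1 lane to
// a byte turns a set lane into 0xFF, whose MSB is that same bit, so MOVMSK of
// the sign-extended vector reproduces the packed mask. The helper name is ours.
#include <cassert>
#include <cstdint>

static void checkBitcastVsMovmskModel() {
  const bool Lanes[8] = {true, false, true, true, false, false, false, true};
  uint8_t PackedBitcast = 0, Movmsk = 0;
  for (unsigned I = 0; I != 8; ++I) {
    PackedBitcast |= (uint8_t)(Lanes[I] ? 1u : 0u) << I; // i8 bitcast of v8i1
    uint8_t SExt = Lanes[I] ? 0xFF : 0x00;               // v8i8 sext of v8i1
    Movmsk |= (uint8_t)(SExt >> 7) << I;                 // MSB -> result bit I
  }
  assert(PackedBitcast == Movmsk);
}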
36871
36872// Convert a vXi1 constant build vector to the same width scalar integer.
36873static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
36874 EVT SrcVT = Op.getValueType();
36875 assert(SrcVT.getVectorElementType() == MVT::i1 &&
36876 "Expected a vXi1 vector");
36877 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
36878 "Expected a constant build vector");
36879
36880 APInt Imm(SrcVT.getVectorNumElements(), 0);
36881 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
36882 SDValue In = Op.getOperand(Idx);
36883 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
36884 Imm.setBit(Idx);
36885 }
36886 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
36887 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
36888}
36889
36890static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
36891 TargetLowering::DAGCombinerInfo &DCI,
36892 const X86Subtarget &Subtarget) {
36893 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
36894
36895 if (!DCI.isBeforeLegalizeOps())
36896 return SDValue();
36897
36898 // Only do this if we have k-registers.
36899 if (!Subtarget.hasAVX512())
36900 return SDValue();
36901
36902 EVT DstVT = N->getValueType(0);
36903 SDValue Op = N->getOperand(0);
36904 EVT SrcVT = Op.getValueType();
36905
36906 if (!Op.hasOneUse())
36907 return SDValue();
36908
36909 // Look for logic ops.
36910 if (Op.getOpcode() != ISD::AND &&
36911 Op.getOpcode() != ISD::OR &&
36912 Op.getOpcode() != ISD::XOR)
36913 return SDValue();
36914
36915 // Make sure we have a bitcast between mask registers and a scalar type.
36916 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
36917 DstVT.isScalarInteger()) &&
36918 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
36919 SrcVT.isScalarInteger()))
36920 return SDValue();
36921
36922 SDValue LHS = Op.getOperand(0);
36923 SDValue RHS = Op.getOperand(1);
36924
36925 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
36926 LHS.getOperand(0).getValueType() == DstVT)
36927 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
36928 DAG.getBitcast(DstVT, RHS));
36929
36930 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
36931 RHS.getOperand(0).getValueType() == DstVT)
36932 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
36933 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
36934
36935 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
36936 // Most of these have to move a constant from the scalar domain anyway.
36937 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
36938 RHS = combinevXi1ConstantToInteger(RHS, DAG);
36939 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
36940 DAG.getBitcast(DstVT, LHS), RHS);
36941 }
36942
36943 return SDValue();
36944}
36945
36946static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
36947 const X86Subtarget &Subtarget) {
36948 SDLoc DL(BV);
36949 unsigned NumElts = BV->getNumOperands();
36950 SDValue Splat = BV->getSplatValue();
36951
36952 // Build MMX element from integer GPR or SSE float values.
36953 auto CreateMMXElement = [&](SDValue V) {
36954 if (V.isUndef())
36955 return DAG.getUNDEF(MVT::x86mmx);
36956 if (V.getValueType().isFloatingPoint()) {
36957 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
36958 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
36959 V = DAG.getBitcast(MVT::v2i64, V);
36960 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
36961 }
36962 V = DAG.getBitcast(MVT::i32, V);
36963 } else {
36964 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
36965 }
36966 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
36967 };
36968
36969 // Convert build vector ops to MMX data in the bottom elements.
36970 SmallVector<SDValue, 8> Ops;
36971
36972 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
36973 if (Splat) {
36974 if (Splat.isUndef())
36975 return DAG.getUNDEF(MVT::x86mmx);
36976
36977 Splat = CreateMMXElement(Splat);
36978
36979 if (Subtarget.hasSSE1()) {
36980 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
36981 if (NumElts == 8)
36982 Splat = DAG.getNode(
36983 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
36984 DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
36985 Splat);
36986
36987 // Use PSHUFW to repeat 16-bit elements.
36988 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
36989 return DAG.getNode(
36990 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
36991 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32),
36992 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
36993 }
36994 Ops.append(NumElts, Splat);
36995 } else {
36996 for (unsigned i = 0; i != NumElts; ++i)
36997 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
36998 }
36999
37000 // Use tree of PUNPCKLs to build up general MMX vector.
37001 while (Ops.size() > 1) {
37002 unsigned NumOps = Ops.size();
37003 unsigned IntrinOp =
37004 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
37005 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
37006 : Intrinsic::x86_mmx_punpcklbw));
37007 SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
37008 for (unsigned i = 0; i != NumOps; i += 2)
37009 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
37010 Ops[i], Ops[i + 1]);
37011 Ops.resize(NumOps / 2);
37012 }
37013
37014 return Ops[0];
37015}
37016
37017static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
37018 TargetLowering::DAGCombinerInfo &DCI,
37019 const X86Subtarget &Subtarget) {
37020 SDValue N0 = N->getOperand(0);
37021 EVT VT = N->getValueType(0);
37022 EVT SrcVT = N0.getValueType();
37023
37024 // Try to match patterns such as
37025 // (i16 bitcast (v16i1 x))
37026 // ->
37027 // (i16 movmsk (16i8 sext (v16i1 x)))
37028 // before the setcc result is scalarized on subtargets that don't have legal
37029 // vxi1 types.
37030 if (DCI.isBeforeLegalize()) {
37031 SDLoc dl(N);
37032 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
37033 return V;
37034
37035 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
37036 // legalization destroys the v4i32 type.
37037 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 &&
37038 VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC &&
37039 N0.getOperand(0).getValueType() == MVT::v4i32 &&
37040 ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
37041 cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETLT) {
37042 SDValue N00 = N0.getOperand(0);
37043 // Only do this if we can avoid scalarizing the input.
37044 if (ISD::isNormalLoad(N00.getNode()) ||
37045 (N00.getOpcode() == ISD::BITCAST &&
37046 N00.getOperand(0).getValueType() == MVT::v4f32)) {
37047 SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32,
37048 DAG.getBitcast(MVT::v4f32, N00));
37049 return DAG.getZExtOrTrunc(V, dl, VT);
37050 }
37051 }
37052
37053 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
37054 // type, widen both sides to avoid a trip through memory.
37055 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
37056 Subtarget.hasAVX512()) {
37057 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
37058 N0 = DAG.getBitcast(MVT::v8i1, N0);
37059 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
37060 DAG.getIntPtrConstant(0, dl));
37061 }
37062
37063 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
37064 // type, widen both sides to avoid a trip through memory.
37065 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
37066 Subtarget.hasAVX512()) {
37067 // Use zeros for the widening if we already have some zeroes. This can
37068 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
37069 // stream of this.
37070 // FIXME: It might make sense to detect a concat_vectors with a mix of
37071 // zeroes and undef and turn it into insert_subvector for i1 vectors as
37072 // a separate combine. What we can't do is canonicalize the operands of
37073 // such a concat or we'll get into a loop with SimplifyDemandedBits.
37074 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
37075 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
37076 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
37077 SrcVT = LastOp.getValueType();
37078 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
37079 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
37080 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
37081 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
37082 N0 = DAG.getBitcast(MVT::i8, N0);
37083 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
37084 }
37085 }
37086
37087 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
37088 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
37089 Ops[0] = N0;
37090 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
37091 N0 = DAG.getBitcast(MVT::i8, N0);
37092 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
37093 }
37094 }
37095
37096 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
37097 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
37098 // due to insert_subvector legalization on KNL. By promoting the copy to i16
37099 // we can help with known bits propagation from the vXi1 domain to the
37100 // scalar domain.
37101 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
37102 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
37103 N0.getOperand(0).getValueType() == MVT::v16i1 &&
37104 isNullConstant(N0.getOperand(1)))
37105 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
37106 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
37107
37108 // Combine (bitcast (vbroadcast_load)) -> (vbroadcast_load). The memory VT
37109 // determines the number of bits loaded. Remaining bits are zero.
37110 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
37111 VT.getScalarSizeInBits() == SrcVT.getScalarSizeInBits()) {
37112 auto *BCast = cast<MemIntrinsicSDNode>(N0);
37113 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37114 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
37115 SDValue ResNode =
37116 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
37117 VT.getVectorElementType(),
37118 BCast->getMemOperand());
37119 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
37120 return ResNode;
37121 }
37122
37123 // Since MMX types are special and don't usually play with other vector types,
37124 // it's better to handle them early to be sure we emit efficient code by
37125 // avoiding store-load conversions.
37126 if (VT == MVT::x86mmx) {
37127 // Detect MMX constant vectors.
37128 APInt UndefElts;
37129 SmallVector<APInt, 1> EltBits;
37130 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
37131 SDLoc DL(N0);
37132 // Handle zero-extension of i32 with MOVD.
37133 if (EltBits[0].countLeadingZeros() >= 32)
37134 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
37135 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
37136 // Else, bitcast to a double.
37137 // TODO - investigate supporting sext 32-bit immediates on x86_64.
37138 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
37139 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
37140 }
37141
37142 // Detect bitcasts to x86mmx low word.
37143 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
37144 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
37145 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
37146 bool LowUndef = true, AllUndefOrZero = true;
37147 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
37148 SDValue Op = N0.getOperand(i);
37149 LowUndef &= Op.isUndef() || (i >= e/2);
37150 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
37151 }
37152 if (AllUndefOrZero) {
37153 SDValue N00 = N0.getOperand(0);
37154 SDLoc dl(N00);
37155 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
37156 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
37157 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
37158 }
37159 }
37160
37161 // Detect bitcasts of 64-bit build vectors and convert to a
37162 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
37163 // lowest element.
37164 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
37165 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
37166 SrcVT == MVT::v8i8))
37167 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
37168
37169 // Detect bitcasts between element or subvector extraction to x86mmx.
37170 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
37171 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
37172 isNullConstant(N0.getOperand(1))) {
37173 SDValue N00 = N0.getOperand(0);
37174 if (N00.getValueType().is128BitVector())
37175 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
37176 DAG.getBitcast(MVT::v2i64, N00));
37177 }
37178
37179 // Detect bitcasts from FP_TO_SINT to x86mmx.
37180 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
37181 SDLoc DL(N0);
37182 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
37183 DAG.getUNDEF(MVT::v2i32));
37184 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
37185 DAG.getBitcast(MVT::v2i64, Res));
37186 }
37187 }
37188
37189 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
37190 // most of these to scalar anyway.
37191 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
37192 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
37193 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
37194 return combinevXi1ConstantToInteger(N0, DAG);
37195 }
37196
37197 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
37198 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
37199 isa<ConstantSDNode>(N0)) {
37200 auto *C = cast<ConstantSDNode>(N0);
37201 if (C->isAllOnesValue())
37202 return DAG.getConstant(1, SDLoc(N0), VT);
37203 if (C->isNullValue())
37204 return DAG.getConstant(0, SDLoc(N0), VT);
37205 }
37206
37207 // Try to remove bitcasts from input and output of mask arithmetic to
37208 // remove GPR<->K-register crossings.
37209 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
37210 return V;
37211
37212 // Convert a bitcasted integer logic operation that has one bitcasted
37213 // floating-point operand into a floating-point logic operation. This may
37214 // create a load of a constant, but that is cheaper than materializing the
37215 // constant in an integer register and transferring it to an SSE register or
37216 // transferring the SSE operand to integer register and back.
37217 unsigned FPOpcode;
37218 switch (N0.getOpcode()) {
37219 case ISD::AND: FPOpcode = X86ISD::FAND; break;
37220 case ISD::OR: FPOpcode = X86ISD::FOR; break;
37221 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
37222 default: return SDValue();
37223 }
37224
37225 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
37226 (Subtarget.hasSSE2() && VT == MVT::f64)))
37227 return SDValue();
37228
37229 SDValue LogicOp0 = N0.getOperand(0);
37230 SDValue LogicOp1 = N0.getOperand(1);
37231 SDLoc DL0(N0);
37232
37233 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
37234 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
37235 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
37236 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
37237 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
37238 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
37239 }
37240 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
37241 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
37242 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
37243 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
37244 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
37245 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
37246 }
37247
37248 return SDValue();
37249}
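// Illustrative sketch only (not part of X86ISelLowering.cpp): the FAND/FOR/
// FXOR rewrite above is legal because AND/OR/XOR operate on raw bits, so it
// does not matter whether those bits are currently viewed as an integer or as
// a float. A scalar model using memcpy as the "bitcast"; helper names are ours.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t bitsOf(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits;
}

static float floatOf(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

static void checkBitcastLogicModel() {
  float X = 1.5f;
  uint32_t SignBit = 0x80000000u; // integer mask that flips the f32 sign
  // bitcast(xor(bitcast(X), SignBit)) computed in the integer domain:
  uint32_t IntForm = bitsOf(X) ^ SignBit;
  // "FXOR(X, bitcast(SignBit))" followed by a bitcast back to i32:
  float FXorResult = floatOf(bitsOf(X) ^ bitsOf(floatOf(SignBit)));
  assert(bitsOf(FXorResult) == IntForm);
  assert(FXorResult == -1.5f); // the combined op really is an FP negate
}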
37250
37251 // Given an ABS node, detect the following pattern:
37252// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
37253// This is useful as it is the input into a SAD pattern.
37254static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
37255 SDValue AbsOp1 = Abs->getOperand(0);
37256 if (AbsOp1.getOpcode() != ISD::SUB)
37257 return false;
37258
37259 Op0 = AbsOp1.getOperand(0);
37260 Op1 = AbsOp1.getOperand(1);
37261
37262 // Check if the operands of the sub are zero-extended from vectors of i8.
37263 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
37264 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
37265 Op1.getOpcode() != ISD::ZERO_EXTEND ||
37266 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
37267 return false;
37268
37269 return true;
37270}
37271
37272// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
37273// to these zexts.
37274static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
37275 const SDValue &Zext1, const SDLoc &DL,
37276 const X86Subtarget &Subtarget) {
37277 // Find the appropriate width for the PSADBW.
37278 EVT InVT = Zext0.getOperand(0).getValueType();
37279 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
37280
37281 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
37282 // fill in the missing vector elements with 0.
37283 unsigned NumConcat = RegSize / InVT.getSizeInBits();
37284 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
37285 Ops[0] = Zext0.getOperand(0);
37286 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
37287 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
37288 Ops[0] = Zext1.getOperand(0);
37289 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
37290
37291 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
37292 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
37293 ArrayRef<SDValue> Ops) {
37294 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
37295 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
37296 };
37297 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
37298 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
37299 PSADBWBuilder);
37300}
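// Illustrative sketch only (not part of X86ISelLowering.cpp): a scalar model
// of what one 64-bit chunk of PSADBW computes, i.e. the value that the
// zext/sub/abs pattern detected by detectZextAbsDiff reduces to: the sum of
// absolute differences of eight unsigned bytes, zero-extended into a 64-bit
// lane. The helper names are invented for the sketch.
#include <cassert>
#include <cstdint>
#include <cstdlib>

static uint64_t psadbwChunkModel(const uint8_t A[8], const uint8_t B[8]) {
  uint64_t Sum = 0;
  for (int I = 0; I != 8; ++I)
    Sum += (uint64_t)std::abs((int)A[I] - (int)B[I]);
  return Sum; // at most 8 * 255, so the upper bits of the lane are zero
}

static void checkPsadbwModel() {
  const uint8_t A[8] = {10, 0, 255, 3, 4, 5, 6, 7};
  const uint8_t B[8] = {7, 2, 0, 3, 9, 5, 6, 200};
  assert(psadbwChunkModel(A, B) == 3 + 2 + 255 + 0 + 5 + 0 + 0 + 193);
}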
37301
37302 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
37303// PHMINPOSUW.
37304static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
37305 const X86Subtarget &Subtarget) {
37306 // Bail without SSE41.
37307 if (!Subtarget.hasSSE41())
37308 return SDValue();
37309
37310 EVT ExtractVT = Extract->getValueType(0);
37311 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
37312 return SDValue();
37313
37314 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
37315 ISD::NodeType BinOp;
37316 SDValue Src = DAG.matchBinOpReduction(
37317 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
37318 if (!Src)
37319 return SDValue();
37320
37321 EVT SrcVT = Src.getValueType();
37322 EVT SrcSVT = SrcVT.getScalarType();
37323 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
37324 return SDValue();
37325
37326 SDLoc DL(Extract);
37327 SDValue MinPos = Src;
37328
37329 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
37330 while (SrcVT.getSizeInBits() > 128) {
37331 unsigned NumElts = SrcVT.getVectorNumElements();
37332 unsigned NumSubElts = NumElts / 2;
37333 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
37334 unsigned SubSizeInBits = SrcVT.getSizeInBits();
37335 SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
37336 SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
37337 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
37338 }
37339 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
37340 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
37341 "Unexpected value type");
37342
37343 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
37344 // to flip the value accordingly.
37345 SDValue Mask;
37346 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
37347 if (BinOp == ISD::SMAX)
37348 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
37349 else if (BinOp == ISD::SMIN)
37350 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
37351 else if (BinOp == ISD::UMAX)
37352 Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
37353
37354 if (Mask)
37355 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
37356
37357 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
37358 // shuffling each upper element down and inserting zeros. This means that the
37359 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
37360 // ready for the PHMINPOS.
37361 if (ExtractVT == MVT::i8) {
37362 SDValue Upper = DAG.getVectorShuffle(
37363 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
37364 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
37365 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
37366 }
37367
37368 // Perform the PHMINPOS on a v8i16 vector,
37369 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
37370 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
37371 MinPos = DAG.getBitcast(SrcVT, MinPos);
37372
37373 if (Mask)
37374 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
37375
37376 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
37377 DAG.getIntPtrConstant(0, DL));
37378}
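// Illustrative sketch only (not part of X86ISelLowering.cpp): the XOR-mask
// trick used above. PHMINPOSUW only performs an unsigned-min reduction, so the
// SMAX/SMIN/UMAX cases are mapped onto it by XORing every element with a fixed
// mask, reducing with unsigned min, and XORing the result back. A scalar check
// of the three mappings on 16-bit elements; the helper name is ours, and the
// signed casts assume the usual two's-complement representation.
#include <algorithm>
#include <cassert>
#include <cstdint>

static void checkMinMaxMaskTrick(uint16_t A, uint16_t B) {
  auto ViaUmin = [](uint16_t X, uint16_t Y, uint16_t Mask) -> uint16_t {
    return (uint16_t)(std::min<uint16_t>(X ^ Mask, Y ^ Mask) ^ Mask);
  };
  // UMAX: flip all bits (mask = 0xFFFF).
  assert(ViaUmin(A, B, 0xFFFF) == std::max(A, B));
  // SMIN: flip the sign bit (mask = 0x8000) to turn signed order into unsigned.
  assert((int16_t)ViaUmin(A, B, 0x8000) == std::min((int16_t)A, (int16_t)B));
  // SMAX: flip everything but the sign bit (mask = 0x7FFF).
  assert((int16_t)ViaUmin(A, B, 0x7FFF) == std::max((int16_t)A, (int16_t)B));
}
// E.g. checkMinMaxMaskTrick(0x8001, 0x0002) exercises a negative/positive pair.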
37379
37380// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
37381static SDValue combineHorizontalPredicateResult(SDNode *Extract,
37382 SelectionDAG &DAG,
37383 const X86Subtarget &Subtarget) {
37384 // Bail without SSE2.
37385 if (!Subtarget.hasSSE2())
37386 return SDValue();
37387
37388 EVT ExtractVT = Extract->getValueType(0);
37389 unsigned BitWidth = ExtractVT.getSizeInBits();
37390 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
37391 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
37392 return SDValue();
37393
37394 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
37395 ISD::NodeType BinOp;
37396 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
37397 if (!Match && ExtractVT == MVT::i1)
37398 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
37399 if (!Match)
37400 return SDValue();
37401
37402 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
37403 // which we can't support here for now.
37404 if (Match.getScalarValueSizeInBits() != BitWidth)
37405 return SDValue();
37406
37407 SDValue Movmsk;
37408 SDLoc DL(Extract);
37409 EVT MatchVT = Match.getValueType();
37410 unsigned NumElts = MatchVT.getVectorNumElements();
37411 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
37412 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37413
37414 if (ExtractVT == MVT::i1) {
37415 // Special case for (pre-legalization) vXi1 reductions.
37416 if (NumElts > 64 || !isPowerOf2_32(NumElts))
37417 return SDValue();
37418 if (TLI.isTypeLegal(MatchVT)) {
37419 // If this is a legal AVX512 predicate type then we can just bitcast.
37420 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
37421 Movmsk = DAG.getBitcast(MovmskVT, Match);
37422 } else {
37423 // Use combineBitcastvxi1 to create the MOVMSK.
37424 while (NumElts > MaxElts) {
37425 SDValue Lo, Hi;
37426 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
37427 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
37428 NumElts /= 2;
37429 }
37430 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
37431 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
37432 }
37433 if (!Movmsk)
37434 return SDValue();
37435 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
37436 } else {
37437 // FIXME: Better handling of k-registers or 512-bit vectors?
37438 unsigned MatchSizeInBits = Match.getValueSizeInBits();
37439 if (!(MatchSizeInBits == 128 ||
37440 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
37441 return SDValue();
37442
37443 // Make sure this isn't a vector of 1 element. The perf win from using
37444 // MOVMSK diminishes with fewer elements in the reduction, but it is
37445 // generally better to get the comparison over to the GPRs as soon as
37446 // possible to reduce the number of vector ops.
37447 if (Match.getValueType().getVectorNumElements() < 2)
37448 return SDValue();
37449
37450 // Check that we are extracting a reduction of all sign bits.
37451 if (DAG.ComputeNumSignBits(Match) != BitWidth)
37452 return SDValue();
37453
37454 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
37455 SDValue Lo, Hi;
37456 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
37457 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
37458 MatchSizeInBits = Match.getValueSizeInBits();
37459 }
37460
37461 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
37462 MVT MaskSrcVT;
37463 if (64 == BitWidth || 32 == BitWidth)
37464 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
37465 MatchSizeInBits / BitWidth);
37466 else
37467 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
37468
37469 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
37470 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
37471 NumElts = MaskSrcVT.getVectorNumElements();
37472 }
37473 assert((NumElts <= 32 || NumElts == 64) &&
37474 "Not expecting more than 64 elements");
37475
37476 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
37477 if (BinOp == ISD::XOR) {
37478 // parity -> (AND (CTPOP(MOVMSK X)), 1)
37479 SDValue Mask = DAG.getConstant(1, DL, CmpVT);
37480 SDValue Result = DAG.getNode(ISD::CTPOP, DL, CmpVT, Movmsk);
37481 Result = DAG.getNode(ISD::AND, DL, CmpVT, Result, Mask);
37482 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
37483 }
37484
37485 SDValue CmpC;
37486 ISD::CondCode CondCode;
37487 if (BinOp == ISD::OR) {
37488 // any_of -> MOVMSK != 0
37489 CmpC = DAG.getConstant(0, DL, CmpVT);
37490 CondCode = ISD::CondCode::SETNE;
37491 } else {
37492 // all_of -> MOVMSK == ((1 << NumElts) - 1)
37493 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
37494 DL, CmpVT);
37495 CondCode = ISD::CondCode::SETEQ;
37496 }
37497
37498 // The setcc produces an i8 of 0/1, so extend that to the result width and
37499 // negate to get the final 0/-1 mask value.
37500 EVT SetccVT =
37501 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
37502 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
37503 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
37504 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
37505 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
37506}
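// [Editor's note - illustrative example, not part of the original source.]
// Concrete instance of the MOVMSK rewrites above: for a v4i32 boolean
// vector {-1, 0, -1, -1}, MOVMSKPS packs the sign bits into 0b1101 (13).
//   any_of -> (13 != 0)          = true
//   all_of -> (13 == 0b1111)     = false
//   parity -> (popcount(13) & 1) = 1
// For any_of/all_of the i8 setcc result is then zero-extended and negated
// to rebuild the expected all-zeros/all-ones scalar.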
37507
37508static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
37509 const X86Subtarget &Subtarget) {
37510 // PSADBW is only supported on SSE2 and up.
37511 if (!Subtarget.hasSSE2())
37512 return SDValue();
37513
37514 // Verify the type we're extracting from is any integer type above i16.
37515 EVT VT = Extract->getOperand(0).getValueType();
37516 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
37517 return SDValue();
37518
37519 unsigned RegSize = 128;
37520 if (Subtarget.useBWIRegs())
37521 RegSize = 512;
37522 else if (Subtarget.hasAVX())
37523 RegSize = 256;
37524
37525 // We handle up to v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
37526 // TODO: We should be able to handle larger vectors by splitting them before
37527 // feeding them into several SADs, and then reducing over those.
37528 if (RegSize / VT.getVectorNumElements() < 8)
37529 return SDValue();
37530
37531 // Match shuffle + add pyramid.
37532 ISD::NodeType BinOp;
37533 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
37534
37535 // The operand is expected to be zero extended from i8
37536 // (verified in detectZextAbsDiff).
37537 // In order to convert to i64 and above, additional any/zero/sign
37538 // extend is expected.
37539 // The zero extend from 32 bits has no mathematical effect on the result.
37540 // The sign extend is also effectively a zero extend
37541 // (it extends the sign bit, which is zero),
37542 // so it is correct to skip the sign/zero extend instruction.
37543 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
37544 Root.getOpcode() == ISD::ZERO_EXTEND ||
37545 Root.getOpcode() == ISD::ANY_EXTEND))
37546 Root = Root.getOperand(0);
37547
37548 // If there was a match, we want Root to be the ABS node at the root of an
37549 // abs-diff pattern.
37550 if (!Root || Root.getOpcode() != ISD::ABS)
37551 return SDValue();
37552
37553 // Check whether we have an abs-diff pattern feeding into the ABS node.
37554 SDValue Zext0, Zext1;
37555 if (!detectZextAbsDiff(Root, Zext0, Zext1))
37556 return SDValue();
37557
37558 // Create the SAD instruction.
37559 SDLoc DL(Extract);
37560 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
37561
37562 // If the original vector was wider than 8 elements, sum over the results
37563 // in the SAD vector.
37564 unsigned Stages = Log2_32(VT.getVectorNumElements());
37565 MVT SadVT = SAD.getSimpleValueType();
37566 if (Stages > 3) {
37567 unsigned SadElems = SadVT.getVectorNumElements();
37568
37569 for(unsigned i = Stages - 3; i > 0; --i) {
37570 SmallVector<int, 16> Mask(SadElems, -1);
37571 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
37572 Mask[j] = MaskEnd + j;
37573
37574 SDValue Shuffle =
37575 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
37576 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
37577 }
37578 }
37579
37580 MVT Type = Extract->getSimpleValueType(0);
37581 unsigned TypeSizeInBits = Type.getSizeInBits();
37582 // Return the lowest TypeSizeInBits bits.
37583 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
37584 SAD = DAG.getBitcast(ResVT, SAD);
37585 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
37586 Extract->getOperand(1));
37587}
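// [Editor's note - illustrative example, not part of the original source.]
// Example of the SAD reduction above: an add-reduction of 16 zero-extended
// absolute byte differences (v16i32). PSADBW produces v2i64 partial sums
// {S0, S1} for the low and high groups of 8 bytes; Stages = log2(16) = 4,
// so one shuffle+add iteration moves S1 down into lane 0 and adds it,
// leaving S0 + S1 where the final EXTRACT_VECTOR_ELT reads it.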
37588
37589// Attempt to peek through a target shuffle and extract the scalar from the
37590// source.
37591static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
37592 TargetLowering::DAGCombinerInfo &DCI,
37593 const X86Subtarget &Subtarget) {
37594 if (DCI.isBeforeLegalizeOps())
37595 return SDValue();
37596
37597 SDLoc dl(N);
37598 SDValue Src = N->getOperand(0);
37599 SDValue Idx = N->getOperand(1);
37600
37601 EVT VT = N->getValueType(0);
37602 EVT SrcVT = Src.getValueType();
37603 EVT SrcSVT = SrcVT.getVectorElementType();
37604 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37605
37606 // Don't attempt this for boolean mask vectors or unknown extraction indices.
37607 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
37608 return SDValue();
37609
37610 const APInt &IdxC = N->getConstantOperandAPInt(1);
37611 if (IdxC.uge(NumSrcElts))
37612 return SDValue();
37613
37614 SDValue SrcBC = peekThroughBitcasts(Src);
37615
37616 // Handle extract(bitcast(broadcast(scalar_value))).
37617 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
37618 SDValue SrcOp = SrcBC.getOperand(0);
37619 if (SrcOp.getValueSizeInBits() == VT.getSizeInBits())
37620 return DAG.getBitcast(VT, SrcOp);
37621
37622 EVT SrcOpVT = SrcOp.getValueType();
37623 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
37624 (SrcOpVT.getSizeInBits() % SrcSVT.getSizeInBits()) == 0) {
37625 unsigned Scale = SrcOpVT.getSizeInBits() / SrcSVT.getSizeInBits();
37626 unsigned Offset = IdxC.urem(Scale) * SrcSVT.getSizeInBits();
37627 // TODO support non-zero offsets.
37628 if (Offset == 0) {
37629 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
37630 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
37631 return SrcOp;
37632 }
37633 }
37634 }
37635
37636 // If we're extracting a single element from a broadcast load and there are
37637 // no other users, just create a single load.
37638 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
37639 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
37640 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
37641 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
37642 VT.getSizeInBits() == SrcBCWidth) {
37643 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
37644 MemIntr->getBasePtr(),
37645 MemIntr->getPointerInfo(),
37646 MemIntr->getAlignment(),
37647 MemIntr->getMemOperand()->getFlags());
37648 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
37649 return Load;
37650 }
37651 }
37652
37653 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
37654 // TODO: Move to DAGCombine?
37655 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
37656 SrcBC.getValueType().isInteger() &&
37657 (SrcBC.getScalarValueSizeInBits() % SrcSVT.getSizeInBits()) == 0 &&
37658 SrcBC.getScalarValueSizeInBits() ==
37659 SrcBC.getOperand(0).getValueSizeInBits()) {
37660 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcSVT.getSizeInBits();
37661 if (IdxC.ult(Scale)) {
37662 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
37663 SDValue Scl = SrcBC.getOperand(0);
37664 EVT SclVT = Scl.getValueType();
37665 if (Offset) {
37666 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
37667 DAG.getShiftAmountConstant(Offset, SclVT, dl));
37668 }
37669 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
37670 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
37671 return Scl;
37672 }
37673 }
37674
37675 // Handle extract(truncate(x)) for 0'th index.
37676 // TODO: Treat this as a faux shuffle?
37677 // TODO: When can we use this for general indices?
37678 if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() && IdxC == 0) {
37679 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
37680 Src = DAG.getBitcast(SrcVT, Src);
37681 return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx);
37682 }
37683
37684 // Resolve the target shuffle inputs and mask.
37685 SmallVector<int, 16> Mask;
37686 SmallVector<SDValue, 2> Ops;
37687 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
37688 return SDValue();
37689
37690 // Attempt to narrow/widen the shuffle mask to the correct size.
37691 if (Mask.size() != NumSrcElts) {
37692 if ((NumSrcElts % Mask.size()) == 0) {
37693 SmallVector<int, 16> ScaledMask;
37694 int Scale = NumSrcElts / Mask.size();
37695 scaleShuffleMask<int>(Scale, Mask, ScaledMask);
37696 Mask = std::move(ScaledMask);
37697 } else if ((Mask.size() % NumSrcElts) == 0) {
37698 // Simplify Mask based on demanded element.
37699 int ExtractIdx = (int)N->getConstantOperandVal(1);
37700 int Scale = Mask.size() / NumSrcElts;
37701 int Lo = Scale * ExtractIdx;
37702 int Hi = Scale * (ExtractIdx + 1);
37703 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
37704 if (i < Lo || Hi <= i)
37705 Mask[i] = SM_SentinelUndef;
37706
37707 SmallVector<int, 16> WidenedMask;
37708 while (Mask.size() > NumSrcElts &&
37709 canWidenShuffleElements(Mask, WidenedMask))
37710 Mask = std::move(WidenedMask);
37711 // TODO - investigate support for wider shuffle masks with known upper
37712 // undef/zero elements for implicit zero-extension.
37713 }
37714 }
37715
37716 // Check if narrowing/widening failed.
37717 if (Mask.size() != NumSrcElts)
37718 return SDValue();
37719
37720 int SrcIdx = Mask[IdxC.getZExtValue()];
37721
37722 // If the shuffle source element is undef/zero then we can just accept it.
37723 if (SrcIdx == SM_SentinelUndef)
37724 return DAG.getUNDEF(VT);
37725
37726 if (SrcIdx == SM_SentinelZero)
37727 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
37728 : DAG.getConstant(0, dl, VT);
37729
37730 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
37731 SrcIdx = SrcIdx % Mask.size();
37732
37733 // We can only extract other elements from 128-bit vectors and in certain
37734 // circumstances, depending on SSE-level.
37735 // TODO: Investigate using extract_subvector for larger vectors.
37736 // TODO: Investigate float/double extraction if it will be just stored.
37737 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
37738 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
37739 assert(SrcSVT == VT && "Unexpected extraction type");
37740 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
37741 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
37742 DAG.getIntPtrConstant(SrcIdx, dl));
37743 }
37744
37745 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
37746 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
37747 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
37748 "Unexpected extraction type");
37749 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
37750 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
37751 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
37752 DAG.getIntPtrConstant(SrcIdx, dl));
37753 return DAG.getZExtOrTrunc(ExtOp, dl, VT);
37754 }
37755
37756 return SDValue();
37757}
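// [Editor's note - illustrative example, not part of the original source.]
// Example of peeking through a shuffle as above:
//   extractelement (shufflevector X, Y, <4, 5, 6, 7>), 1
// The mask maps index 1 to source element 5, i.e. element 1 of Y, so the
// extract is taken directly from Y (via PEXTRW/PEXTRB or a plain
// EXTRACT_VECTOR_ELT, depending on the element type and SSE level).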
37758
37759/// Extracting a scalar FP value from vector element 0 is free, so extract each
37760/// operand first, then perform the math as a scalar op.
37761static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
37762 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
37763 SDValue Vec = ExtElt->getOperand(0);
37764 SDValue Index = ExtElt->getOperand(1);
37765 EVT VT = ExtElt->getValueType(0);
37766 EVT VecVT = Vec.getValueType();
37767
37768 // TODO: If this is a unary/expensive/expand op, allow extraction from a
37769 // non-zero element because the shuffle+scalar op will be cheaper?
37770 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
37771 return SDValue();
37772
37773 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
37774 // extract, the condition code), so deal with those as a special-case.
37775 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
37776 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
37777 if (OpVT != MVT::f32 && OpVT != MVT::f64)
37778 return SDValue();
37779
37780 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
37781 SDLoc DL(ExtElt);
37782 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
37783 Vec.getOperand(0), Index);
37784 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
37785 Vec.getOperand(1), Index);
37786 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
37787 }
37788
37789 if (VT != MVT::f32 && VT != MVT::f64)
37790 return SDValue();
37791
37792 // Vector FP selects don't fit the pattern of FP math ops (because the
37793 // condition has a different type and we have to change the opcode), so deal
37794 // with those here.
37795 // FIXME: This is restricted to pre type legalization by ensuring the setcc
37796 // has i1 elements. If we loosen this we need to convert vector bool to a
37797 // scalar bool.
37798 if (Vec.getOpcode() == ISD::VSELECT &&
37799 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
37800 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
37801 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
37802 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
37803 SDLoc DL(ExtElt);
37804 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
37805 Vec.getOperand(0).getValueType().getScalarType(),
37806 Vec.getOperand(0), Index);
37807 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
37808 Vec.getOperand(1), Index);
37809 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
37810 Vec.getOperand(2), Index);
37811 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
37812 }
37813
37814 // TODO: This switch could include FNEG and the x86-specific FP logic ops
37815 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
37816 // missed load folding and fma+fneg combining.
37817 switch (Vec.getOpcode()) {
37818 case ISD::FMA: // Begin 3 operands
37819 case ISD::FMAD:
37820 case ISD::FADD: // Begin 2 operands
37821 case ISD::FSUB:
37822 case ISD::FMUL:
37823 case ISD::FDIV:
37824 case ISD::FREM:
37825 case ISD::FCOPYSIGN:
37826 case ISD::FMINNUM:
37827 case ISD::FMAXNUM:
37828 case ISD::FMINNUM_IEEE:
37829 case ISD::FMAXNUM_IEEE:
37830 case ISD::FMAXIMUM:
37831 case ISD::FMINIMUM:
37832 case X86ISD::FMAX:
37833 case X86ISD::FMIN:
37834 case ISD::FABS: // Begin 1 operand
37835 case ISD::FSQRT:
37836 case ISD::FRINT:
37837 case ISD::FCEIL:
37838 case ISD::FTRUNC:
37839 case ISD::FNEARBYINT:
37840 case ISD::FROUND:
37841 case ISD::FFLOOR:
37842 case X86ISD::FRCP:
37843 case X86ISD::FRSQRT: {
37844 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
37845 SDLoc DL(ExtElt);
37846 SmallVector<SDValue, 4> ExtOps;
37847 for (SDValue Op : Vec->ops())
37848 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
37849 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
37850 }
37851 default:
37852 return SDValue();
37853 }
37854 llvm_unreachable("All opcodes should return within switch");
37855}
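// [Editor's note - illustrative example, not part of the original source.]
// Example of the scalarization above:
//   extractelement (fadd <4 x float> X, Y), 0
//     --> fadd float (extractelement X, 0), (extractelement Y, 0)
// Lane 0 of an XMM register is already a usable scalar, so both extracts
// are free and the vector FADD becomes a scalar ADDSS.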
37856
37857/// Try to convert a vector reduction sequence composed of binops and shuffles
37858/// into horizontal ops.
37859static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
37860 const X86Subtarget &Subtarget) {
37861 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
37862
37863 // We need at least SSE2 to do anything here.
37864 if (!Subtarget.hasSSE2())
37865 return SDValue();
37866
37867 ISD::NodeType Opc;
37868 SDValue Rdx =
37869 DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true);
37870 if (!Rdx)
37871 return SDValue();
37872
37873 SDValue Index = ExtElt->getOperand(1);
37874 assert(isNullConstant(Index) &&
37875 "Reduction doesn't end in an extract from index 0");
37876
37877 EVT VT = ExtElt->getValueType(0);
37878 EVT VecVT = Rdx.getValueType();
37879 if (VecVT.getScalarType() != VT)
37880 return SDValue();
37881
37882 SDLoc DL(ExtElt);
37883
37884 // vXi8 reduction - sub-128-bit vector.
37885 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
37886 if (VecVT == MVT::v4i8) {
37887 // Pad with zero.
37888 if (Subtarget.hasSSE41()) {
37889 Rdx = DAG.getBitcast(MVT::i32, Rdx);
37890 Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
37891 DAG.getConstant(0, DL, MVT::v4i32), Rdx,
37892 DAG.getIntPtrConstant(0, DL));
37893 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
37894 } else {
37895 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
37896 DAG.getConstant(0, DL, VecVT));
37897 }
37898 }
37899 if (Rdx.getValueType() == MVT::v8i8) {
37900 // Pad with undef.
37901 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
37902 DAG.getUNDEF(MVT::v8i8));
37903 }
37904 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
37905 DAG.getConstant(0, DL, MVT::v16i8));
37906 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
37907 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
37908 }
37909
37910 // Must be a >=128-bit vector with pow2 elements.
37911 if ((VecVT.getSizeInBits() % 128) != 0 ||
37912 !isPowerOf2_32(VecVT.getVectorNumElements()))
37913 return SDValue();
37914
37915 // vXi8 reduction - sum lo/hi halves then use PSADBW.
37916 if (VT == MVT::i8) {
37917 while (Rdx.getValueSizeInBits() > 128) {
37918 unsigned HalfSize = VecVT.getSizeInBits() / 2;
37919 unsigned HalfElts = VecVT.getVectorNumElements() / 2;
37920 SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize);
37921 SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize);
37922 Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi);
37923 VecVT = Rdx.getValueType();
37924 }
37925 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
37926
37927 SDValue Hi = DAG.getVectorShuffle(
37928 MVT::v16i8, DL, Rdx, Rdx,
37929 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
37930 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
37931 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
37932 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
37933 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
37934 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
37935 }
37936
37937 // Only use (F)HADD opcodes if they aren't microcoded or when minimizing code size.
37938 bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
37939 if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
37940 return SDValue();
37941
37942 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
37943
37944 // 256-bit horizontal instructions operate on 128-bit chunks rather than
37945 // across the whole vector, so we need an extract + hop preliminary stage.
37946 // This is the only step where the operands of the hop are not the same value.
37947 // TODO: We could extend this to handle 512-bit or even longer vectors.
37948 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
37949 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
37950 unsigned NumElts = VecVT.getVectorNumElements();
37951 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
37952 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
37953 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
37954 VecVT = Rdx.getValueType();
37955 }
37956 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
37957 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
37958 return SDValue();
37959
37960 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
37961 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
37962 for (unsigned i = 0; i != ReductionSteps; ++i)
37963 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
37964
37965 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
37966}
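// [Editor's note - illustrative example, not part of the original source.]
// Example of the HADD loop above for a v4i32 sum of {1, 2, 3, 4}:
//   step 1: PHADDD(X, X) = {1+2, 3+4, 1+2, 3+4} = {3, 7, 3, 7}
//   step 2: PHADDD applied to that result again = {10, 10, 10, 10}
// log2(4) = 2 steps leave the full sum in every lane, and element 0 is
// then extracted as the scalar result.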
37967
37968/// Detect vector gather/scatter index generation and convert it from being a
37969/// bunch of shuffles and extracts into a somewhat faster sequence.
37970/// For i686, the best sequence is apparently storing the value and loading
37971/// scalars back, while for x64 we should use 64-bit extracts and shifts.
37972static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
37973 TargetLowering::DAGCombinerInfo &DCI,
37974 const X86Subtarget &Subtarget) {
37975 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
37976 return NewOp;
37977
37978 SDValue InputVector = N->getOperand(0);
37979 SDValue EltIdx = N->getOperand(1);
37980 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
37981
37982 EVT SrcVT = InputVector.getValueType();
37983 EVT VT = N->getValueType(0);
37984 SDLoc dl(InputVector);
37985 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
37986 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37987
37988 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
37989 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
37990
37991 // Integer Constant Folding.
37992 if (CIdx && VT.isInteger()) {
37993 APInt UndefVecElts;
37994 SmallVector<APInt, 16> EltBits;
37995 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
37996 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
37997 EltBits, true, false)) {
37998 uint64_t Idx = CIdx->getZExtValue();
37999 if (UndefVecElts[Idx])
38000 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
38001 return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
38002 dl, VT);
38003 }
38004 }
38005
38006 if (IsPextr) {
38007 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38008 if (TLI.SimplifyDemandedBits(
38009 SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
38010 return SDValue(N, 0);
38011
38012 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
38013 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
38014 InputVector.getOpcode() == X86ISD::PINSRW) &&
38015 InputVector.getOperand(2) == EltIdx) {
38016 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
38017 "Vector type mismatch");
38018 SDValue Scl = InputVector.getOperand(1);
38019 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
38020 return DAG.getZExtOrTrunc(Scl, dl, VT);
38021 }
38022
38023 // TODO - Remove this once we can handle the implicit zero-extension of
38024 // X86ISD::PEXTRW/X86ISD::PEXTRB in combineHorizontalPredicateResult and
38025 // combineBasicSADPattern.
38026 return SDValue();
38027 }
38028
38029 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
38030 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
38031 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
38032 SDValue MMXSrc = InputVector.getOperand(0);
38033
38034 // The bitcast source is a direct mmx result.
38035 if (MMXSrc.getValueType() == MVT::x86mmx)
38036 return DAG.getBitcast(VT, InputVector);
38037 }
38038
38039 // Detect mmx to i32 conversion through a v2i32 elt extract.
38040 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
38041 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
38042 SDValue MMXSrc = InputVector.getOperand(0);
38043
38044 // The bitcast source is a direct mmx result.
38045 if (MMXSrc.getValueType() == MVT::x86mmx)
38046 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
38047 }
38048
38049 // Check whether this extract is the root of a sum of absolute differences
38050 // pattern. This has to be done here because we really want it to happen
38051 // pre-legalization.
38052 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
38053 return SAD;
38054
38055 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
38056 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
38057 return Cmp;
38058
38059 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
38060 if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
38061 return MinMax;
38062
38063 if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
38064 return V;
38065
38066 if (SDValue V = scalarizeExtEltFP(N, DAG))
38067 return V;
38068
38069 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
38070 // and then testing the relevant element.
38071 if (CIdx && SrcVT.getScalarType() == MVT::i1) {
38072 SmallVector<SDNode *, 16> BoolExtracts;
38073 auto IsBoolExtract = [&BoolExtracts](SDNode *Use) {
38074 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
38075 isa<ConstantSDNode>(Use->getOperand(1)) &&
38076 Use->getValueType(0) == MVT::i1) {
38077 BoolExtracts.push_back(Use);
38078 return true;
38079 }
38080 return false;
38081 };
38082 if (all_of(InputVector->uses(), IsBoolExtract) &&
38083 BoolExtracts.size() > 1) {
38084 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
38085 if (SDValue BC =
38086 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
38087 for (SDNode *Use : BoolExtracts) {
38088 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
38089 unsigned MaskIdx = Use->getConstantOperandVal(1);
38090 APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
38091 SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
38092 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
38093 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
38094 DCI.CombineTo(Use, Res);
38095 }
38096 return SDValue(N, 0);
38097 }
38098 }
38099 }
38100
38101 return SDValue();
38102}
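// [Editor's note - illustrative example, not part of the original source.]
// Example of the i1-extract rewrite above: when several lanes of a
// <8 x i1> value X are extracted, one MOVMSK gathers all the sign bits and
//   extractelement <8 x i1> X, 2
// becomes ((movmsk X) & 0b100) == 0b100, with the single MOVMSK shared by
// every extract of X.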
38103
38104/// If a vector select has an operand that is -1 or 0, try to simplify the
38105/// select to a bitwise logic operation.
38106/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
38107static SDValue
38108combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
38109 TargetLowering::DAGCombinerInfo &DCI,
38110 const X86Subtarget &Subtarget) {
38111 SDValue Cond = N->getOperand(0);
38112 SDValue LHS = N->getOperand(1);
38113 SDValue RHS = N->getOperand(2);
38114 EVT VT = LHS.getValueType();
38115 EVT CondVT = Cond.getValueType();
38116 SDLoc DL(N);
38117 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38118
38119 if (N->getOpcode() != ISD::VSELECT)
38120 return SDValue();
38121
38122 assert(CondVT.isVector() && "Vector select expects a vector selector!");
38123
38124 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
38125 // TODO: Can we assert that both operands are not zeros (because that should
38126 // get simplified at node creation time)?
38127 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
38128 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
38129
38130 // If both inputs are 0/undef, create a complete zero vector.
38131 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
38132 if (TValIsAllZeros && FValIsAllZeros) {
38133 if (VT.isFloatingPoint())
38134 return DAG.getConstantFP(0.0, DL, VT);
38135 return DAG.getConstant(0, DL, VT);
38136 }
38137
38138 // To use the condition operand as a bitwise mask, it must have elements that
38139 // are the same size as the select elements. I.e., the condition operand must
38140 // have already been promoted from the IR select condition type <N x i1>.
38141 // Don't check if the types themselves are equal because that excludes
38142 // vector floating-point selects.
38143 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
38144 return SDValue();
38145
38146 // Try to invert the condition if true value is not all 1s and false value is
38147 // not all 0s. Only do this if the condition has one use.
38148 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
38149 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
38150 // Check if the selector will be produced by CMPP*/PCMP*.
38151 Cond.getOpcode() == ISD::SETCC &&
38152 // Check if SETCC has already been promoted.
38153 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
38154 CondVT) {
38155 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
38156
38157 if (TValIsAllZeros || FValIsAllOnes) {
38158 SDValue CC = Cond.getOperand(2);
38159 ISD::CondCode NewCC = ISD::getSetCCInverse(
38160 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
38161 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
38162 NewCC);
38163 std::swap(LHS, RHS);
38164 TValIsAllOnes = FValIsAllOnes;
38165 FValIsAllZeros = TValIsAllZeros;
38166 }
38167 }
38168
38169 // Cond value must be 'sign splat' to be converted to a logical op.
38170 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
38171 return SDValue();
38172
38173 // vselect Cond, 111..., 000... -> Cond
38174 if (TValIsAllOnes && FValIsAllZeros)
38175 return DAG.getBitcast(VT, Cond);
38176
38177 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
38178 return SDValue();
38179
38180 // vselect Cond, 111..., X -> or Cond, X
38181 if (TValIsAllOnes) {
38182 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
38183 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
38184 return DAG.getBitcast(VT, Or);
38185 }
38186
38187 // vselect Cond, X, 000... -> and Cond, X
38188 if (FValIsAllZeros) {
38189 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
38190 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
38191 return DAG.getBitcast(VT, And);
38192 }
38193
38194 // vselect Cond, 000..., X -> andn Cond, X
38195 if (TValIsAllZeros) {
38196 MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
38197 SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
38198 SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
38199 SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
38200 return DAG.getBitcast(VT, AndN);
38201 }
38202
38203 return SDValue();
38204}
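// [Editor's note - illustrative example, not part of the original source.]
// Per-lane check of the rewrites above, for a sign-splat condition C
// (each lane is 0 or -1):
//   vselect C, -1, X : gives -1 when C == -1, X when C == 0  ==  or(C, X)
//   vselect C,  X, 0 : gives  X when C == -1, 0 when C == 0  == and(C, X)
//   vselect C,  0, X : gives  0 when C == -1, X when C == 0  == andn(C, X)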
38205
38206/// If both arms of a vector select are concatenated vectors, split the select,
38207/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
38208/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
38209/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
38210static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
38211 const X86Subtarget &Subtarget) {
38212 unsigned Opcode = N->getOpcode();
38213 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
38214 return SDValue();
38215
38216 // TODO: Split 512-bit vectors too?
38217 EVT VT = N->getValueType(0);
38218 if (!VT.is256BitVector())
38219 return SDValue();
38220
38221 // TODO: Split as long as any 2 of the 3 operands are concatenated?
38222 SDValue Cond = N->getOperand(0);
38223 SDValue TVal = N->getOperand(1);
38224 SDValue FVal = N->getOperand(2);
38225 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
38226 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
38227 !collectConcatOps(TVal.getNode(), CatOpsT) ||
38228 !collectConcatOps(FVal.getNode(), CatOpsF))
38229 return SDValue();
38230
38231 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
38232 ArrayRef<SDValue> Ops) {
38233 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
38234 };
38235 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
38236 makeBlend, /*CheckBWI*/ false);
38237}
38238
38239static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
38240 SDValue Cond = N->getOperand(0);
38241 SDValue LHS = N->getOperand(1);
38242 SDValue RHS = N->getOperand(2);
38243 SDLoc DL(N);
38244
38245 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
38246 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
38247 if (!TrueC || !FalseC)
38248 return SDValue();
38249
38250 // Don't do this for crazy integer types.
38251 EVT VT = N->getValueType(0);
38252 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
38253 return SDValue();
38254
38255 // We're going to use the condition bit in math or logic ops. We could allow
38256 // this with a wider condition value (post-legalization it becomes an i8),
38257 // but if nothing is creating selects that late, it doesn't matter.
38258 if (Cond.getValueType() != MVT::i1)
38259 return SDValue();
38260
38261 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
38262 // 3, 5, or 9 with i32/i64, so those get transformed too.
38263 // TODO: For constants that overflow or do not differ by power-of-2 or small
38264 // multiplier, convert to 'and' + 'add'.
38265 const APInt &TrueVal = TrueC->getAPIntValue();
38266 const APInt &FalseVal = FalseC->getAPIntValue();
38267 bool OV;
38268 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
38269 if (OV)
38270 return SDValue();
38271
38272 APInt AbsDiff = Diff.abs();
38273 if (AbsDiff.isPowerOf2() ||
38274 ((VT == MVT::i32 || VT == MVT::i64) &&
38275 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
38276
38277 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
38278 // of the condition can usually be folded into a compare predicate, but even
38279 // without that, the sequence should be cheaper than a CMOV alternative.
38280 if (TrueVal.slt(FalseVal)) {
38281 Cond = DAG.getNOT(DL, Cond, MVT::i1);
38282 std::swap(TrueC, FalseC);
38283 }
38284
38285 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
38286 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
38287
38288 // Multiply condition by the difference if non-one.
38289 if (!AbsDiff.isOneValue())
38290 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
38291
38292 // Add the base if non-zero.
38293 if (!FalseC->isNullValue())
38294 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
38295
38296 return R;
38297 }
38298
38299 return SDValue();
38300}
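// [Editor's note - illustrative example, not part of the original source.]
// Worked instance of the transform above, assuming the difference does not
// overflow (the ssub_ov check): with TC = 9 and FC = 5,
//   select Cond, 9, 5  ==  zext(Cond) * (9 - 5) + 5  ==  (Cond ? 1 : 0) * 4 + 5
// If TC < FC the condition is inverted first so the multiplier stays
// positive; a power-of-2 multiplier becomes a shift, and 3/5/9 become LEA.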
38301
38302/// If this is a *dynamic* select (non-constant condition) and we can match
38303/// this node with one of the variable blend instructions, restructure the
38304/// condition so that blends can use the high (sign) bit of each element.
38305/// This function will also call SimplifyDemandedBits on already created
38306/// BLENDV to perform additional simplifications.
38307static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
38308 TargetLowering::DAGCombinerInfo &DCI,
38309 const X86Subtarget &Subtarget) {
38310 SDValue Cond = N->getOperand(0);
38311 if ((N->getOpcode() != ISD::VSELECT &&
38312 N->getOpcode() != X86ISD::BLENDV) ||
38313 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
38314 return SDValue();
38315
38316 // Don't optimize before the condition has been transformed to a legal type
38317 // and don't ever optimize vector selects that map to AVX512 mask-registers.
38318 unsigned BitWidth = Cond.getScalarValueSizeInBits();
38319 if (BitWidth < 8 || BitWidth > 64)
38320 return SDValue();
38321
38322 // We can only handle the cases where VSELECT is directly legal on the
38323 // subtarget. We custom lower VSELECT nodes with constant conditions and
38324 // this makes it hard to see whether a dynamic VSELECT will correctly
38325 // lower, so we both check the operation's status and explicitly handle the
38326 // cases where a *dynamic* blend will fail even though a constant-condition
38327 // blend could be custom lowered.
38328 // FIXME: We should find a better way to handle this class of problems.
38329 // Potentially, we should combine constant-condition vselect nodes
38330 // pre-legalization into shuffles and not mark as many types as custom
38331 // lowered.
38332 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38333 EVT VT = N->getValueType(0);
38334 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
38335 return SDValue();
38336 // FIXME: We don't support i16-element blends currently. We could and
38337 // should support them by making *all* the bits in the condition be set
38338 // rather than just the high bit and using an i8-element blend.
38339 if (VT.getVectorElementType() == MVT::i16)
38340 return SDValue();
38341 // Dynamic blending was only available from SSE4.1 onward.
38342 if (VT.is128BitVector() && !Subtarget.hasSSE41())
38343 return SDValue();
38344 // Byte blends are only available in AVX2.
38345 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
38346 return SDValue();
38347 // There are no 512-bit blend instructions that use sign bits.
38348 if (VT.is512BitVector())
38349 return SDValue();
38350
38351 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
38352 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
38353 UI != UE; ++UI)
38354 if ((UI->getOpcode() != ISD::VSELECT &&
38355 UI->getOpcode() != X86ISD::BLENDV) ||
38356 UI.getOperandNo() != 0)
38357 return false;
38358
38359 return true;
38360 };
38361
38362 if (OnlyUsedAsSelectCond(Cond)) {
38363 APInt DemandedMask(APInt::getSignMask(BitWidth));
38364 KnownBits Known;
38365 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
38366 !DCI.isBeforeLegalizeOps());
38367 if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
38368 return SDValue();
38369
38370 // If we changed the computation somewhere in the DAG, this change will
38371 // affect all users of Cond. Update all the nodes so that we do not use
38372 // the generic VSELECT anymore. Otherwise, we may perform wrong
38373 // optimizations as we messed with the actual expectation for the vector
38374 // boolean values.
38375 for (SDNode *U : Cond->uses()) {
38376 if (U->getOpcode() == X86ISD::BLENDV)
38377 continue;
38378
38379 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
38380 Cond, U->getOperand(1), U->getOperand(2));
38381 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
38382 DCI.AddToWorklist(U);
38383 }
38384 DCI.CommitTargetLoweringOpt(TLO);
38385 return SDValue(N, 0);
38386 }
38387
38388 // Otherwise we can still at least try to simplify multiple use bits.
38389 APInt DemandedBits(APInt::getSignMask(BitWidth));
38390 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
38391 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
38392 N->getOperand(1), N->getOperand(2));
38393
38394 return SDValue();
38395}
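// [Editor's note - illustrative example, not part of the original source.]
// The DemandedMask above is only the sign bit of each condition lane,
// because (P)BLENDV selects purely on that bit. Work feeding the condition
// that only affects the lower bits of a lane can therefore be simplified
// away, and every VSELECT user of the condition is rewritten to
// X86ISD::BLENDV so all users agree on this weaker sign-bit-only contract.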
38396
38397// Try to match:
38398// (or (and (M, (sub 0, X)), (pandn M, X)))
38399// which is a special case of:
38400// (select M, (sub 0, X), X)
38401// Per:
38402// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
38403// We know that, if fNegate is 0 or 1:
38404// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
38405//
38406// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
38407// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
38408// ( M ? -X : X) == ((X ^ M ) + (M & 1))
38409// This lets us transform our vselect to:
38410// (add (xor X, M), (and M, 1))
38411// And further to:
38412// (sub (xor X, M), M)
38413static SDValue combineLogicBlendIntoConditionalNegate(
38414 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
38415 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
38416 EVT MaskVT = Mask.getValueType();
38417 assert(MaskVT.isInteger() &&
38418 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
38419 "Mask must be zero/all-bits");
38420
38421 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
38422 return SDValue();
38423 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
38424 return SDValue();
38425
38426 auto IsNegV = [](SDNode *N, SDValue V) {
38427 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
38428 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
38429 };
38430
38431 SDValue V;
38432 if (IsNegV(Y.getNode(), X))
38433 V = X;
38434 else if (IsNegV(X.getNode(), Y))
38435 V = Y;
38436 else
38437 return SDValue();
38438
38439 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
38440 SDValue SubOp2 = Mask;
38441
38442 // If the negate was on the false side of the select, then
38443 // the operands of the SUB need to be swapped. PR 27251.
38444 // This is because the pattern being matched above is
38445 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
38446 // but if the pattern matched was
38447 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
38448 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
38449 // pattern also needs to be a negation of the replacement pattern above.
38450 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
38451 // sub accomplishes the negation of the replacement pattern.
38452 if (V == Y)
38453 std::swap(SubOp1, SubOp2);
38454
38455 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
38456 return DAG.getBitcast(VT, Res);
38457}
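// [Editor's note - illustrative example, not part of the original source.]
// Per-lane check of the identity used above, with the mask M either 0 or -1:
//   M ==  0: (X ^ 0) - 0    == X             (no negation)
//   M == -1: (X ^ -1) - -1  == ~X + 1 == -X  (two's complement negate)
// Since -M == (M & 1) for M in {0, -1}, the (sub (xor X, M), M) form is the
// same as the (add (xor X, M), (and M, 1)) form quoted in the comment above.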
38458
38459/// Do target-specific dag combines on SELECT and VSELECT nodes.
38460static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
38461 TargetLowering::DAGCombinerInfo &DCI,
38462 const X86Subtarget &Subtarget) {
38463 SDLoc DL(N);
38464 SDValue Cond = N->getOperand(0);
38465 SDValue LHS = N->getOperand(1);
38466 SDValue RHS = N->getOperand(2);
38467
38468 // Try simplification again because we use this function to optimize
38469 // BLENDV nodes that are not handled by the generic combiner.
38470 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
38471 return V;
38472
38473 EVT VT = LHS.getValueType();
38474 EVT CondVT = Cond.getValueType();
38475 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38476 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
38477
38478 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
38479 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
38480 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
38481 if (CondVT.isVector() && CondVT.isInteger() &&
38482 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
38483 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
38484 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
38485 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
38486 DL, DAG, Subtarget))
38487 return V;
38488
38489 // Convert vselects with constant condition into shuffles.
38490 if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
38491 SmallVector<int, 64> Mask;
38492 if (createShuffleMaskFromVSELECT(Mask, Cond))
38493 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
38494 }
38495
38496 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
38497 // instructions match the semantics of the common C idiom x<y?x:y but not
38498 // x<=y?x:y, because of how they handle negative zero (which can be
38499 // ignored in unsafe-math mode).
38500 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
38501 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
38502 VT != MVT::f80 && VT != MVT::f128 &&
38503 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
38504 (Subtarget.hasSSE2() ||
38505 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
38506 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
38507
38508 unsigned Opcode = 0;
38509 // Check for x CC y ? x : y.
38510 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
38511 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
38512 switch (CC) {
38513 default: break;
38514 case ISD::SETULT:
38515 // Converting this to a min would handle NaNs incorrectly, and swapping
38516 // the operands would cause it to handle comparisons between positive
38517 // and negative zero incorrectly.
38518 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
38519 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
38520 !(DAG.isKnownNeverZeroFloat(LHS) ||
38521 DAG.isKnownNeverZeroFloat(RHS)))
38522 break;
38523 std::swap(LHS, RHS);
38524 }
38525 Opcode = X86ISD::FMIN;
38526 break;
38527 case ISD::SETOLE:
38528 // Converting this to a min would handle comparisons between positive
38529 // and negative zero incorrectly.
38530 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
38531 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
38532 break;
38533 Opcode = X86ISD::FMIN;
38534 break;
38535 case ISD::SETULE:
38536 // Converting this to a min would handle both negative zeros and NaNs
38537 // incorrectly, but we can swap the operands to fix both.
38538 std::swap(LHS, RHS);
38539 LLVM_FALLTHROUGH;
38540 case ISD::SETOLT:
38541 case ISD::SETLT:
38542 case ISD::SETLE:
38543 Opcode = X86ISD::FMIN;
38544 break;
38545
38546 case ISD::SETOGE:
38547 // Converting this to a max would handle comparisons between positive
38548 // and negative zero incorrectly.
38549 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
38550 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
38551 break;
38552 Opcode = X86ISD::FMAX;
38553 break;
38554 case ISD::SETUGT:
38555 // Converting this to a max would handle NaNs incorrectly, and swapping
38556 // the operands would cause it to handle comparisons between positive
38557 // and negative zero incorrectly.
38558 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
38559 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
38560 !(DAG.isKnownNeverZeroFloat(LHS) ||
38561 DAG.isKnownNeverZeroFloat(RHS)))
38562 break;
38563 std::swap(LHS, RHS);
38564 }
38565 Opcode = X86ISD::FMAX;
38566 break;
38567 case ISD::SETUGE:
38568 // Converting this to a max would handle both negative zeros and NaNs
38569 // incorrectly, but we can swap the operands to fix both.
38570 std::swap(LHS, RHS);
38571 LLVM_FALLTHROUGH;
38572 case ISD::SETOGT:
38573 case ISD::SETGT:
38574 case ISD::SETGE:
38575 Opcode = X86ISD::FMAX;
38576 break;
38577 }
38578 // Check for x CC y ? y : x -- a min/max with reversed arms.
38579 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
38580 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
38581 switch (CC) {
38582 default: break;
38583 case ISD::SETOGE:
38584 // Converting this to a min would handle comparisons between positive
38585 // and negative zero incorrectly, and swapping the operands would
38586 // cause it to handle NaNs incorrectly.
38587 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
38588 !(DAG.isKnownNeverZeroFloat(LHS) ||
38589 DAG.isKnownNeverZeroFloat(RHS))) {
38590 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
38591 break;
38592 std::swap(LHS, RHS);
38593 }
38594 Opcode = X86ISD::FMIN;
38595 break;
38596 case ISD::SETUGT:
38597 // Converting this to a min would handle NaNs incorrectly.
38598 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
38599 break;
38600 Opcode = X86ISD::FMIN;
38601 break;
38602 case ISD::SETUGE:
38603 // Converting this to a min would handle both negative zeros and NaNs
38604 // incorrectly, but we can swap the operands to fix both.
38605 std::swap(LHS, RHS);
38606 LLVM_FALLTHROUGH;
38607 case ISD::SETOGT:
38608 case ISD::SETGT:
38609 case ISD::SETGE:
38610 Opcode = X86ISD::FMIN;
38611 break;
38612
38613 case ISD::SETULT:
38614 // Converting this to a max would handle NaNs incorrectly.
38615 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
38616 break;
38617 Opcode = X86ISD::FMAX;
38618 break;
38619 case ISD::SETOLE:
38620 // Converting this to a max would handle comparisons between positive
38621 // and negative zero incorrectly, and swapping the operands would
38622 // cause it to handle NaNs incorrectly.
38623 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
38624 !DAG.isKnownNeverZeroFloat(LHS) &&
38625 !DAG.isKnownNeverZeroFloat(RHS)) {
38626 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
38627 break;
38628 std::swap(LHS, RHS);
38629 }
38630 Opcode = X86ISD::FMAX;
38631 break;
38632 case ISD::SETULE:
38633 // Converting this to a max would handle both negative zeros and NaNs
38634 // incorrectly, but we can swap the operands to fix both.
38635 std::swap(LHS, RHS);
38636 LLVM_FALLTHROUGH;
38637 case ISD::SETOLT:
38638 case ISD::SETLT:
38639 case ISD::SETLE:
38640 Opcode = X86ISD::FMAX;
38641 break;
38642 }
38643 }
38644
38645 if (Opcode)
38646 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
38647 }
38648
38649 // Some mask scalar intrinsics rely on checking if only one bit is set
38650 // and implement it in C code like this:
38651 // A[0] = (U & 1) ? A[0] : W[0];
38652 // This creates some redundant instructions that break pattern matching.
38653 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
38654 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
38655 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
38656 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
38657 SDValue AndNode = Cond.getOperand(0);
38658 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
38659 isNullConstant(Cond.getOperand(1)) &&
38660 isOneConstant(AndNode.getOperand(1))) {
38661 // LHS and RHS swapped due to
38662 // setcc outputting 1 when AND resulted in 0 and vice versa.
38663 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
38664 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
38665 }
38666 }
38667
38668 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
38669 // lowering on KNL. In this case we convert it to
38670 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
38671 // The same situation applies to all vectors of i8 and i16 without BWI.
38672 // Make sure we extend these even before type legalization gets a chance to
38673 // split wide vectors.
38674 // Since SKX, these selects have a proper lowering.
38675 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
38676 CondVT.getVectorElementType() == MVT::i1 &&
38677 (VT.getVectorElementType() == MVT::i8 ||
38678 VT.getVectorElementType() == MVT::i16)) {
38679 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
38680 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
38681 }
38682
38683 // AVX512 - Extend select with zero to merge with target shuffle.
38684 // select(mask, extract_subvector(shuffle(x)), zero) -->
38685 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
38686 // TODO - support non target shuffles as well.
38687 if (Subtarget.hasAVX512() && CondVT.isVector() &&
38688 CondVT.getVectorElementType() == MVT::i1) {
38689 auto SelectableOp = [&TLI](SDValue Op) {
38690 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
38691 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
38692 isNullConstant(Op.getOperand(1)) &&
38693 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
38694 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
38695 };
38696
38697 bool SelectableLHS = SelectableOp(LHS);
38698 bool SelectableRHS = SelectableOp(RHS);
38699 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
38700 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
38701
38702 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
38703 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
38704 : RHS.getOperand(0).getValueType();
38705 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38706 EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts);
38707 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
38708 VT.getSizeInBits());
38709 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
38710 VT.getSizeInBits());
38711 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
38712 DAG.getUNDEF(SrcCondVT), Cond,
38713 DAG.getIntPtrConstant(0, DL));
38714 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
38715 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
38716 }
38717 }
38718
38719 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
38720 return V;
38721
38722 // Canonicalize max and min:
38723 // (x > y) ? x : y -> (x >= y) ? x : y
38724 // (x < y) ? x : y -> (x <= y) ? x : y
38725 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
38726 // the need for an extra compare
38727 // against zero. e.g.
38728 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
38729 // subl %esi, %edi
38730 // testl %edi, %edi
38731 // movl $0, %eax
38732 // cmovgl %edi, %eax
38733 // =>
38734 // xorl %eax, %eax
38735 // subl %esi, %edi
38736 // cmovsl %eax, %edi
38737 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
38738 Cond.hasOneUse() &&
38739 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
38740 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
38741 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
38742 switch (CC) {
38743 default: break;
38744 case ISD::SETLT:
38745 case ISD::SETGT: {
38746 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
38747 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
38748 Cond.getOperand(0), Cond.getOperand(1), NewCC);
38749 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
38750 }
38751 }
38752 }
38753
38754 // Match VSELECTs into subs with unsigned saturation.
38755 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
38756 // psubus is available in SSE2 for i8 and i16 vectors.
38757 Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
38758 isPowerOf2_32(VT.getVectorNumElements()) &&
38759 (VT.getVectorElementType() == MVT::i8 ||
38760 VT.getVectorElementType() == MVT::i16)) {
38761 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
38762
38763 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
38764 // left side invert the predicate to simplify logic below.
38765 SDValue Other;
38766 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
38767 Other = RHS;
38768 CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
38769 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
38770 Other = LHS;
38771 }
38772
38773 if (Other.getNode() && Other->getNumOperands() == 2 &&
38774 Other->getOperand(0) == Cond.getOperand(0)) {
38775 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
38776 SDValue CondRHS = Cond->getOperand(1);
38777
38778 // Look for a general sub with unsigned saturation first.
38779 // x >= y ? x-y : 0 --> subus x, y
38780 // x > y ? x-y : 0 --> subus x, y
38781 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
38782 Other->getOpcode() == ISD::SUB && OpRHS == CondRHS)
38783 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
38784
38785 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) {
38786 if (isa<BuildVectorSDNode>(CondRHS)) {
38787 // If the RHS is a constant we have to reverse the const
38788 // canonicalization.
38789 // x > C-1 ? x+-C : 0 --> subus x, C
38790 auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
38791 return (!Op && !Cond) ||
38792 (Op && Cond &&
38793 Cond->getAPIntValue() == (-Op->getAPIntValue() - 1));
38794 };
38795 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
38796 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT,
38797 /*AllowUndefs*/ true)) {
38798 OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
38799 OpRHS);
38800 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
38801 }
38802
38803 // Another special case: If C was a sign bit, the sub has been
38804 // canonicalized into a xor.
38805 // FIXME: Would it be better to use computeKnownBits to determine
38806 // whether it's safe to decanonicalize the xor?
38807 // x s< 0 ? x^C : 0 --> subus x, C
38808 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
38809 if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
38810 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
38811 OpRHSConst->getAPIntValue().isSignMask()) {
38812 // Note that we have to rebuild the RHS constant here to ensure we
38813 // don't rely on particular values of undef lanes.
38814 OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
38815 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
38816 }
38817 }
38818 }
38819 }
38820 }
38821 }
38822
38823 // Match VSELECTs into add with unsigned saturation.
38824 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
38825 // paddus is available in SSE2 for i8 and i16 vectors.
38826 Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
38827 isPowerOf2_32(VT.getVectorNumElements()) &&
38828 (VT.getVectorElementType() == MVT::i8 ||
38829 VT.getVectorElementType() == MVT::i16)) {
38830 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
38831
38832 SDValue CondLHS = Cond->getOperand(0);
38833 SDValue CondRHS = Cond->getOperand(1);
38834
38835 // Check if one of the arms of the VSELECT is a vector with all bits set.
38836 // If it's on the left side invert the predicate to simplify logic below.
38837 SDValue Other;
38838 if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
38839 Other = RHS;
38840 CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
38841 } else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
38842 Other = LHS;
38843 }
38844
38845 if (Other.getNode() && Other.getOpcode() == ISD::ADD) {
38846 SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
38847
38848 // Canonicalize condition operands.
38849 if (CC == ISD::SETUGE) {
38850 std::swap(CondLHS, CondRHS);
38851 CC = ISD::SETULE;
38852 }
38853
38854 // We can test against either of the addition operands.
38855 // x <= x+y ? x+y : ~0 --> addus x, y
38856 // x+y >= x ? x+y : ~0 --> addus x, y
38857 if (CC == ISD::SETULE && Other == CondRHS &&
38858 (OpLHS == CondLHS || OpRHS == CondLHS))
38859 return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
38860
38861 if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) &&
38862 CondLHS == OpLHS) {
38863 // If the RHS is a constant we have to reverse the const
38864 // canonicalization.
38865 // x > ~C ? x+C : ~0 --> addus x, C
38866 auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
38867 return Cond->getAPIntValue() == ~Op->getAPIntValue();
38868 };
38869 if (CC == ISD::SETULE &&
38870 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
38871 return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
38872 }
38873 }
38874 }
38875
38876 // Check if the first operand is all zeros and Cond type is vXi1.
38877 // If this is an avx512 target we can improve the use of zero masking by
38878 // swapping the operands and inverting the condition.
38879 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
38880 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
38881 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
38882 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
38883 // Invert the cond to not(cond) : xor(op,allones)=not(op)
38884 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
38885 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
38886 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
38887 }
38888
38889 // Early exit check
38890 if (!TLI.isTypeLegal(VT))
38891 return SDValue();
38892
38893 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
38894 return V;
38895
38896 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
38897 return V;
38898
38899 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
38900 return V;
38901
38902 // select(~Cond, X, Y) -> select(Cond, Y, X)
38903 if (CondVT.getScalarType() != MVT::i1)
38904 if (SDValue CondNot = IsNOT(Cond, DAG))
38905 return DAG.getNode(N->getOpcode(), DL, VT,
38906 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
38907
38908 // Try to optimize vXi1 selects if both operands are either all constants or
38909 // bitcasts from scalar integer type. In that case we can convert the operands
38910 // to integer and use an integer select which will be converted to a CMOV.
38911 // We need to take a little bit of care to avoid creating an i64 type after
38912 // type legalization.
38913 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
38914 VT.getVectorElementType() == MVT::i1 &&
38915 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
38916 MVT IntVT = MVT::getIntegerVT(VT.getVectorNumElements());
38917 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
38918 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
38919
38920 if ((LHSIsConst ||
38921 (LHS.getOpcode() == ISD::BITCAST &&
38922 LHS.getOperand(0).getValueType() == IntVT)) &&
38923 (RHSIsConst ||
38924 (RHS.getOpcode() == ISD::BITCAST &&
38925 RHS.getOperand(0).getValueType() == IntVT))) {
38926 if (LHSIsConst)
38927 LHS = combinevXi1ConstantToInteger(LHS, DAG);
38928 else
38929 LHS = LHS.getOperand(0);
38930
38931 if (RHSIsConst)
38932 RHS = combinevXi1ConstantToInteger(RHS, DAG);
38933 else
38934 RHS = RHS.getOperand(0);
38935
38936 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
38937 return DAG.getBitcast(VT, Select);
38938 }
38939 }
38940
38941 return SDValue();
38942}
38943
38944/// Combine:
38945/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
38946/// to:
38947/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
38948/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
38949/// Note that this is only legal for some op/cc combinations.
38950static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
38951 SelectionDAG &DAG,
38952 const X86Subtarget &Subtarget) {
38953 // This combine only operates on CMP-like nodes.
38954 if (!(Cmp.getOpcode() == X86ISD::CMP ||
38955 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
38956 return SDValue();
38957
38958 // Can't replace the cmp if it has more uses than the one we're looking at.
38959 // FIXME: We would like to be able to handle this, but would need to make sure
38960 // all uses were updated.
38961 if (!Cmp.hasOneUse())
38962 return SDValue();
38963
38964 // This only applies to variations of the common case:
38965 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
38966 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
38967 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
38968 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
38969 // Using the proper condcodes (see below), overflow is checked for.
38970
38971 // FIXME: We can generalize both constraints:
38972 // - XOR/OR/AND (if they were made to survive AtomicExpand)
38973 // - LHS != 1
38974 // if the result is compared.
38975
38976 SDValue CmpLHS = Cmp.getOperand(0);
38977 SDValue CmpRHS = Cmp.getOperand(1);
38978
38979 if (!CmpLHS.hasOneUse())
38980 return SDValue();
38981
38982 unsigned Opc = CmpLHS.getOpcode();
38983 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
38984 return SDValue();
38985
38986 SDValue OpRHS = CmpLHS.getOperand(2);
38987 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
38988 if (!OpRHSC)
38989 return SDValue();
38990
38991 APInt Addend = OpRHSC->getAPIntValue();
38992 if (Opc == ISD::ATOMIC_LOAD_SUB)
38993 Addend = -Addend;
38994
38995 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
38996 if (!CmpRHSC)
38997 return SDValue();
38998
38999 APInt Comparison = CmpRHSC->getAPIntValue();
39000
39001 // If the addend is the negation of the comparison value, then we can do
39002 // a full comparison by emitting the atomic arithmetic as a locked sub.
39003 if (Comparison == -Addend) {
39004 // The CC is fine, but we need to rewrite the LHS of the comparison as an
39005 // atomic sub.
39006 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
39007 auto AtomicSub = DAG.getAtomic(
39008 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
39009 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
39010 /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
39011 AN->getMemOperand());
39012 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
39013 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
39014 DAG.getUNDEF(CmpLHS.getValueType()));
39015 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
39016 return LockOp;
39017 }
39018
39019 // We can handle comparisons with zero in a number of cases by manipulating
39020 // the CC used.
39021 if (!Comparison.isNullValue())
39022 return SDValue();
39023
39024 if (CC == X86::COND_S && Addend == 1)
39025 CC = X86::COND_LE;
39026 else if (CC == X86::COND_NS && Addend == 1)
39027 CC = X86::COND_G;
39028 else if (CC == X86::COND_G && Addend == -1)
39029 CC = X86::COND_GE;
39030 else if (CC == X86::COND_LE && Addend == -1)
39031 CC = X86::COND_L;
39032 else
39033 return SDValue();
39034
39035 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
39036 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
39037 DAG.getUNDEF(CmpLHS.getValueType()));
39038 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
39039 return LockOp;
39040}
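// Worked example (an illustrative sketch; the C source shape is hypothetical):
// the COND_S / Addend == 1 row above corresponds to code such as
//   if (__atomic_fetch_add(&x, 1, __ATOMIC_SEQ_CST) < 0) ...
// Rather than keeping the old value and comparing it against 0 separately,
// the combine keeps the LOCKed add and re-tests its own EFLAGS with COND_LE,
// since old < 0 is equivalent to old + 1 <= 0 once overflow is covered by the
// chosen condition code.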
39041
39042// Check whether a boolean test is testing a boolean value generated by
39043// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
39044// code.
39045//
39046// Simplify the following patterns:
39047// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
39048// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
39049// to (Op EFLAGS Cond)
39050//
39051// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
39052// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
39053// to (Op EFLAGS !Cond)
39054//
39055// where Op could be BRCOND or CMOV.
39056//
39057static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
39058 // This combine only operates on CMP-like nodes.
39059 if (!(Cmp.getOpcode() == X86ISD::CMP ||
39060 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
39061 return SDValue();
39062
39063 // Quit if not used as a boolean value.
39064 if (CC != X86::COND_E && CC != X86::COND_NE)
39065 return SDValue();
39066
39067 // Check CMP operands. One of them should be 0 or 1 and the other should be
39068 // a SetCC or extended from it.
39069 SDValue Op1 = Cmp.getOperand(0);
39070 SDValue Op2 = Cmp.getOperand(1);
39071
39072 SDValue SetCC;
39073 const ConstantSDNode* C = nullptr;
39074 bool needOppositeCond = (CC == X86::COND_E);
39075 bool checkAgainstTrue = false; // Is it a comparison against 1?
39076
39077 if ((C = dyn_cast<ConstantSDNode>(Op1)))
39078 SetCC = Op2;
39079 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
39080 SetCC = Op1;
39081 else // Quit if neither operand is a constant.
39082 return SDValue();
39083
39084 if (C->getZExtValue() == 1) {
39085 needOppositeCond = !needOppositeCond;
39086 checkAgainstTrue = true;
39087 } else if (C->getZExtValue() != 0)
39088 // Quit if the constant is neither 0 nor 1.
39089 return SDValue();
39090
39091 bool truncatedToBoolWithAnd = false;
39092 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
39093 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
39094 SetCC.getOpcode() == ISD::TRUNCATE ||
39095 SetCC.getOpcode() == ISD::AND) {
39096 if (SetCC.getOpcode() == ISD::AND) {
39097 int OpIdx = -1;
39098 if (isOneConstant(SetCC.getOperand(0)))
39099 OpIdx = 1;
39100 if (isOneConstant(SetCC.getOperand(1)))
39101 OpIdx = 0;
39102 if (OpIdx < 0)
39103 break;
39104 SetCC = SetCC.getOperand(OpIdx);
39105 truncatedToBoolWithAnd = true;
39106 } else
39107 SetCC = SetCC.getOperand(0);
39108 }
39109
39110 switch (SetCC.getOpcode()) {
39111 case X86ISD::SETCC_CARRY:
39112 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
39113 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
39114 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
39115 // truncated to i1 using 'and'.
39116 if (checkAgainstTrue && !truncatedToBoolWithAnd)
39117 break;
39118 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
39119 "Invalid use of SETCC_CARRY!");
39120 LLVM_FALLTHROUGH;
39121 case X86ISD::SETCC:
39122 // Set the condition code or opposite one if necessary.
39123 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
39124 if (needOppositeCond)
39125 CC = X86::GetOppositeBranchCondition(CC);
39126 return SetCC.getOperand(1);
39127 case X86ISD::CMOV: {
39128 // Check whether false/true value has canonical one, i.e. 0 or 1.
39129 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
39130 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
39131 // Quit if true value is not a constant.
39132 if (!TVal)
39133 return SDValue();
39134 // Quit if false value is not a constant.
39135 if (!FVal) {
39136 SDValue Op = SetCC.getOperand(0);
39137 // Skip 'zext' or 'trunc' node.
39138 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
39139 Op.getOpcode() == ISD::TRUNCATE)
39140 Op = Op.getOperand(0);
39141 // A special case for rdrand/rdseed, where 0 is set if false cond is
39142 // found.
39143 if ((Op.getOpcode() != X86ISD::RDRAND &&
39144 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
39145 return SDValue();
39146 }
39147 // Quit if false value is not the constant 0 or 1.
39148 bool FValIsFalse = true;
39149 if (FVal && FVal->getZExtValue() != 0) {
39150 if (FVal->getZExtValue() != 1)
39151 return SDValue();
39152 // If FVal is 1, opposite cond is needed.
39153 needOppositeCond = !needOppositeCond;
39154 FValIsFalse = false;
39155 }
39156 // Quit if TVal is not the constant opposite of FVal.
39157 if (FValIsFalse && TVal->getZExtValue() != 1)
39158 return SDValue();
39159 if (!FValIsFalse && TVal->getZExtValue() != 0)
39160 return SDValue();
39161 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
39162 if (needOppositeCond)
39163 CC = X86::GetOppositeBranchCondition(CC);
39164 return SetCC.getOperand(3);
39165 }
39166 }
39167
39168 return SDValue();
39169}
39170
39171/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
39172/// Match:
39173/// (X86or (X86setcc) (X86setcc))
39174/// (X86cmp (and (X86setcc) (X86setcc)), 0)
39175static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
39176 X86::CondCode &CC1, SDValue &Flags,
39177 bool &isAnd) {
39178 if (Cond->getOpcode() == X86ISD::CMP) {
39179 if (!isNullConstant(Cond->getOperand(1)))
39180 return false;
39181
39182 Cond = Cond->getOperand(0);
39183 }
39184
39185 isAnd = false;
39186
39187 SDValue SetCC0, SetCC1;
39188 switch (Cond->getOpcode()) {
39189 default: return false;
39190 case ISD::AND:
39191 case X86ISD::AND:
39192 isAnd = true;
39193 LLVM_FALLTHROUGH;
39194 case ISD::OR:
39195 case X86ISD::OR:
39196 SetCC0 = Cond->getOperand(0);
39197 SetCC1 = Cond->getOperand(1);
39198 break;
39199 };
39200
39201 // Make sure we have SETCC nodes, using the same flags value.
39202 if (SetCC0.getOpcode() != X86ISD::SETCC ||
39203 SetCC1.getOpcode() != X86ISD::SETCC ||
39204 SetCC0->getOperand(1) != SetCC1->getOperand(1))
39205 return false;
39206
39207 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
39208 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
39209 Flags = SetCC0->getOperand(1);
39210 return true;
39211}
39212
39213// When legalizing carry, we create carries via add X, -1
39214// If that comes from an actual carry, via setcc, we use the
39215// carry directly.
39216static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
39217 if (EFLAGS.getOpcode() == X86ISD::ADD) {
39218 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
39219 SDValue Carry = EFLAGS.getOperand(0);
39220 while (Carry.getOpcode() == ISD::TRUNCATE ||
39221 Carry.getOpcode() == ISD::ZERO_EXTEND ||
39222 Carry.getOpcode() == ISD::SIGN_EXTEND ||
39223 Carry.getOpcode() == ISD::ANY_EXTEND ||
39224 (Carry.getOpcode() == ISD::AND &&
39225 isOneConstant(Carry.getOperand(1))))
39226 Carry = Carry.getOperand(0);
39227 if (Carry.getOpcode() == X86ISD::SETCC ||
39228 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
39229 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
39230 uint64_t CarryCC = Carry.getConstantOperandVal(0);
39231 SDValue CarryOp1 = Carry.getOperand(1);
39232 if (CarryCC == X86::COND_B)
39233 return CarryOp1;
39234 if (CarryCC == X86::COND_A) {
39235 // Try to convert COND_A into COND_B in an attempt to facilitate
39236 // materializing "setb reg".
39237 //
39238 // Do not flip "e > c", where "c" is a constant, because Cmp
39239 // instruction cannot take an immediate as its first operand.
39240 //
39241 if (CarryOp1.getOpcode() == X86ISD::SUB &&
39242 CarryOp1.getNode()->hasOneUse() &&
39243 CarryOp1.getValueType().isInteger() &&
39244 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
39245 SDValue SubCommute =
39246 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
39247 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
39248 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
39249 }
39250 }
39251 // If this is a check of the z flag of an add with 1, switch to the
39252 // C flag.
39253 if (CarryCC == X86::COND_E &&
39254 CarryOp1.getOpcode() == X86ISD::ADD &&
39255 isOneConstant(CarryOp1.getOperand(1)))
39256 return CarryOp1;
39257 }
39258 }
39259 }
39260
39261 return SDValue();
39262}
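// Worked example (an illustrative sketch of the DAG shape, not exact node
// syntax): the legalized carry chains this targets look roughly like
//   t1 = X86ISD::SETCC COND_B, flags      // t1 is 1 iff CF was set
//   t2 = zext/trunc/and-with-1 of t1
//   t3 = X86ISD::ADD t2, -1               // CF of t3 equals t1 again
// so a COND_B user of t3's flags can be handed the original carry-producing
// flags (CarryOp1) directly instead of rebuilding the carry through the add.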
39263
39264/// Optimize an EFLAGS definition used according to the condition code \p CC
39265/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
39266/// uses of chain values.
39267static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
39268 SelectionDAG &DAG,
39269 const X86Subtarget &Subtarget) {
39270 if (CC == X86::COND_B)
39271 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
39272 return Flags;
39273
39274 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
39275 return R;
39276 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
39277}
39278
39279/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
39280static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
39281 TargetLowering::DAGCombinerInfo &DCI,
39282 const X86Subtarget &Subtarget) {
39283 SDLoc DL(N);
39284
39285 SDValue FalseOp = N->getOperand(0);
39286 SDValue TrueOp = N->getOperand(1);
39287 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
39288 SDValue Cond = N->getOperand(3);
39289
39290 // cmov X, X, ?, ? --> X
39291 if (TrueOp == FalseOp)
39292 return TrueOp;
39293
39294 // Try to simplify the EFLAGS and condition code operands.
39295 // We can't always do this as FCMOV only supports a subset of X86 cond.
39296 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
39297 if (!(FalseOp.getValueType() == MVT::f80 ||
39298 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
39299 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
39300 !Subtarget.hasCMov() || hasFPCMov(CC)) {
39301 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
39302 Flags};
39303 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
39304 }
39305 }
39306
39307 // If this is a select between two integer constants, try to do some
39308 // optimizations. Note that the operands are ordered the opposite of SELECT
39309 // operands.
39310 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
39311 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
39312 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
39313 // larger than FalseC (the false value).
39314 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
39315 CC = X86::GetOppositeBranchCondition(CC);
39316 std::swap(TrueC, FalseC);
39317 std::swap(TrueOp, FalseOp);
39318 }
39319
39320 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
39321 // This is efficient for any integer data type (including i8/i16) and
39322 // shift amount.
39323 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
39324 Cond = getSETCC(CC, Cond, DL, DAG);
39325
39326 // Zero extend the condition if needed.
39327 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
39328
39329 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
39330 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
39331 DAG.getConstant(ShAmt, DL, MVT::i8));
39332 return Cond;
39333 }
39334
39335 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
39336 // for any integer data type, including i8/i16.
39337 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
39338 Cond = getSETCC(CC, Cond, DL, DAG);
39339
39340 // Zero extend the condition if needed.
39341 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
39342 FalseC->getValueType(0), Cond);
39343 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
39344 SDValue(FalseC, 0));
39345 return Cond;
39346 }
39347
39348 // Optimize cases that will turn into an LEA instruction. This requires
39349 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
39350 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
39351 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
39352 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
39353 "Implicit constant truncation");
39354
39355 bool isFastMultiplier = false;
39356 if (Diff.ult(10)) {
39357 switch (Diff.getZExtValue()) {
39358 default: break;
39359 case 1: // result = add base, cond
39360 case 2: // result = lea base( , cond*2)
39361 case 3: // result = lea base(cond, cond*2)
39362 case 4: // result = lea base( , cond*4)
39363 case 5: // result = lea base(cond, cond*4)
39364 case 8: // result = lea base( , cond*8)
39365 case 9: // result = lea base(cond, cond*8)
39366 isFastMultiplier = true;
39367 break;
39368 }
39369 }
39370
39371 if (isFastMultiplier) {
39372 Cond = getSETCC(CC, Cond, DL, DAG);
39373 // Zero extend the condition if needed.
39374 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
39375 Cond);
39376 // Scale the condition by the difference.
39377 if (Diff != 1)
39378 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
39379 DAG.getConstant(Diff, DL, Cond.getValueType()));
39380
39381 // Add the base if non-zero.
39382 if (FalseC->getAPIntValue() != 0)
39383 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
39384 SDValue(FalseC, 0));
39385 return Cond;
39386 }
39387 }
39388 }
39389 }
39390
39391 // Handle these cases:
39392 // (select (x != c), e, c) -> select (x != c), e, x),
39393 // (select (x == c), c, e) -> select (x == c), x, e)
39394 // where the c is an integer constant, and the "select" is the combination
39395 // of CMOV and CMP.
39396 //
39397 // The rationale for this change is that the conditional-move from a constant
39398 // needs two instructions; however, conditional-move from a register needs
39399 // only one instruction.
39400 //
39401 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
39402 // some instruction-combining opportunities. This opt needs to be
39403 // postponed as late as possible.
39404 //
39405 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
39406 // the DCI.xxxx conditions are provided to postpone the optimization as
39407 // late as possible.
39408
39409 ConstantSDNode *CmpAgainst = nullptr;
39410 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
39411 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
39412 !isa<ConstantSDNode>(Cond.getOperand(0))) {
39413
39414 if (CC == X86::COND_NE &&
39415 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
39416 CC = X86::GetOppositeBranchCondition(CC);
39417 std::swap(TrueOp, FalseOp);
39418 }
39419
39420 if (CC == X86::COND_E &&
39421 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
39422 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
39423 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
39424 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
39425 }
39426 }
39427 }
39428
39429 // Fold and/or of setcc's to double CMOV:
39430 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
39431 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
39432 //
39433 // This combine lets us generate:
39434 // cmovcc1 (jcc1 if we don't have CMOV)
39435 // cmovcc2 (same)
39436 // instead of:
39437 // setcc1
39438 // setcc2
39439 // and/or
39440 // cmovne (jne if we don't have CMOV)
39441 // When we can't use the CMOV instruction, it might increase branch
39442 // mispredicts.
39443 // When we can use CMOV, or when there is no mispredict, this improves
39444 // throughput and reduces register pressure.
39445 //
39446 if (CC == X86::COND_NE) {
39447 SDValue Flags;
39448 X86::CondCode CC0, CC1;
39449 bool isAndSetCC;
39450 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
39451 if (isAndSetCC) {
39452 std::swap(FalseOp, TrueOp);
39453 CC0 = X86::GetOppositeBranchCondition(CC0);
39454 CC1 = X86::GetOppositeBranchCondition(CC1);
39455 }
39456
39457 SDValue LOps[] = {FalseOp, TrueOp,
39458 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
39459 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
39460 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
39461 Flags};
39462 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
39463 return CMOV;
39464 }
39465 }
39466
39467 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
39468 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
39469 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
39470 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
39471 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
39472 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
39473 SDValue Add = TrueOp;
39474 SDValue Const = FalseOp;
39475 // Canonicalize the condition code for easier matching and output.
39476 if (CC == X86::COND_E)
39477 std::swap(Add, Const);
39478
39479 // We might have replaced the constant in the cmov with the LHS of the
39480 // compare. If so change it to the RHS of the compare.
39481 if (Const == Cond.getOperand(0))
39482 Const = Cond.getOperand(1);
39483
39484 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
39485 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
39486 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
39487 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
39488 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
39489 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
39490 EVT VT = N->getValueType(0);
39491 // This should constant fold.
39492 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
39493 SDValue CMov =
39494 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
39495 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
39496 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
39497 }
39498 }
39499
39500 return SDValue();
39501}
39502
39503/// Different mul shrinking modes.
39504enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
39505
39506static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
39507 EVT VT = N->getOperand(0).getValueType();
39508 if (VT.getScalarSizeInBits() != 32)
39509 return false;
39510
39511 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
39512 unsigned SignBits[2] = {1, 1};
39513 bool IsPositive[2] = {false, false};
39514 for (unsigned i = 0; i < 2; i++) {
39515 SDValue Opd = N->getOperand(i);
39516
39517 SignBits[i] = DAG.ComputeNumSignBits(Opd);
39518 IsPositive[i] = DAG.SignBitIsZero(Opd);
39519 }
39520
39521 bool AllPositive = IsPositive[0] && IsPositive[1];
39522 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
39523 // When ranges are from -128 ~ 127, use MULS8 mode.
39524 if (MinSignBits >= 25)
39525 Mode = ShrinkMode::MULS8;
39526 // When ranges are from 0 ~ 255, use MULU8 mode.
39527 else if (AllPositive && MinSignBits >= 24)
39528 Mode = ShrinkMode::MULU8;
39529 // When ranges are from -32768 ~ 32767, use MULS16 mode.
39530 else if (MinSignBits >= 17)
39531 Mode = ShrinkMode::MULS16;
39532 // When ranges are from 0 ~ 65535, use MULU16 mode.
39533 else if (AllPositive && MinSignBits >= 16)
39534 Mode = ShrinkMode::MULU16;
39535 else
39536 return false;
39537 return true;
39538}
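// Note on the thresholds above (illustrative reasoning): with 32-bit
// operands, a value sign-extended from i8 has at least 32 - 8 + 1 = 25 known
// sign bits (MULS8); a zero-extended i8 has its top 24 bits clear and a zero
// sign bit (MULU8); the i16 cases need 17 and 16 bits by the same arithmetic.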
39539
39540/// When the operands of vector mul are extended from smaller size values,
39541/// like i8 and i16, the type of mul may be shrunk to generate more
39542/// efficient code. Two typical patterns are handled:
39543/// Pattern1:
39544/// %2 = sext/zext <N x i8> %1 to <N x i32>
39545/// %4 = sext/zext <N x i8> %3 to <N x i32>
39546/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
39547/// %5 = mul <N x i32> %2, %4
39548///
39549/// Pattern2:
39550/// %2 = zext/sext <N x i16> %1 to <N x i32>
39551/// %4 = zext/sext <N x i16> %3 to <N x i32>
39552/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
39553/// %5 = mul <N x i32> %2, %4
39554///
39555/// There are four mul shrinking modes:
39556/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
39557/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
39558/// generate pmullw+sext32 for it (MULS8 mode).
39559/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
39560/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
39561/// generate pmullw+zext32 for it (MULU8 mode).
39562/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
39563/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
39564/// generate pmullw+pmulhw for it (MULS16 mode).
39565/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
39566/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
39567/// generate pmullw+pmulhuw for it (MULU16 mode).
39568static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
39569 const X86Subtarget &Subtarget) {
39570 // Check for legality
39571 // pmullw/pmulhw are not available before SSE2.
39572 if (!Subtarget.hasSSE2())
39573 return SDValue();
39574
39575 // Check for profitability
39576 // pmulld is supported since SSE41. It is better to use pmulld
39577 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
39578 // the expansion.
39579 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
39580 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
39581 return SDValue();
39582
39583 ShrinkMode Mode;
39584 if (!canReduceVMulWidth(N, DAG, Mode))
39585 return SDValue();
39586
39587 SDLoc DL(N);
39588 SDValue N0 = N->getOperand(0);
39589 SDValue N1 = N->getOperand(1);
39590 EVT VT = N->getOperand(0).getValueType();
39591 unsigned NumElts = VT.getVectorNumElements();
39592 if ((NumElts % 2) != 0)
39593 return SDValue();
39594
39595 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
39596
39597 // Shrink the operands of mul.
39598 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
39599 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
39600
39601 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
39602 // lower part is needed.
39603 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
39604 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
39605 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
39606 : ISD::SIGN_EXTEND,
39607 DL, VT, MulLo);
39608
39609 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
39610 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
39611 // the higher part is also needed.
39612 SDValue MulHi =
39613 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
39614 ReducedVT, NewN0, NewN1);
39615
39616 // Repack the lower part and higher part result of mul into a wider
39617 // result.
39618 // Generate shuffle functioning as punpcklwd.
39619 SmallVector<int, 16> ShuffleMask(NumElts);
39620 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
39621 ShuffleMask[2 * i] = i;
39622 ShuffleMask[2 * i + 1] = i + NumElts;
39623 }
39624 SDValue ResLo =
39625 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
39626 ResLo = DAG.getBitcast(ResVT, ResLo);
39627 // Generate shuffle functioning as punpckhwd.
39628 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
39629 ShuffleMask[2 * i] = i + NumElts / 2;
39630 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
39631 }
39632 SDValue ResHi =
39633 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
39634 ResHi = DAG.getBitcast(ResVT, ResHi);
39635 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
39636}
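// Worked example (an illustrative sketch): for a v8i32 multiply, MulLo and
// MulHi are v8i16 and NumElts == 8, so the first shuffle mask is
// {0,8, 1,9, 2,10, 3,11} and the second is {4,12, 5,13, 6,14, 7,15} -- the
// punpcklwd/punpckhwd interleavings that pair each low 16-bit product with
// its high half before the bitcasts to v4i32 and the final CONCAT_VECTORS
// back to v8i32.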
39637
39638static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
39639 EVT VT, const SDLoc &DL) {
39640
39641 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
39642 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
39643 DAG.getConstant(Mult, DL, VT));
39644 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
39645 DAG.getConstant(Shift, DL, MVT::i8));
39646 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
39647 N->getOperand(0));
39648 return Result;
39649 };
39650
39651 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
39652 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
39653 DAG.getConstant(Mul1, DL, VT));
39654 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
39655 DAG.getConstant(Mul2, DL, VT));
39656 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
39657 N->getOperand(0));
39658 return Result;
39659 };
39660
39661 switch (MulAmt) {
39662 default:
39663 break;
39664 case 11:
39665 // mul x, 11 => add ((shl (mul x, 5), 1), x)
39666 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
39667 case 21:
39668 // mul x, 21 => add ((shl (mul x, 5), 2), x)
39669 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
39670 case 41:
39671 // mul x, 41 => add ((shl (mul x, 5), 3), x)
39672 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
39673 case 22:
39674 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
39675 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
39676 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
39677 case 19:
39678 // mul x, 19 => add ((shl (mul x, 9), 1), x)
39679 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
39680 case 37:
39681 // mul x, 37 => add ((shl (mul x, 9), 2), x)
39682 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
39683 case 73:
39684 // mul x, 73 => add ((shl (mul x, 9), 3), x)
39685 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
39686 case 13:
39687 // mul x, 13 => add ((shl (mul x, 3), 2), x)
39688 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
39689 case 23:
39690 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
39691 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
39692 case 26:
39693 // mul x, 26 => add ((mul (mul x, 5), 5), x)
39694 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
39695 case 28:
39696 // mul x, 28 => add ((mul (mul x, 9), 3), x)
39697 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
39698 case 29:
39699 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
39700 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
39701 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
39702 }
39703
39704 // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift followed
39705 // by a single LEA.
39706 // First check if this is a sum of two powers of 2 because that's easy. Then
39707 // count the trailing zeros up to the first set bit.
39708 // TODO: We can do this even without LEA at a cost of two shifts and an add.
39709 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
39710 unsigned ScaleShift = countTrailingZeros(MulAmt);
39711 if (ScaleShift >= 1 && ScaleShift < 4) {
39712 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
39713 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
39714 DAG.getConstant(ShiftAmt, DL, MVT::i8));
39715 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
39716 DAG.getConstant(ScaleShift, DL, MVT::i8));
39717 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
39718 }
39719 }
39720
39721 return SDValue();
39722}
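// Worked example (an illustrative sketch): for MulAmt == 20 (0b10100),
// clearing the lowest set bit leaves 16, a power of 2, so ScaleShift == 2 and
// ShiftAmt == 4 and the combine emits (x << 4) + (x << 2), i.e. one shift
// plus one scaled-index LEA.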
39723
39724// If the upper 17 bits of each element are zero then we can use PMADDWD,
39725// which is always at least as quick as PMULLD, except on KNL.
39726static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
39727 const X86Subtarget &Subtarget) {
39728 if (!Subtarget.hasSSE2())
39729 return SDValue();
39730
39731 if (Subtarget.isPMADDWDSlow())
39732 return SDValue();
39733
39734 EVT VT = N->getValueType(0);
39735
39736 // Only support vXi32 vectors.
39737 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
39738 return SDValue();
39739
39740 // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
39741 // Also allow v2i32 if it will be widened.
39742 MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
39743 if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT))
39744 return SDValue();
39745
39746 SDValue N0 = N->getOperand(0);
39747 SDValue N1 = N->getOperand(1);
39748
39749 // If we are zero extending two steps without SSE4.1, it's better to reduce
39750 // the vmul width instead.
39751 if (!Subtarget.hasSSE41() &&
39752 (N0.getOpcode() == ISD::ZERO_EXTEND &&
39753 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
39754 (N1.getOpcode() == ISD::ZERO_EXTEND &&
39755 N1.getOperand(0).getScalarValueSizeInBits() <= 8))
39756 return SDValue();
39757
39758 APInt Mask17 = APInt::getHighBitsSet(32, 17);
39759 if (!DAG.MaskedValueIsZero(N1, Mask17) ||
39760 !DAG.MaskedValueIsZero(N0, Mask17))
39761 return SDValue();
39762
39763 // Use SplitOpsAndApply to handle AVX splitting.
39764 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
39765 ArrayRef<SDValue> Ops) {
39766 MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
39767 return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
39768 };
39769 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
39770 { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
39771 PMADDWDBuilder);
39772}
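// Note on the Mask17 check above (illustrative reasoning): pmaddwd multiplies
// adjacent signed i16 lanes and adds each pair into one i32 lane. When the
// top 17 bits of every i32 element are zero, the odd i16 lanes of the bitcast
// operands are all zero and the even lanes are non-negative 15-bit values, so
// the horizontal add contributes nothing and each lane reduces to an ordinary
// 16x16->32 multiply that cannot overflow.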
39773
39774static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
39775 const X86Subtarget &Subtarget) {
39776 if (!Subtarget.hasSSE2())
39777 return SDValue();
39778
39779 EVT VT = N->getValueType(0);
39780
39781 // Only support vXi64 vectors.
39782 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
39783 VT.getVectorNumElements() < 2 ||
39784 !isPowerOf2_32(VT.getVectorNumElements()))
39785 return SDValue();
39786
39787 SDValue N0 = N->getOperand(0);
39788 SDValue N1 = N->getOperand(1);
39789
39790 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
39791 // 32-bits. We can lower with this if the sign bits stretch that far.
39792 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
39793 DAG.ComputeNumSignBits(N1) > 32) {
39794 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
39795 ArrayRef<SDValue> Ops) {
39796 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
39797 };
39798 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
39799 PMULDQBuilder, /*CheckBWI*/false);
39800 }
39801
39802 // If the upper bits are zero we can use a single pmuludq.
39803 APInt Mask = APInt::getHighBitsSet(64, 32);
39804 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
39805 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
39806 ArrayRef<SDValue> Ops) {
39807 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
39808 };
39809 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
39810 PMULUDQBuilder, /*CheckBWI*/false);
39811 }
39812
39813 return SDValue();
39814}
39815
39816/// Optimize a single multiply with constant into two operations in order to
39817/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
39818static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
39819 TargetLowering::DAGCombinerInfo &DCI,
39820 const X86Subtarget &Subtarget) {
39821 EVT VT = N->getValueType(0);
39822
39823 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
39824 return V;
39825
39826 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
39827 return V;
39828
39829 if (DCI.isBeforeLegalize() && VT.isVector())
39830 return reduceVMULWidth(N, DAG, Subtarget);
39831
39832 if (!MulConstantOptimization)
39833 return SDValue();
39834 // An imul is usually smaller than the alternative sequence.
39835 if (DAG.getMachineFunction().getFunction().hasMinSize())
39836 return SDValue();
39837
39838 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
39839 return SDValue();
39840
39841 if (VT != MVT::i64 && VT != MVT::i32)
39842 return SDValue();
39843
39844 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
39845 if (!C)
39846 return SDValue();
39847 if (isPowerOf2_64(C->getZExtValue()))
39848 return SDValue();
39849
39850 int64_t SignMulAmt = C->getSExtValue();
39851 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
39852 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
39853
39854 SDLoc DL(N);
39855 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
39856 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
39857 DAG.getConstant(AbsMulAmt, DL, VT));
39858 if (SignMulAmt < 0)
39859 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
39860 NewMul);
39861
39862 return NewMul;
39863 }
39864
39865 uint64_t MulAmt1 = 0;
39866 uint64_t MulAmt2 = 0;
39867 if ((AbsMulAmt % 9) == 0) {
39868 MulAmt1 = 9;
39869 MulAmt2 = AbsMulAmt / 9;
39870 } else if ((AbsMulAmt % 5) == 0) {
39871 MulAmt1 = 5;
39872 MulAmt2 = AbsMulAmt / 5;
39873 } else if ((AbsMulAmt % 3) == 0) {
39874 MulAmt1 = 3;
39875 MulAmt2 = AbsMulAmt / 3;
39876 }
39877
39878 SDValue NewMul;
39879 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
39880 if (MulAmt2 &&
39881 (isPowerOf2_64(MulAmt2) ||
39882 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
39883
39884 if (isPowerOf2_64(MulAmt2) &&
39885 !(SignMulAmt >= 0 && N->hasOneUse() &&
39886 N->use_begin()->getOpcode() == ISD::ADD))
39887 // If the second multiplier is pow2, issue it first. We want the multiply by
39888 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
39889 // is an add. Only do this for positive multiply amounts since the
39890 // negate would prevent it from being used as an address mode anyway.
39891 std::swap(MulAmt1, MulAmt2);
39892
39893 if (isPowerOf2_64(MulAmt1))
39894 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
39895 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
39896 else
39897 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
39898 DAG.getConstant(MulAmt1, DL, VT));
39899
39900 if (isPowerOf2_64(MulAmt2))
39901 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
39902 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
39903 else
39904 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
39905 DAG.getConstant(MulAmt2, DL, VT));
39906
39907 // Negate the result.
39908 if (SignMulAmt < 0)
39909 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
39910 NewMul);
39911 } else if (!Subtarget.slowLEA())
39912 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
39913
39914 if (!NewMul) {
39915 assert(C->getZExtValue() != 0 &&
39916 C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
39917 "Both cases that could cause potential overflows should have "
39918 "already been handled.");
39919 if (isPowerOf2_64(AbsMulAmt - 1)) {
39920 // (mul x, 2^N + 1) => (add (shl x, N), x)
39921 NewMul = DAG.getNode(
39922 ISD::ADD, DL, VT, N->getOperand(0),
39923 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
39924 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
39925 MVT::i8)));
39926 // To negate, subtract the number from zero
39927 if (SignMulAmt < 0)
39928 NewMul = DAG.getNode(ISD::SUB, DL, VT,
39929 DAG.getConstant(0, DL, VT), NewMul);
39930 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
39931 // (mul x, 2^N - 1) => (sub (shl x, N), x)
39932 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
39933 DAG.getConstant(Log2_64(AbsMulAmt + 1),
39934 DL, MVT::i8));
39935 // To negate, reverse the operands of the subtract.
39936 if (SignMulAmt < 0)
39937 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
39938 else
39939 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
39940 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
39941 // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
39942 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
39943 DAG.getConstant(Log2_64(AbsMulAmt - 2),
39944 DL, MVT::i8));
39945 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
39946 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
39947 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
39948 // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
39949 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
39950 DAG.getConstant(Log2_64(AbsMulAmt + 2),
39951 DL, MVT::i8));
39952 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
39953 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
39954 }
39955 }
39956
39957 return NewMul;
39958}
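// Illustrative sketch (standalone, not part of this file): the scalar
// multiply-by-constant decompositions used in the combine above, checked with
// plain uint64_t arithmetic. Helper names are hypothetical; negation of the
// 2^N +/- 1 cases is handled in the combine by subtracting from zero or by
// swapping the subtract operands.
#include <cstdint>
#include <cassert>

static uint64_t mulPow2Plus1(uint64_t X, unsigned N)  { return (X << N) + X; }       // x * (2^N + 1)
static uint64_t mulPow2Minus1(uint64_t X, unsigned N) { return (X << N) - X; }       // x * (2^N - 1)
static uint64_t mulPow2Plus2(uint64_t X, unsigned N)  { return ((X << N) + X) + X; } // x * (2^N + 2)
static uint64_t mulPow2Minus2(uint64_t X, unsigned N) { return ((X << N) - X) - X; } // x * (2^N - 2)

static void checkMulDecompositions() {
  for (uint64_t X : {0ull, 1ull, 7ull, 12345ull}) {
    assert(mulPow2Plus1(X, 3)  == X * 9);   // 2^3 + 1
    assert(mulPow2Minus1(X, 3) == X * 7);   // 2^3 - 1
    assert(mulPow2Plus2(X, 3)  == X * 10);  // 2^3 + 2
    assert(mulPow2Minus2(X, 3) == X * 6);   // 2^3 - 2
  }
}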
39959
39960static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
39961 SDValue N0 = N->getOperand(0);
39962 SDValue N1 = N->getOperand(1);
39963 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
39964 EVT VT = N0.getValueType();
39965
39966 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
39967 // since the result of setcc_c is all zeros or all ones.
39968 if (VT.isInteger() && !VT.isVector() &&
39969 N1C && N0.getOpcode() == ISD::AND &&
39970 N0.getOperand(1).getOpcode() == ISD::Constant) {
39971 SDValue N00 = N0.getOperand(0);
39972 APInt Mask = N0.getConstantOperandAPInt(1);
39973 Mask <<= N1C->getAPIntValue();
39974 bool MaskOK = false;
39975 // We can handle cases concerning bit-widening nodes containing setcc_c if
39976 // we carefully interrogate the mask to make sure we are semantics
39977 // preserving.
39978 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
39979 // of the underlying setcc_c operation if the setcc_c was zero extended.
39980 // Consider the following example:
39981 // zext(setcc_c) -> i32 0x0000FFFF
39982 // c1 -> i32 0x0000FFFF
39983 // c2 -> i32 0x00000001
39984 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
39985 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
39986 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
39987 MaskOK = true;
39988 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
39989 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
39990 MaskOK = true;
39991 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
39992 N00.getOpcode() == ISD::ANY_EXTEND) &&
39993 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
39994 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
39995 }
39996 if (MaskOK && Mask != 0) {
39997 SDLoc DL(N);
39998 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
39999 }
40000 }
40001
40002 // Hardware support for vector shifts is sparse which makes us scalarize the
40003 // vector operations in many cases. Also, on sandybridge ADD is faster than
40004 // shl.
40005 // (shl V, 1) -> add V,V
40006 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
40007 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
40008       assert(N0.getValueType().isVector() && "Invalid vector shift type");
40009 // We shift all of the values by one. In many cases we do not have
40010 // hardware support for this operation. This is better expressed as an ADD
40011 // of two values.
40012 if (N1SplatC->isOne())
40013 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
40014 }
40015
40016 return SDValue();
40017}
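// Illustrative sketch (standalone, not part of this file): scalar checks of
// the two folds above, assuming the shifted value is an all-zeros/all-ones
// pattern such as SETCC_CARRY. Helper name is hypothetical.
#include <cstdint>
#include <cassert>

static void checkShlFolds(uint32_t C1, unsigned C2 /* < 32 */) {
  // (shl (and setcc_c, c1), c2) --> (and setcc_c, (c1 << c2))
  for (uint32_t SetccC : {0u, ~0u})
    assert(((SetccC & C1) << C2) == (SetccC & (C1 << C2)));
  // (shl V, 1) --> (add V, V)
  uint32_t V = 0x12345678u;
  assert((V << 1) == V + V);
}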
40018
40019static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
40020 SDValue N0 = N->getOperand(0);
40021 SDValue N1 = N->getOperand(1);
40022 EVT VT = N0.getValueType();
40023 unsigned Size = VT.getSizeInBits();
40024
40025 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
40026 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
40027 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
40028 // depending on sign of (SarConst - [56,48,32,24,16])
40029
40030 // sexts in X86 are MOVs. The MOVs have the same code size
40031 // as the SHIFTs above (only a SHIFT by 1 has a smaller encoding).
40032 // However, the MOVs have two advantages over a SHIFT:
40033 // 1. MOVs can write to a register that differs from the source.
40034 // 2. MOVs accept memory operands.
40035
40036 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
40037 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
40038 N0.getOperand(1).getOpcode() != ISD::Constant)
40039 return SDValue();
40040
40041 SDValue N00 = N0.getOperand(0);
40042 SDValue N01 = N0.getOperand(1);
40043 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
40044 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
40045 EVT CVT = N1.getValueType();
40046
40047 if (SarConst.isNegative())
40048 return SDValue();
40049
40050 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
40051 unsigned ShiftSize = SVT.getSizeInBits();
40052 // Skip types without a corresponding sext/zext and ShlConst values
40053 // that are not one of [56,48,32,24,16].
40054 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
40055 continue;
40056 SDLoc DL(N);
40057 SDValue NN =
40058 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
40059 SarConst = SarConst - (Size - ShiftSize);
40060 if (SarConst == 0)
40061 return NN;
40062 else if (SarConst.isNegative())
40063 return DAG.getNode(ISD::SHL, DL, VT, NN,
40064 DAG.getConstant(-SarConst, DL, CVT));
40065 else
40066 return DAG.getNode(ISD::SRA, DL, VT, NN,
40067 DAG.getConstant(SarConst, DL, CVT));
40068 }
40069 return SDValue();
40070}
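// Illustrative sketch (standalone, not part of this file): the i32 case of the
// fold above with ShlConst == 24, checked with plain integer arithmetic. It
// assumes two's-complement conversions and an arithmetic signed >> (guaranteed
// since C++20, universal in practice). Helper names are hypothetical.
#include <cstdint>
#include <cassert>

static int32_t sra(int32_t V, unsigned Amt) { return V >> Amt; }

static void checkAshrOfShl(uint32_t X) {
  int32_t SExt8 = static_cast<int32_t>((X & 0xFF) ^ 0x80) - 0x80;   // sext_inreg of the low i8
  int32_t Shl24 = static_cast<int32_t>(X << 24);
  assert(sra(Shl24, 24) == SExt8);                                  // SarConst == 24 -> plain sext
  assert(sra(Shl24, 26) == sra(SExt8, 2));                          // SarConst > 24 -> extra sra
  assert(sra(Shl24, 22) ==
         static_cast<int32_t>(static_cast<uint32_t>(SExt8) << 2));  // SarConst < 24 -> extra shl
}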
40071
40072static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
40073 TargetLowering::DAGCombinerInfo &DCI) {
40074 SDValue N0 = N->getOperand(0);
40075 SDValue N1 = N->getOperand(1);
40076 EVT VT = N0.getValueType();
40077
40078 // Only do this on the last DAG combine as it can interfere with other
40079 // combines.
40080 if (!DCI.isAfterLegalizeDAG())
40081 return SDValue();
40082
40083 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
40084 // TODO: This is a generic DAG combine that became an x86-only combine to
40085 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
40086 // and-not ('andn').
40087 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
40088 return SDValue();
40089
40090 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
40091 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
40092 if (!ShiftC || !AndC)
40093 return SDValue();
40094
40095 // If we can shrink the constant mask below 8-bits or 32-bits, then this
40096 // transform should reduce code size. It may also enable secondary transforms
40097 // from improved known-bits analysis or instruction selection.
40098 APInt MaskVal = AndC->getAPIntValue();
40099
40100 // If this can be matched by a zero extend, don't optimize.
40101 if (MaskVal.isMask()) {
40102 unsigned TO = MaskVal.countTrailingOnes();
40103 if (TO >= 8 && isPowerOf2_32(TO))
40104 return SDValue();
40105 }
40106
40107 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
40108 unsigned OldMaskSize = MaskVal.getMinSignedBits();
40109 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
40110 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
40111 (OldMaskSize > 32 && NewMaskSize <= 32)) {
40112 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
40113 SDLoc DL(N);
40114 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
40115 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
40116 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
40117 }
40118 return SDValue();
40119}
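// Illustrative sketch (standalone, not part of this file): the scalar identity
// behind the reorder, (x & C1) >> C2 == (x >> C2) & (C1 >> C2). Shifting first
// can shrink the mask constant, e.g. C1 = 0xFF0, C2 = 4 leaves the 8-bit mask
// 0xFF. Helper name is hypothetical.
#include <cstdint>
#include <cassert>

static void checkSrlOfAnd(uint64_t X, uint64_t AndC, unsigned ShiftC /* < 64 */) {
  assert(((X & AndC) >> ShiftC) == ((X >> ShiftC) & (AndC >> ShiftC)));
}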
40120
40121static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
40122 TargetLowering::DAGCombinerInfo &DCI,
40123 const X86Subtarget &Subtarget) {
40124 unsigned Opcode = N->getOpcode();
40125   assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
40126          "Unexpected shift opcode");
40127
40128 EVT VT = N->getValueType(0);
40129 SDValue N0 = N->getOperand(0);
40130 SDValue N1 = N->getOperand(1);
40131 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
40132 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
40133   assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
40134          N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
40135          "Unexpected PACKSS/PACKUS input type");
40136
40137 bool IsSigned = (X86ISD::PACKSS == Opcode);
40138
40139 // Constant Folding.
40140 APInt UndefElts0, UndefElts1;
40141 SmallVector<APInt, 32> EltBits0, EltBits1;
40142 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
40143 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
40144 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
40145 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
40146 unsigned NumLanes = VT.getSizeInBits() / 128;
40147 unsigned NumDstElts = VT.getVectorNumElements();
40148 unsigned NumSrcElts = NumDstElts / 2;
40149 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
40150 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
40151
40152 APInt Undefs(NumDstElts, 0);
40153 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
40154 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
40155 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
40156 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
40157 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
40158 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
40159
40160 if (UndefElts[SrcIdx]) {
40161 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
40162 continue;
40163 }
40164
40165 APInt &Val = EltBits[SrcIdx];
40166 if (IsSigned) {
40167 // PACKSS: Truncate signed value with signed saturation.
40168 // Source values less than dst minint are saturated to minint.
40169 // Source values greater than dst maxint are saturated to maxint.
40170 if (Val.isSignedIntN(DstBitsPerElt))
40171 Val = Val.trunc(DstBitsPerElt);
40172 else if (Val.isNegative())
40173 Val = APInt::getSignedMinValue(DstBitsPerElt);
40174 else
40175 Val = APInt::getSignedMaxValue(DstBitsPerElt);
40176 } else {
40177 // PACKUS: Truncate signed value with unsigned saturation.
40178 // Source values less than zero are saturated to zero.
40179 // Source values greater than dst maxuint are saturated to maxuint.
40180 if (Val.isIntN(DstBitsPerElt))
40181 Val = Val.trunc(DstBitsPerElt);
40182 else if (Val.isNegative())
40183 Val = APInt::getNullValue(DstBitsPerElt);
40184 else
40185 Val = APInt::getAllOnesValue(DstBitsPerElt);
40186 }
40187 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
40188 }
40189 }
40190
40191 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
40192 }
40193
40194 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
40195 // truncate to create a larger truncate.
40196 if (Subtarget.hasAVX512() &&
40197 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
40198 N0.getOperand(0).getValueType() == MVT::v8i32) {
40199 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
40200 (!IsSigned &&
40201 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
40202 if (Subtarget.hasVLX())
40203 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
40204
40205 // Widen input to v16i32 so we can truncate that.
40206 SDLoc dl(N);
40207 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
40208 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
40209 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
40210 }
40211 }
40212
40213 // Attempt to combine as shuffle.
40214 SDValue Op(N, 0);
40215 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
40216 return Res;
40217
40218 return SDValue();
40219}
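// Illustrative sketch (standalone, not part of this file): a scalar model of
// the per-element saturation the constant folding above performs for the
// i16 -> i8 case. Names are hypothetical; requires C++17 for std::clamp.
#include <cstdint>
#include <algorithm>
#include <cassert>

static int8_t packss16to8(int16_t V) {   // PACKSS: signed saturation
  return static_cast<int8_t>(std::clamp<int16_t>(V, INT8_MIN, INT8_MAX));
}
static uint8_t packus16to8(int16_t V) {  // PACKUS: unsigned saturation of a signed source
  return static_cast<uint8_t>(std::clamp<int16_t>(V, 0, UINT8_MAX));
}

static void checkPackSaturation() {
  assert(packss16to8(300) == 127 && packss16to8(-300) == -128 && packss16to8(5) == 5);
  assert(packus16to8(300) == 255 && packus16to8(-300) == 0 && packus16to8(5) == 5);
}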
40220
40221static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
40222 TargetLowering::DAGCombinerInfo &DCI,
40223 const X86Subtarget &Subtarget) {
40224   assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
40225           X86ISD::VSRL == N->getOpcode()) &&
40226          "Unexpected shift opcode");
40227 EVT VT = N->getValueType(0);
40228 SDValue N0 = N->getOperand(0);
40229 SDValue N1 = N->getOperand(1);
40230
40231 // Shift zero -> zero.
40232 if (ISD::isBuildVectorAllZeros(N0.getNode()))
40233 return DAG.getConstant(0, SDLoc(N), VT);
40234
40235 // Detect constant shift amounts.
40236 APInt UndefElts;
40237 SmallVector<APInt, 32> EltBits;
40238 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
40239 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
40240 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
40241 EltBits[0].getZExtValue(), DAG);
40242 }
40243
40244 APInt KnownUndef, KnownZero;
40245 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40246 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
40247 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
40248 KnownZero, DCI))
40249 return SDValue(N, 0);
40250
40251 return SDValue();
40252}
40253
40254static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
40255 TargetLowering::DAGCombinerInfo &DCI,
40256 const X86Subtarget &Subtarget) {
40257 unsigned Opcode = N->getOpcode();
40258   assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
40259           X86ISD::VSRLI == Opcode) &&
40260          "Unexpected shift opcode");
40261 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
40262 EVT VT = N->getValueType(0);
40263 SDValue N0 = N->getOperand(0);
40264 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
40265   assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
40266          "Unexpected value type");
40267   assert(N->getOperand(1).getValueType() == MVT::i8 &&
40268          "Unexpected shift amount type");
40269
40270 // Out of range logical bit shifts are guaranteed to be zero.
40271 // Out of range arithmetic bit shifts splat the sign bit.
40272 unsigned ShiftVal = N->getConstantOperandVal(1);
40273 if (ShiftVal >= NumBitsPerElt) {
40274 if (LogicalShift)
40275 return DAG.getConstant(0, SDLoc(N), VT);
40276 else
40277 ShiftVal = NumBitsPerElt - 1;
40278 }
40279
40280 // Shift N0 by zero -> N0.
40281 if (!ShiftVal)
40282 return N0;
40283
40284 // Shift zero -> zero.
40285 if (ISD::isBuildVectorAllZeros(N0.getNode()))
40286 return DAG.getConstant(0, SDLoc(N), VT);
40287
40288 // Fold (VSRAI (VSRAI X, C1), C2) --> (VSRAI X, (C1 + C2)) with (C1 + C2)
40289 // clamped to (NumBitsPerElt - 1).
40290 if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSRAI) {
40291 unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
40292 unsigned NewShiftVal = ShiftVal + ShiftVal2;
40293 if (NewShiftVal >= NumBitsPerElt)
40294 NewShiftVal = NumBitsPerElt - 1;
40295 return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0),
40296 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
40297 }
40298
40299 // We can decode 'whole byte' logical bit shifts as shuffles.
40300 if (LogicalShift && (ShiftVal % 8) == 0) {
40301 SDValue Op(N, 0);
40302 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
40303 return Res;
40304 }
40305
40306 // Constant Folding.
40307 APInt UndefElts;
40308 SmallVector<APInt, 32> EltBits;
40309 if (N->isOnlyUserOf(N0.getNode()) &&
40310 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
40311     assert(EltBits.size() == VT.getVectorNumElements() &&
40312            "Unexpected shift value type");
40313 for (APInt &Elt : EltBits) {
40314 if (X86ISD::VSHLI == Opcode)
40315 Elt <<= ShiftVal;
40316 else if (X86ISD::VSRAI == Opcode)
40317 Elt.ashrInPlace(ShiftVal);
40318 else
40319 Elt.lshrInPlace(ShiftVal);
40320 }
40321 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
40322 }
40323
40324 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40325 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
40326 APInt::getAllOnesValue(NumBitsPerElt), DCI))
40327 return SDValue(N, 0);
40328
40329 return SDValue();
40330}
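// Illustrative sketch (standalone, not part of this file): a scalar model of
// the immediate-shift rules above for i32 elements. Out-of-range logical
// shifts produce zero, out-of-range arithmetic shifts splat the sign bit, and
// two arithmetic shifts fold into one clamped shift amount. Names are
// hypothetical; assumes arithmetic signed >>.
#include <cstdint>
#include <algorithm>
#include <cassert>

static int32_t srai32(int32_t V, unsigned Amt) {
  return V >> std::min(Amt, 31u);                      // clamp like VSRAI
}

static void checkImmShiftRules(int32_t X) {
  assert(srai32(srai32(X, 20), 20) == srai32(X, 31));  // 20 + 20 clamps to 31
  assert(srai32(-1, 40) == -1 && srai32(1, 40) == 0);  // sign-bit splat vs. zero
}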
40331
40332static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
40333 TargetLowering::DAGCombinerInfo &DCI,
40334 const X86Subtarget &Subtarget) {
40335 EVT VT = N->getValueType(0);
40336   assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
40337           (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) &&
40338          "Unexpected vector insertion");
40339
40340 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
40341 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40342 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
40343 APInt::getAllOnesValue(NumBitsPerElt), DCI))
40344 return SDValue(N, 0);
40345
40346 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
40347 SDValue Op(N, 0);
40348 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
40349 return Res;
40350
40351 return SDValue();
40352}
40353
40354/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
40355/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
40356/// OR -> CMPNEQSS.
40357static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
40358 TargetLowering::DAGCombinerInfo &DCI,
40359 const X86Subtarget &Subtarget) {
40360 unsigned opcode;
40361
40362 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
40363 // we're requiring SSE2 for both.
40364 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
40365 SDValue N0 = N->getOperand(0);
40366 SDValue N1 = N->getOperand(1);
40367 SDValue CMP0 = N0.getOperand(1);
40368 SDValue CMP1 = N1.getOperand(1);
40369 SDLoc DL(N);
40370
40371 // The SETCCs should both refer to the same CMP.
40372 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
40373 return SDValue();
40374
40375 SDValue CMP00 = CMP0->getOperand(0);
40376 SDValue CMP01 = CMP0->getOperand(1);
40377 EVT VT = CMP00.getValueType();
40378
40379 if (VT == MVT::f32 || VT == MVT::f64) {
40380 bool ExpectingFlags = false;
40381 // Check for any users that want flags:
40382 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
40383 !ExpectingFlags && UI != UE; ++UI)
40384 switch (UI->getOpcode()) {
40385 default:
40386 case ISD::BR_CC:
40387 case ISD::BRCOND:
40388 case ISD::SELECT:
40389 ExpectingFlags = true;
40390 break;
40391 case ISD::CopyToReg:
40392 case ISD::SIGN_EXTEND:
40393 case ISD::ZERO_EXTEND:
40394 case ISD::ANY_EXTEND:
40395 break;
40396 }
40397
40398 if (!ExpectingFlags) {
40399 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
40400 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
40401
40402 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
40403 X86::CondCode tmp = cc0;
40404 cc0 = cc1;
40405 cc1 = tmp;
40406 }
40407
40408 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
40409 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
40410 // FIXME: need symbolic constants for these magic numbers.
40411 // See X86ATTInstPrinter.cpp:printSSECC().
40412 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
40413 if (Subtarget.hasAVX512()) {
40414 SDValue FSetCC =
40415 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
40416 DAG.getTargetConstant(x86cc, DL, MVT::i8));
40417 // Need to fill with zeros to ensure the bitcast will produce zeroes
40418 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
40419 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
40420 DAG.getConstant(0, DL, MVT::v16i1),
40421 FSetCC, DAG.getIntPtrConstant(0, DL));
40422 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
40423 N->getSimpleValueType(0));
40424 }
40425 SDValue OnesOrZeroesF =
40426 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
40427 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
40428
40429 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
40430 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
40431
40432 if (is64BitFP && !Subtarget.is64Bit()) {
40433 // On a 32-bit target, we cannot bitcast the 64-bit float to a
40434 // 64-bit integer, since that's not a legal type. Since
40435 // OnesOrZeroesF is all ones or all zeroes; we don't need all the
40436 // bits, but can do this little dance to extract the lowest 32 bits
40437 // and work with those going forward.
40438 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
40439 OnesOrZeroesF);
40440 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
40441 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
40442 Vector32, DAG.getIntPtrConstant(0, DL));
40443 IntVT = MVT::i32;
40444 }
40445
40446 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
40447 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
40448 DAG.getConstant(1, DL, IntVT));
40449 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
40450 ANDed);
40451 return OneBitOfTruth;
40452 }
40453 }
40454 }
40455 }
40456 return SDValue();
40457}
40458
40459/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
40460static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
40461   assert(N->getOpcode() == ISD::AND);
40462
40463 MVT VT = N->getSimpleValueType(0);
40464 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
40465 return SDValue();
40466
40467 SDValue X, Y;
40468 SDValue N0 = N->getOperand(0);
40469 SDValue N1 = N->getOperand(1);
40470
40471 if (SDValue Not = IsNOT(N0, DAG)) {
40472 X = Not;
40473 Y = N1;
40474 } else if (SDValue Not = IsNOT(N1, DAG)) {
40475 X = Not;
40476 Y = N0;
40477 } else
40478 return SDValue();
40479
40480 X = DAG.getBitcast(VT, X);
40481 Y = DAG.getBitcast(VT, Y);
40482 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
40483}
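// Illustrative sketch (standalone, not part of this file): ANDNP computes
// (~X) & Y, so the fold above only needs to recognize an explicit
// xor-with-all-ones feeding one AND operand. Helper name is hypothetical.
#include <cstdint>
#include <cassert>

static void checkAndNot(uint32_t X, uint32_t Y) {
  assert(((X ^ ~0u) & Y) == (~X & Y));   // (and (xor X, -1), Y) == andnp(X, Y)
}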
40484
40485// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
40486// logical operations, like in the example below.
40487// or (and (truncate x, truncate y)),
40488// (xor (truncate z, build_vector (constants)))
40489// Given a target type \p VT, we generate
40490// or (and x, y), (xor z, zext(build_vector (constants)))
40491 // provided x, y and z are of type \p VT. We can do so if each operand is either
40492 // a truncate from VT, a vector of constants (for the second operand), or can
40493 // itself be recursively promoted.
40494static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
40495 unsigned Depth) {
40496 // Limit recursion to avoid excessive compile times.
40497 if (Depth >= SelectionDAG::MaxRecursionDepth)
40498 return SDValue();
40499
40500 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
40501 N->getOpcode() != ISD::OR)
40502 return SDValue();
40503
40504 SDValue N0 = N->getOperand(0);
40505 SDValue N1 = N->getOperand(1);
40506 SDLoc DL(N);
40507
40508 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40509 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
40510 return SDValue();
40511
40512 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
40513 N0 = NN0;
40514 else {
40515 // The Left side has to be a trunc.
40516 if (N0.getOpcode() != ISD::TRUNCATE)
40517 return SDValue();
40518
40519 // The type of the truncated inputs.
40520 if (N0.getOperand(0).getValueType() != VT)
40521 return SDValue();
40522
40523 N0 = N0.getOperand(0);
40524 }
40525
40526 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
40527 N1 = NN1;
40528 else {
40529 // The right side has to be a 'trunc' or a constant vector.
40530 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
40531 N1.getOperand(0).getValueType() == VT;
40532 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
40533 return SDValue();
40534
40535 if (RHSTrunc)
40536 N1 = N1.getOperand(0);
40537 else
40538 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
40539 }
40540
40541 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
40542}
40543
40544// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
40545// register. In most cases we actually compare or select YMM-sized registers
40546// and mixing the two types creates horrible code. This method optimizes
40547// some of the transition sequences.
40548// Even with AVX-512 this is still useful for removing casts around logical
40549// operations on vXi1 mask types.
40550static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
40551 const X86Subtarget &Subtarget) {
40552 EVT VT = N->getValueType(0);
40553   assert(VT.isVector() && "Expected vector type");
40554
40555 SDLoc DL(N);
40556   assert((N->getOpcode() == ISD::ANY_EXTEND ||
40557           N->getOpcode() == ISD::ZERO_EXTEND ||
40558           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
40559
40560 SDValue Narrow = N->getOperand(0);
40561 EVT NarrowVT = Narrow.getValueType();
40562
40563 // Generate the wide operation.
40564 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
40565 if (!Op)
40566 return SDValue();
40567 switch (N->getOpcode()) {
40568   default: llvm_unreachable("Unexpected opcode");
40569 case ISD::ANY_EXTEND:
40570 return Op;
40571 case ISD::ZERO_EXTEND:
40572 return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
40573 case ISD::SIGN_EXTEND:
40574 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
40575 Op, DAG.getValueType(NarrowVT));
40576 }
40577}
40578
40579/// If both input operands of a logic op are being cast from floating point
40580/// types, try to convert this into a floating point logic node to avoid
40581/// unnecessary moves from SSE to integer registers.
40582static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
40583 const X86Subtarget &Subtarget) {
40584 EVT VT = N->getValueType(0);
40585 SDValue N0 = N->getOperand(0);
40586 SDValue N1 = N->getOperand(1);
40587 SDLoc DL(N);
40588
40589 if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
40590 return SDValue();
40591
40592 SDValue N00 = N0.getOperand(0);
40593 SDValue N10 = N1.getOperand(0);
40594 EVT N00Type = N00.getValueType();
40595 EVT N10Type = N10.getValueType();
40596
40597 // Ensure that both types are the same and are legal scalar fp types.
40598 if (N00Type != N10Type ||
40599 !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
40600 (Subtarget.hasSSE2() && N00Type == MVT::f64)))
40601 return SDValue();
40602
40603 unsigned FPOpcode;
40604 switch (N->getOpcode()) {
40605   default: llvm_unreachable("Unexpected input node for FP logic conversion");
40606 case ISD::AND: FPOpcode = X86ISD::FAND; break;
40607 case ISD::OR: FPOpcode = X86ISD::FOR; break;
40608 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
40609 }
40610
40611 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
40612 return DAG.getBitcast(VT, FPLogic);
40613}
40614
40615 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
40616 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
40617 /// with a shift-right to eliminate loading the vector constant mask value.
40618static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
40619 const X86Subtarget &Subtarget) {
40620 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
40621 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
40622 EVT VT0 = Op0.getValueType();
40623 EVT VT1 = Op1.getValueType();
40624
40625 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
40626 return SDValue();
40627
40628 APInt SplatVal;
40629 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
40630 !SplatVal.isMask())
40631 return SDValue();
40632
40633 // Don't prevent creation of ANDN.
40634 if (isBitwiseNot(Op0))
40635 return SDValue();
40636
40637 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
40638 return SDValue();
40639
40640 unsigned EltBitWidth = VT0.getScalarSizeInBits();
40641 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
40642 return SDValue();
40643
40644 SDLoc DL(N);
40645 unsigned ShiftVal = SplatVal.countTrailingOnes();
40646 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
40647 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
40648 return DAG.getBitcast(N->getValueType(0), Shift);
40649}
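// Illustrative sketch (standalone, not part of this file): when the masked
// value is known to be all-zeros or all-ones per element, and-ing with a
// low-bits mask of width K equals a logical shift right by (EltBits - K),
// which is the replacement emitted above. Helper name is hypothetical.
#include <cstdint>
#include <cassert>

static void checkMaskToShift(unsigned K /* 1..31 */) {
  uint32_t Mask = (1u << K) - 1;
  for (uint32_t X : {0u, ~0u})
    assert((X & Mask) == (X >> (32 - K)));
}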
40650
40651// Get the index node from the lowered DAG of a GEP IR instruction with one
40652// indexing dimension.
40653static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
40654 if (Ld->isIndexed())
40655 return SDValue();
40656
40657 SDValue Base = Ld->getBasePtr();
40658
40659 if (Base.getOpcode() != ISD::ADD)
40660 return SDValue();
40661
40662 SDValue ShiftedIndex = Base.getOperand(0);
40663
40664 if (ShiftedIndex.getOpcode() != ISD::SHL)
40665 return SDValue();
40666
40667 return ShiftedIndex.getOperand(0);
40668
40669}
40670
40671static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
40672 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
40673 switch (VT.getSizeInBits()) {
40674 default: return false;
40675 case 64: return Subtarget.is64Bit() ? true : false;
40676 case 32: return true;
40677 }
40678 }
40679 return false;
40680}
40681
40682 // This function recognizes cases where the X86 bzhi instruction can replace an
40683 // 'and-load' sequence.
40684 // When an integer value is loaded from an array of constants defined
40685 // as follows:
40686 //
40687 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
40688 //
40689 // and the result is then bitwise-anded with another input, the sequence is
40690 // equivalent to performing bzhi (zero high bits) on that input, using the
40691 // same index as the load.
40692static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
40693 const X86Subtarget &Subtarget) {
40694 MVT VT = Node->getSimpleValueType(0);
40695 SDLoc dl(Node);
40696
40697 // Check if subtarget has BZHI instruction for the node's type
40698 if (!hasBZHI(Subtarget, VT))
40699 return SDValue();
40700
40701 // Try matching the pattern for both operands.
40702 for (unsigned i = 0; i < 2; i++) {
40703 SDValue N = Node->getOperand(i);
40704 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
40705
40706 // continue if the operand is not a load instruction
40707 if (!Ld)
40708 return SDValue();
40709
40710 const Value *MemOp = Ld->getMemOperand()->getValue();
40711
40712 if (!MemOp)
40713 return SDValue();
40714
40715 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
40716 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
40717 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
40718
40719 Constant *Init = GV->getInitializer();
40720 Type *Ty = Init->getType();
40721 if (!isa<ConstantDataArray>(Init) ||
40722 !Ty->getArrayElementType()->isIntegerTy() ||
40723 Ty->getArrayElementType()->getScalarSizeInBits() !=
40724 VT.getSizeInBits() ||
40725 Ty->getArrayNumElements() >
40726 Ty->getArrayElementType()->getScalarSizeInBits())
40727 continue;
40728
40729 // Check if the array's constant elements are suitable to our case.
40730 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
40731 bool ConstantsMatch = true;
40732 for (uint64_t j = 0; j < ArrayElementCount; j++) {
40733 ConstantInt *Elem =
40734 dyn_cast<ConstantInt>(Init->getAggregateElement(j));
40735 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
40736 ConstantsMatch = false;
40737 break;
40738 }
40739 }
40740 if (!ConstantsMatch)
40741 continue;
40742
40743 // Do the transformation (For 32-bit type):
40744 // -> (and (load arr[idx]), inp)
40745 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
40746 // that will be replaced with one bzhi instruction.
40747 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
40748 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
40749
40750 // Get the Node which indexes into the array.
40751 SDValue Index = getIndexFromUnindexedLoad(Ld);
40752 if (!Index)
40753 return SDValue();
40754 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
40755
40756 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
40757 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
40758
40759 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
40760 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
40761
40762 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
40763 }
40764 }
40765 }
40766 }
40767 return SDValue();
40768}
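// Illustrative sketch (standalone, not part of this file): why the table load
// is redundant. The table holds (1 << i) - 1, so and-ing with arr[idx] clears
// all bits of the other input from position idx upward, which is what BZHI
// computes. The names and the idx == 0 guard are this sketch's own additions.
#include <cstdint>
#include <cassert>

static const uint32_t MaskTable[] = {0x0, 0x1, 0x3, 0x7, 0xF};   // (1 << i) - 1

static uint32_t bzhiModel(uint32_t Inp, unsigned Idx) {
  // Models the emitted (and Inp, (srl all-ones, (32 - Idx))); C++ needs the
  // Idx == 0 guard because a 32-bit shift by 32 is not defined here.
  return Idx == 0 ? 0 : Inp & (~0u >> (32 - Idx));
}

static void checkAndLoadToBZHI(uint32_t Inp) {
  for (unsigned Idx = 0; Idx != sizeof(MaskTable) / sizeof(MaskTable[0]); ++Idx)
    assert((Inp & MaskTable[Idx]) == bzhiModel(Inp, Idx));
}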
40769
40770// Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
40771// Turn it into series of XORs and a setnp.
40772static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
40773 const X86Subtarget &Subtarget) {
40774 EVT VT = N->getValueType(0);
40775
40776 // We only support 64-bit and 32-bit. 64-bit requires special handling
40777 // unless the 64-bit popcnt instruction is legal.
40778 if (VT != MVT::i32 && VT != MVT::i64)
40779 return SDValue();
40780
40781 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40782 if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT))
40783 return SDValue();
40784
40785 SDValue N0 = N->getOperand(0);
40786 SDValue N1 = N->getOperand(1);
40787
40788 // LHS needs to be a single use CTPOP.
40789 if (N0.getOpcode() != ISD::CTPOP || !N0.hasOneUse())
40790 return SDValue();
40791
40792 // RHS needs to be 1.
40793 if (!isOneConstant(N1))
40794 return SDValue();
40795
40796 SDLoc DL(N);
40797 SDValue X = N0.getOperand(0);
40798
40799 // If this is 64-bit, it's always best to xor the two 32-bit pieces together
40800 // even if we have popcnt.
40801 if (VT == MVT::i64) {
40802 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
40803 DAG.getNode(ISD::SRL, DL, VT, X,
40804 DAG.getConstant(32, DL, MVT::i8)));
40805 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
40806 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
40807 // Generate a 32-bit parity idiom. This will bring us back here if we need
40808 // to expand it too.
40809 SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32,
40810 DAG.getNode(ISD::CTPOP, DL, MVT::i32, X),
40811 DAG.getConstant(1, DL, MVT::i32));
40812 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Parity);
40813 }
40814   assert(VT == MVT::i32 && "Unexpected VT!");
40815
40816 // Xor the high and low 16-bits together using a 32-bit operation.
40817 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, VT, X,
40818 DAG.getConstant(16, DL, MVT::i8));
40819 X = DAG.getNode(ISD::XOR, DL, VT, X, Hi16);
40820
40821 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
40822 // This should allow an h-reg to be used to save a shift.
40823 // FIXME: We only get an h-reg in 32-bit mode.
40824 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
40825 DAG.getNode(ISD::SRL, DL, VT, X,
40826 DAG.getConstant(8, DL, MVT::i8)));
40827 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
40828 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
40829 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
40830
40831 // Copy the inverse of the parity flag into a register with setcc.
40832 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
40833 // Zero extend to original type.
40834 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp);
40835}
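// Illustrative sketch (standalone, not part of this file): a scalar model of
// the parity lowering above, which xor-folds the value down to one byte and
// then reads the parity of that byte (the hardware reads PF via setnp; the
// popcount here is just a stand-in). Assumes GCC/Clang builtins.
#include <cstdint>
#include <cassert>

static unsigned parity32(uint32_t X) {
  X ^= X >> 16;                         // fold high and low halves
  X ^= X >> 8;                          // fold the remaining two bytes
  X &= 0xFF;
  return __builtin_popcount(X) & 1;     // parity of the final byte
}

static void checkParity(uint32_t X) {
  assert(parity32(X) == (__builtin_popcount(X) & 1u));
}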
40836
40837
40838// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
40839 // where C is a mask containing the same number of bits as the setcc and
40840 // where the setcc will freely zero the upper bits of the k-register. We can
40841 // replace the undef in the concat with 0s and remove the AND. This mainly
40842 // helps with v2i1/v4i1 setcc being cast to scalar.
40843static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
40844 const X86Subtarget &Subtarget) {
40845   assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
40846
40847 EVT VT = N->getValueType(0);
40848
40849 // Make sure this is an AND with constant. We will check the value of the
40850 // constant later.
40851 if (!isa<ConstantSDNode>(N->getOperand(1)))
40852 return SDValue();
40853
40854 // This is implied by the ConstantSDNode.
40855   assert(!VT.isVector() && "Expected scalar VT!");
40856
40857 if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
40858 !N->getOperand(0).hasOneUse() ||
40859 !N->getOperand(0).getOperand(0).hasOneUse())
40860 return SDValue();
40861
40862 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40863 SDValue Src = N->getOperand(0).getOperand(0);
40864 EVT SrcVT = Src.getValueType();
40865 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
40866 !TLI.isTypeLegal(SrcVT))
40867 return SDValue();
40868
40869 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
40870 return SDValue();
40871
40872 // We only care about the first subvector of the concat, we expect the
40873 // other subvectors to be ignored due to the AND if we make the change.
40874 SDValue SubVec = Src.getOperand(0);
40875 EVT SubVecVT = SubVec.getValueType();
40876
40877 // First subvector should be a setcc with a legal result type. The RHS of the
40878 // AND should be a mask with this many bits.
40879 if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
40880 !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
40881 return SDValue();
40882
40883 EVT SetccVT = SubVec.getOperand(0).getValueType();
40884 if (!TLI.isTypeLegal(SetccVT) ||
40885 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
40886 return SDValue();
40887
40888 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
40889 return SDValue();
40890
40891 // We passed all the checks. Rebuild the concat_vectors with zeroes
40892 // and cast it back to VT.
40893 SDLoc dl(N);
40894 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
40895 DAG.getConstant(0, dl, SubVecVT));
40896 Ops[0] = SubVec;
40897 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
40898 Ops);
40899 return DAG.getBitcast(VT, Concat);
40900}
40901
40902static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
40903 TargetLowering::DAGCombinerInfo &DCI,
40904 const X86Subtarget &Subtarget) {
40905 EVT VT = N->getValueType(0);
40906
40907 // If this is SSE1 only convert to FAND to avoid scalarization.
40908 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
40909 return DAG.getBitcast(
40910 MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
40911 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
40912 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
40913 }
40914
40915 // Use a 32-bit and+zext if upper bits known zero.
40916 if (VT == MVT::i64 && Subtarget.is64Bit() &&
40917 !isa<ConstantSDNode>(N->getOperand(1))) {
40918 APInt HiMask = APInt::getHighBitsSet(64, 32);
40919 if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
40920 DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
40921 SDLoc dl(N);
40922 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
40923 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
40924 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
40925 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
40926 }
40927 }
40928
40929 // This must be done before legalization has expanded the ctpop.
40930 if (SDValue V = combineParity(N, DAG, Subtarget))
40931 return V;
40932
40933 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
40934 // TODO: Support multiple SrcOps.
40935 if (VT == MVT::i1) {
40936 SmallVector<SDValue, 2> SrcOps;
40937 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
40938 SrcOps.size() == 1) {
40939 SDLoc dl(N);
40940 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40941 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
40942 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
40943 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
40944 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
40945 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
40946 if (Mask) {
40947 APInt AllBits = APInt::getAllOnesValue(NumElts);
40948 return DAG.getSetCC(dl, MVT::i1, Mask,
40949 DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ);
40950 }
40951 }
40952 }
40953
40954 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
40955 return V;
40956
40957 if (DCI.isBeforeLegalizeOps())
40958 return SDValue();
40959
40960 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
40961 return R;
40962
40963 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
40964 return FPLogic;
40965
40966 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
40967 return R;
40968
40969 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
40970 return ShiftRight;
40971
40972 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
40973 return R;
40974
40975 // Attempt to recursively combine a bitmask AND with shuffles.
40976 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
40977 SDValue Op(N, 0);
40978 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
40979 return Res;
40980 }
40981
40982 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
40983 if ((VT.getScalarSizeInBits() % 8) == 0 &&
40984 N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
40985 isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
40986 SDValue BitMask = N->getOperand(1);
40987 SDValue SrcVec = N->getOperand(0).getOperand(0);
40988 EVT SrcVecVT = SrcVec.getValueType();
40989
40990 // Check that the constant bitmask masks whole bytes.
40991 APInt UndefElts;
40992 SmallVector<APInt, 64> EltBits;
40993 if (VT == SrcVecVT.getScalarType() &&
40994 N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
40995 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
40996 llvm::all_of(EltBits, [](APInt M) {
40997 return M.isNullValue() || M.isAllOnesValue();
40998 })) {
40999 unsigned NumElts = SrcVecVT.getVectorNumElements();
41000 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
41001 unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
41002
41003 // Create a root shuffle mask from the byte mask and the extracted index.
41004 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
41005 for (unsigned i = 0; i != Scale; ++i) {
41006 if (UndefElts[i])
41007 continue;
41008 int VecIdx = Scale * Idx + i;
41009 ShuffleMask[VecIdx] =
41010 EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
41011 }
41012
41013 if (SDValue Shuffle = combineX86ShufflesRecursively(
41014 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
41015 /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
41016 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
41017 N->getOperand(0).getOperand(1));
41018 }
41019 }
41020
41021 return SDValue();
41022}
41023
41024// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
41025static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
41026 const X86Subtarget &Subtarget) {
41027   assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
41028
41029 MVT VT = N->getSimpleValueType(0);
41030 if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
41031 return SDValue();
41032
41033 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
41034 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
41035 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
41036 return SDValue();
41037
41038 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
41039 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
41040 bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
41041 Subtarget.hasVLX();
41042 if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
41043 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
41044 return SDValue();
41045
41046 // Attempt to extract constant byte masks.
41047 APInt UndefElts0, UndefElts1;
41048 SmallVector<APInt, 32> EltBits0, EltBits1;
41049 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
41050 false, false))
41051 return SDValue();
41052 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
41053 false, false))
41054 return SDValue();
41055
41056 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
41057 // TODO - add UNDEF elts support.
41058 if (UndefElts0[i] || UndefElts1[i])
41059 return SDValue();
41060 if (EltBits0[i] != ~EltBits1[i])
41061 return SDValue();
41062 }
41063
41064 SDLoc DL(N);
41065 SDValue X = N->getOperand(0);
41066 SDValue Y =
41067 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
41068 DAG.getBitcast(VT, N1.getOperand(0)));
41069 return DAG.getNode(ISD::OR, DL, VT, X, Y);
41070}
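// Illustrative sketch (standalone, not part of this file): the bit-select
// being canonicalized. (X & C) | (Y & ~C) takes each bit from X where C is 1
// and from Y where C is 0; rewriting the second AND as ANDNP(C, Y) lets both
// halves share the single constant C. Helper name is hypothetical.
#include <cstdint>
#include <cassert>

static void checkBitSelect(uint64_t X, uint64_t Y, uint64_t C) {
  uint64_t Sel = (X & C) | (Y & ~C);
  for (unsigned I = 0; I < 64; ++I) {
    uint64_t Bit = 1ull << I;
    assert((Sel & Bit) == ((C & Bit) ? (X & Bit) : (Y & Bit)));
  }
}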
41071
41072// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
41073static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
41074 if (N->getOpcode() != ISD::OR)
41075 return false;
41076
41077 SDValue N0 = N->getOperand(0);
41078 SDValue N1 = N->getOperand(1);
41079
41080 // Canonicalize AND to LHS.
41081 if (N1.getOpcode() == ISD::AND)
41082 std::swap(N0, N1);
41083
41084 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
41085 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
41086 return false;
41087
41088 Mask = N1.getOperand(0);
41089 X = N1.getOperand(1);
41090
41091 // Check to see if the mask appeared in both the AND and ANDNP.
41092 if (N0.getOperand(0) == Mask)
41093 Y = N0.getOperand(1);
41094 else if (N0.getOperand(1) == Mask)
41095 Y = N0.getOperand(0);
41096 else
41097 return false;
41098
41099 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
41100 // ANDNP combine allows other combines to happen that prevent matching.
41101 return true;
41102}
41103
41104// Try to fold:
41105// (or (and (m, y), (pandn m, x)))
41106// into:
41107// (vselect m, x, y)
41108// As a special case, try to fold:
41109// (or (and (m, (sub 0, x)), (pandn m, x)))
41110// into:
41111// (sub (xor X, M), M)
41112static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
41113 const X86Subtarget &Subtarget) {
41114 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
41115
41116 EVT VT = N->getValueType(0);
41117 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
41118 (VT.is256BitVector() && Subtarget.hasInt256())))
41119 return SDValue();
41120
41121 SDValue X, Y, Mask;
41122 if (!matchLogicBlend(N, X, Y, Mask))
41123 return SDValue();
41124
41125 // Validate that X, Y, and Mask are bitcasts, and see through them.
41126 Mask = peekThroughBitcasts(Mask);
41127 X = peekThroughBitcasts(X);
41128 Y = peekThroughBitcasts(Y);
41129
41130 EVT MaskVT = Mask.getValueType();
41131 unsigned EltBits = MaskVT.getScalarSizeInBits();
41132
41133 // TODO: Attempt to handle floating point cases as well?
41134 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
41135 return SDValue();
41136
41137 SDLoc DL(N);
41138
41139 // Attempt to combine to conditional negate: (sub (xor X, M), M)
41140 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
41141 DAG, Subtarget))
41142 return Res;
41143
41144 // PBLENDVB is only available on SSE 4.1.
41145 if (!Subtarget.hasSSE41())
41146 return SDValue();
41147
41148 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
41149
41150 X = DAG.getBitcast(BlendVT, X);
41151 Y = DAG.getBitcast(BlendVT, Y);
41152 Mask = DAG.getBitcast(BlendVT, Mask);
41153 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
41154 return DAG.getBitcast(VT, Mask);
41155}
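// [Editorial sketch, not part of X86ISelLowering.cpp] Per-lane model of the
// blend matched above, assuming each lane of the mask M is either all-zeros
// (0) or all-ones (-1), which is what the ComputeNumSignBits check enforces.
// Helper names are hypothetical.
static inline int logicBlendLane(int M, int X, int Y) {
  return (M & Y) | (~M & X); // == (M ? Y : X), i.e. the vselect form
}
static inline int conditionalNegateLane(int M, int X) {
  return (X ^ M) - M; // == (M ? -X : X), the (sub (xor X, M), M) special case
}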
41156
41157// Helper function for combineOrCmpEqZeroToCtlzSrl
41158// Transforms:
41159// seteq(cmp x, 0)
41160// into:
41161// srl(ctlz x), log2(bitsize(x))
41162// Input pattern is checked by caller.
41163static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
41164 SelectionDAG &DAG) {
41165 SDValue Cmp = Op.getOperand(1);
41166 EVT VT = Cmp.getOperand(0).getValueType();
41167 unsigned Log2b = Log2_32(VT.getSizeInBits());
41168 SDLoc dl(Op);
41169 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
41170 // The result of the shift is true or false, and on X86, the 32-bit
41171 // encoding of shr and lzcnt is more desirable.
41172 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
41173 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
41174 DAG.getConstant(Log2b, dl, MVT::i8));
41175 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
41176}
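// [Editorial sketch, not part of X86ISelLowering.cpp] Scalar model of the
// helper above for a 32-bit value, assuming LZCNT semantics (lzcnt(0) == 32,
// which is what FastLZCNT targets provide): seteq(cmp x, 0) is equivalent to
// srl(ctlz(x), log2(32)). The helper name is hypothetical.
static inline unsigned isZeroViaLzcnt(unsigned X) {
  unsigned Lz = X ? (unsigned)__builtin_clz(X) : 32; // ctlz with lzcnt(0) == 32
  return Lz >> 5; // 1 iff X == 0, 0 otherwise
}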
41177
41178// Try to transform:
41179// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
41180// into:
41181 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
41182// Will also attempt to match more generic cases, eg:
41183// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
41184// Only applies if the target supports the FastLZCNT feature.
41185static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
41186 TargetLowering::DAGCombinerInfo &DCI,
41187 const X86Subtarget &Subtarget) {
41188 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
41189 return SDValue();
41190
41191 auto isORCandidate = [](SDValue N) {
41192 return (N->getOpcode() == ISD::OR && N->hasOneUse());
41193 };
41194
41195 // Check that the zero extend is extending to 32-bit or more. The code generated by
41196 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
41197 // instructions to clear the upper bits.
41198 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
41199 !isORCandidate(N->getOperand(0)))
41200 return SDValue();
41201
41202 // Check the node matches: setcc(eq, cmp 0)
41203 auto isSetCCCandidate = [](SDValue N) {
41204 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
41205 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
41206 N->getOperand(1).getOpcode() == X86ISD::CMP &&
41207 isNullConstant(N->getOperand(1).getOperand(1)) &&
41208 N->getOperand(1).getValueType().bitsGE(MVT::i32);
41209 };
41210
41211 SDNode *OR = N->getOperand(0).getNode();
41212 SDValue LHS = OR->getOperand(0);
41213 SDValue RHS = OR->getOperand(1);
41214
41215 // Save nodes matching or(or, setcc(eq, cmp 0)).
41216 SmallVector<SDNode *, 2> ORNodes;
41217 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
41218 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
41219 ORNodes.push_back(OR);
41220 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
41221 LHS = OR->getOperand(0);
41222 RHS = OR->getOperand(1);
41223 }
41224
41225 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
41226 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
41227 !isORCandidate(SDValue(OR, 0)))
41228 return SDValue();
41229
41230 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
41231 // to
41232 // or(srl(ctlz),srl(ctlz)).
41233 // The dag combiner can then fold it into:
41234 // srl(or(ctlz, ctlz)).
41235 EVT VT = OR->getValueType(0);
41236 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
41237 SDValue Ret, NewRHS;
41238 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
41239 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
41240
41241 if (!Ret)
41242 return SDValue();
41243
41244 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
41245 while (ORNodes.size() > 0) {
41246 OR = ORNodes.pop_back_val();
41247 LHS = OR->getOperand(0);
41248 RHS = OR->getOperand(1);
41249 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
41250 if (RHS->getOpcode() == ISD::OR)
41251 std::swap(LHS, RHS);
41252 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
41253 if (!NewRHS)
41254 return SDValue();
41255 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
41256 }
41257
41258 if (Ret)
41259 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
41260
41261 return Ret;
41262}
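// [Editorial sketch, not part of X86ISelLowering.cpp] Scalar model of the
// whole transform above for two 32-bit values, again assuming lzcnt(0) == 32:
// the OR of the two equal-to-zero tests becomes a single shift of the OR'd
// ctlz results. The helper name is hypothetical.
static inline unsigned anyIsZeroViaLzcnt(unsigned X, unsigned Y) {
  unsigned LzX = X ? (unsigned)__builtin_clz(X) : 32;
  unsigned LzY = Y ? (unsigned)__builtin_clz(Y) : 32;
  return (LzX | LzY) >> 5; // == (unsigned)(X == 0 || Y == 0)
}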
41263
41264static SDValue combineOrShiftToFunnelShift(SDNode *N, SelectionDAG &DAG,
41265 const X86Subtarget &Subtarget) {
41266 assert(N->getOpcode() == ISD::OR && "Expected ISD::OR node");
41267 SDValue N0 = N->getOperand(0);
41268 SDValue N1 = N->getOperand(1);
41269 EVT VT = N->getValueType(0);
41270 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41271
41272 if (!TLI.isOperationLegalOrCustom(ISD::FSHL, VT) ||
41273 !TLI.isOperationLegalOrCustom(ISD::FSHR, VT))
41274 return SDValue();
41275
41276 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
41277 bool OptForSize = DAG.shouldOptForSize();
41278 unsigned Bits = VT.getScalarSizeInBits();
41279
41280 // SHLD/SHRD instructions have lower register pressure, but on some
41281 // platforms they have higher latency than the equivalent
41282 // series of shifts/or that would otherwise be generated.
41283 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
41284 // have higher latencies and we are not optimizing for size.
41285 if (!OptForSize && Subtarget.isSHLDSlow())
41286 return SDValue();
41287
41288 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
41289 std::swap(N0, N1);
41290 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
41291 return SDValue();
41292 if (!N0.hasOneUse() || !N1.hasOneUse())
41293 return SDValue();
41294
41295 EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
41296
41297 SDValue ShAmt0 = N0.getOperand(1);
41298 if (ShAmt0.getValueType() != ShiftVT)
41299 return SDValue();
41300 SDValue ShAmt1 = N1.getOperand(1);
41301 if (ShAmt1.getValueType() != ShiftVT)
41302 return SDValue();
41303
41304 // Peek through any modulo shift masks.
41305 SDValue ShMsk0;
41306 if (ShAmt0.getOpcode() == ISD::AND &&
41307 isa<ConstantSDNode>(ShAmt0.getOperand(1)) &&
41308 ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) {
41309 ShMsk0 = ShAmt0;
41310 ShAmt0 = ShAmt0.getOperand(0);
41311 }
41312 SDValue ShMsk1;
41313 if (ShAmt1.getOpcode() == ISD::AND &&
41314 isa<ConstantSDNode>(ShAmt1.getOperand(1)) &&
41315 ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) {
41316 ShMsk1 = ShAmt1;
41317 ShAmt1 = ShAmt1.getOperand(0);
41318 }
41319
41320 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
41321 ShAmt0 = ShAmt0.getOperand(0);
41322 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
41323 ShAmt1 = ShAmt1.getOperand(0);
41324
41325 SDLoc DL(N);
41326 unsigned Opc = ISD::FSHL;
41327 SDValue Op0 = N0.getOperand(0);
41328 SDValue Op1 = N1.getOperand(0);
41329 if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) {
41330 Opc = ISD::FSHR;
41331 std::swap(Op0, Op1);
41332 std::swap(ShAmt0, ShAmt1);
41333 std::swap(ShMsk0, ShMsk1);
41334 }
41335
41336 auto GetFunnelShift = [&DAG, &DL, VT, Opc, &ShiftVT](SDValue Op0, SDValue Op1,
41337 SDValue Amt) {
41338 if (Opc == ISD::FSHR)
41339 std::swap(Op0, Op1);
41340 return DAG.getNode(Opc, DL, VT, Op0, Op1,
41341 DAG.getNode(ISD::TRUNCATE, DL, ShiftVT, Amt));
41342 };
41343
41344 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C )
41345 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C )
41346 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C )
41347 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C )
41348 // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C )
41349 // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C )
41350 if (ShAmt1.getOpcode() == ISD::SUB) {
41351 SDValue Sum = ShAmt1.getOperand(0);
41352 if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) {
41353 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
41354 if (ShAmt1Op1.getOpcode() == ISD::AND &&
41355 isa<ConstantSDNode>(ShAmt1Op1.getOperand(1)) &&
41356 ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) {
41357 ShMsk1 = ShAmt1Op1;
41358 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
41359 }
41360 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
41361 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
41362 if ((SumC->getAPIntValue() == Bits ||
41363 (SumC->getAPIntValue() == 0 && ShMsk1)) &&
41364 ShAmt1Op1 == ShAmt0)
41365 return GetFunnelShift(Op0, Op1, ShAmt0);
41366 }
41367 } else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
41368 auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
41369 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
41370 return GetFunnelShift(Op0, Op1, ShAmt0);
41371 } else if (ShAmt1.getOpcode() == ISD::XOR) {
41372 SDValue Mask = ShAmt1.getOperand(1);
41373 if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
41374 unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL);
41375 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
41376 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
41377 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
41378 if (MaskC->getSExtValue() == (Bits - 1) &&
41379 (ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
41380 if (Op1.getOpcode() == InnerShift &&
41381 isa<ConstantSDNode>(Op1.getOperand(1)) &&
41382 Op1.getConstantOperandAPInt(1).isOneValue()) {
41383 return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
41384 }
41385 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
41386 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
41387 Op1.getOperand(0) == Op1.getOperand(1)) {
41388 return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
41389 }
41390 }
41391 }
41392 }
41393
41394 return SDValue();
41395}
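// [Editorial sketch, not part of X86ISelLowering.cpp] Scalar model of the
// 32-bit funnel-shift forms matched above, assuming 0 < C < 32 so neither
// shift amount is the full bit width. Helper names are hypothetical.
static inline unsigned fshlReference(unsigned X, unsigned Y, unsigned C) {
  return (X << C) | (Y >> (32 - C)); // ISD::FSHL(X, Y, C)
}
static inline unsigned fshrReference(unsigned X, unsigned Y, unsigned C) {
  return (X << (32 - C)) | (Y >> C); // ISD::FSHR(X, Y, C)
}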
41396
41397static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
41398 TargetLowering::DAGCombinerInfo &DCI,
41399 const X86Subtarget &Subtarget) {
41400 SDValue N0 = N->getOperand(0);
41401 SDValue N1 = N->getOperand(1);
41402 EVT VT = N->getValueType(0);
41403
41404 // If this is SSE1 only convert to FOR to avoid scalarization.
41405 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
41406 return DAG.getBitcast(MVT::v4i32,
41407 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
41408 DAG.getBitcast(MVT::v4f32, N0),
41409 DAG.getBitcast(MVT::v4f32, N1)));
41410 }
41411
41412 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
41413 // TODO: Support multiple SrcOps.
41414 if (VT == MVT::i1) {
41415 SmallVector<SDValue, 2> SrcOps;
41416 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) &&
41417 SrcOps.size() == 1) {
41418 SDLoc dl(N);
41419 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41420 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
41421 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
41422 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
41423 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
41424 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
41425 if (Mask) {
41426 APInt AllBits = APInt::getNullValue(NumElts);
41427 return DAG.getSetCC(dl, MVT::i1, Mask,
41428 DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE);
41429 }
41430 }
41431 }
41432
41433 if (DCI.isBeforeLegalizeOps())
41434 return SDValue();
41435
41436 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
41437 return R;
41438
41439 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
41440 return FPLogic;
41441
41442 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
41443 return R;
41444
41445 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
41446 return R;
41447
41448 if (SDValue R = combineOrShiftToFunnelShift(N, DAG, Subtarget))
41449 return R;
41450
41451 // Attempt to recursively combine an OR of shuffles.
41452 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
41453 SDValue Op(N, 0);
41454 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
41455 return Res;
41456 }
41457
41458 return SDValue();
41459}
41460
41461/// Try to turn tests against the signbit in the form of:
41462/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
41463/// into:
41464/// SETGT(X, -1)
41465static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
41466 // This is only worth doing if the output type is i8 or i1.
41467 EVT ResultType = N->getValueType(0);
41468 if (ResultType != MVT::i8 && ResultType != MVT::i1)
41469 return SDValue();
41470
41471 SDValue N0 = N->getOperand(0);
41472 SDValue N1 = N->getOperand(1);
41473
41474 // We should be performing an xor against a truncated shift.
41475 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
41476 return SDValue();
41477
41478 // Make sure we are performing an xor against one.
41479 if (!isOneConstant(N1))
41480 return SDValue();
41481
41482 // SetCC on x86 zero extends so only act on this if it's a logical shift.
41483 SDValue Shift = N0.getOperand(0);
41484 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
41485 return SDValue();
41486
41487 // Make sure we are truncating from one of i16, i32 or i64.
41488 EVT ShiftTy = Shift.getValueType();
41489 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
41490 return SDValue();
41491
41492 // Make sure the shift amount extracts the sign bit.
41493 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
41494 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
41495 return SDValue();
41496
41497 // Create a greater-than comparison against -1.
41498 // N.B. Using SETGE against 0 works but we want a canonical looking
41499 // comparison; using SETGT matches up with what TranslateX86CC expects.
41500 SDLoc DL(N);
41501 SDValue ShiftOp = Shift.getOperand(0);
41502 EVT ShiftOpTy = ShiftOp.getValueType();
41503 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41504 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
41505 *DAG.getContext(), ResultType);
41506 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
41507 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
41508 if (SetCCResultType != ResultType)
41509 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
41510 return Cond;
41511}
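// [Editorial sketch, not part of X86ISelLowering.cpp] Scalar model of the fold
// above for a 32-bit source: xor(trunc(srl(X, 31)), 1) and setgt(X, -1) compute
// the same "sign bit clear" predicate. Helper names are hypothetical.
static inline bool signTestShiftForm(int X) {
  return (((unsigned)X >> 31) ^ 1u) != 0; // original XOR(TRUNCATE(SRL(...)), 1)
}
static inline bool signTestCmpForm(int X) {
  return X > -1; // folded SETGT(X, -1); equal to signTestShiftForm for all X
}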
41512
41513/// Turn vector tests of the signbit in the form of:
41514/// xor (sra X, elt_size(X)-1), -1
41515/// into:
41516/// pcmpgt X, -1
41517///
41518/// This should be called before type legalization because the pattern may not
41519/// persist after that.
41520static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
41521 const X86Subtarget &Subtarget) {
41522 EVT VT = N->getValueType(0);
41523 if (!VT.isSimple())
41524 return SDValue();
41525
41526 switch (VT.getSimpleVT().SimpleTy) {
41527 default: return SDValue();
41528 case MVT::v16i8:
41529 case MVT::v8i16:
41530 case MVT::v4i32:
41531 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
41532 case MVT::v32i8:
41533 case MVT::v16i16:
41534 case MVT::v8i32:
41535 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
41536 }
41537
41538 // There must be a shift right algebraic before the xor, and the xor must be a
41539 // 'not' operation.
41540 SDValue Shift = N->getOperand(0);
41541 SDValue Ones = N->getOperand(1);
41542 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
41543 !ISD::isBuildVectorAllOnes(Ones.getNode()))
41544 return SDValue();
41545
41546 // The shift should be smearing the sign bit across each vector element.
41547 auto *ShiftAmt =
41548 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
41549 if (!ShiftAmt ||
41550 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
41551 return SDValue();
41552
41553 // Create a greater-than comparison against -1. We don't use the more obvious
41554 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
41555 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
41556}
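// [Editorial sketch, not part of X86ISelLowering.cpp] Per-lane model of the
// vector fold above, assuming arithmetic right shift of a signed int (as on
// x86): xor(sra(X, 31), -1) and pcmpgt(X, -1) produce the same all-ones /
// all-zeros lane. Helper names are hypothetical.
static inline int signTestSraLane(int X) {
  return ~(X >> 31); // -1 when X >= 0, 0 when X < 0
}
static inline int signTestPcmpgtLane(int X) {
  return (X > -1) ? -1 : 0; // identical result per lane
}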
41557
41558/// Detect patterns of truncation with unsigned saturation:
41559///
41560/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
41561/// Return the source value x to be truncated or SDValue() if the pattern was
41562/// not matched.
41563///
41564/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
41565/// where C1 >= 0 and C2 is unsigned max of destination type.
41566///
41567/// (truncate (smax (smin (x, C2), C1)) to dest_type)
41568/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
41569///
41570/// These two patterns are equivalent to:
41571/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
41572/// So return the smax(x, C1) value to be truncated or SDValue() if the
41573/// pattern was not matched.
41574static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
41575 const SDLoc &DL) {
41576 EVT InVT = In.getValueType();
41577
41578 // Saturation with truncation. We truncate from InVT to VT.
41579 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
41580 "Unexpected types for truncate operation");
41581
41582 // Match min/max and return limit value as a parameter.
41583 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
41584 if (V.getOpcode() == Opcode &&
41585 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
41586 return V.getOperand(0);
41587 return SDValue();
41588 };
41589
41590 APInt C1, C2;
41591 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
41592 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
41593 // the element size of the destination type.
41594 if (C2.isMask(VT.getScalarSizeInBits()))
41595 return UMin;
41596
41597 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
41598 if (MatchMinMax(SMin, ISD::SMAX, C1))
41599 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
41600 return SMin;
41601
41602 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
41603 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
41604 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
41605 C2.uge(C1)) {
41606 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
41607 }
41608
41609 return SDValue();
41610}
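// [Editorial sketch, not part of X86ISelLowering.cpp] Scalar model of the
// unsigned-saturation patterns above for an i32 -> i8 truncate: clamp to
// [0, UINT8_MAX] with smax/smin (equivalently a umin against 255), then
// truncate. The helper name is hypothetical.
static inline unsigned char usatTruncReference(int X) {
  int Lo = X < 0 ? 0 : X;       // smax(x, C1) with C1 = 0
  int Hi = Lo > 255 ? 255 : Lo; // smin(.., C2) with C2 = UINT8_MAX
  return (unsigned char)Hi;     // the truncate to the destination type
}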
41611
41612/// Detect patterns of truncation with signed saturation:
41613/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
41614/// signed_max_of_dest_type)) to dest_type)
41615/// or:
41616/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
41617/// signed_min_of_dest_type)) to dest_type).
41618/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
41619/// Return the source value to be truncated or SDValue() if the pattern was not
41620/// matched.
41621static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
41622 unsigned NumDstBits = VT.getScalarSizeInBits();
41623 unsigned NumSrcBits = In.getScalarValueSizeInBits();
41624 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
41625
41626 auto MatchMinMax = [](SDValue V, unsigned Opcode,
41627 const APInt &Limit) -> SDValue {
41628 APInt C;
41629 if (V.getOpcode() == Opcode &&
41630 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
41631 return V.getOperand(0);
41632 return SDValue();
41633 };
41634
41635 APInt SignedMax, SignedMin;
41636 if (MatchPackUS) {
41637 SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
41638 SignedMin = APInt(NumSrcBits, 0);
41639 } else {
41640 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
41641 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
41642 }
41643
41644 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
41645 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
41646 return SMax;
41647
41648 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
41649 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
41650 return SMin;
41651
41652 return SDValue();
41653}
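// [Editorial sketch, not part of X86ISelLowering.cpp] Scalar model of the
// signed-saturation pattern above for an i32 -> i8 truncate: clamp to
// [INT8_MIN, INT8_MAX], then truncate. The helper name is hypothetical.
static inline signed char ssatTruncReference(int X) {
  int Lo = X < -128 ? -128 : X; // smax(x, signed_min_of_dest_type)
  int Hi = Lo > 127 ? 127 : Lo; // smin(.., signed_max_of_dest_type)
  return (signed char)Hi;       // the truncate to the destination type
}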
41654
41655static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
41656 SelectionDAG &DAG,
41657 const X86Subtarget &Subtarget) {
41658 if (!Subtarget.hasSSE2() || !VT.isVector())
41659 return SDValue();
41660
41661 EVT SVT = VT.getVectorElementType();
41662 EVT InVT = In.getValueType();
41663 EVT InSVT = InVT.getVectorElementType();
41664
41665 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
41666 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
41667 // and concatenate at the same time. Then we can use a final vpmovuswb to
41668 // clip to 0-255.
41669 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
41670 InVT == MVT::v16i32 && VT == MVT::v16i8) {
41671 if (auto USatVal = detectSSatPattern(In, VT, true)) {
41672 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
41673 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
41674 DL, DAG, Subtarget);
41675 assert(Mid && "Failed to pack!");
41676 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
41677 }
41678 }
41679
41680 // vXi32 truncate instructions are available with AVX512F.
41681 // vXi16 truncate instructions are only available with AVX512BW.
41682 // For 256-bit or smaller vectors, we require VLX.
41683 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
41684 // If the result type is 256 bits or larger and we have disabled 512-bit
41685 // registers, we should go ahead and use the pack instructions if possible.
41686 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
41687 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
41688 (InVT.getSizeInBits() > 128) &&
41689 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
41690 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
41691
41692 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
41693 VT.getSizeInBits() >= 64 &&
41694 (SVT == MVT::i8 || SVT == MVT::i16) &&
41695 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
41696 if (auto USatVal = detectSSatPattern(In, VT, true)) {
41697 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
41698 // Only do this when the result is at least 64 bits or we'll be leaving
41699 // dangling PACKSSDW nodes.
41700 if (SVT == MVT::i8 && InSVT == MVT::i32) {
41701 EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
41702 VT.getVectorNumElements());
41703 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
41704 DAG, Subtarget);
41705 assert(Mid && "Failed to pack!");
41706 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
41707 Subtarget);
41708 assert(V && "Failed to pack!");
41709 return V;
41710 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
41711 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
41712 Subtarget);
41713 }
41714 if (auto SSatVal = detectSSatPattern(In, VT))
41715 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
41716 Subtarget);
41717 }
41718
41719 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41720 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
41721 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
41722 unsigned TruncOpc = 0;
41723 SDValue SatVal;
41724 if (auto SSatVal = detectSSatPattern(In, VT)) {
41725 SatVal = SSatVal;
41726 TruncOpc = X86ISD::VTRUNCS;
41727 } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
41728 SatVal = USatVal;
41729 TruncOpc = X86ISD::VTRUNCUS;
41730 }
41731 if (SatVal) {
41732 unsigned ResElts = VT.getVectorNumElements();
41733 // If the input type is less than 512 bits and we don't have VLX, we need
41734 // to widen to 512 bits.
41735 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
41736 unsigned NumConcats = 512 / InVT.getSizeInBits();
41737 ResElts *= NumConcats;
41738 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
41739 ConcatOps[0] = SatVal;
41740 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
41741 NumConcats * InVT.getVectorNumElements());
41742 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
41743 }
41744 // Widen the result if it's narrower than 128 bits.
41745 if (ResElts * SVT.getSizeInBits() < 128)
41746 ResElts = 128 / SVT.getSizeInBits();
41747 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
41748 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
41749 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
41750 DAG.getIntPtrConstant(0, DL));
41751 }
41752 }
41753
41754 return SDValue();
41755}
41756
41757/// This function detects the AVG pattern between vectors of unsigned i8/i16,
41758 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
41759/// X86ISD::AVG instruction.
41760static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
41761 const X86Subtarget &Subtarget,
41762 const SDLoc &DL) {
41763 if (!VT.isVector())
41764 return SDValue();
41765 EVT InVT = In.getValueType();
41766 unsigned NumElems = VT.getVectorNumElements();
41767
41768 EVT ScalarVT = VT.getVectorElementType();
41769 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
41770 NumElems >= 2 && isPowerOf2_32(NumElems)))
41771 return SDValue();
41772
41773 // InScalarVT is the intermediate type in the AVG pattern and it should be wider
41774 // than the original input type (i8/i16).
41775 EVT InScalarVT = InVT.getVectorElementType();
41776 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
41777 return SDValue();
41778
41779 if (!Subtarget.hasSSE2())
41780 return SDValue();
41781
41782 // Detect the following pattern:
41783 //
41784 // %1 = zext <N x i8> %a to <N x i32>
41785 // %2 = zext <N x i8> %b to <N x i32>
41786 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
41787 // %4 = add nuw nsw <N x i32> %3, %2
41788 // %5 = lshr <N x i32> %4, <i32 1 x N>
41789 // %6 = trunc <N x i32> %5 to <N x i8>
41790 //
41791 // In AVX512, the last instruction can also be a trunc store.
41792 if (In.getOpcode() != ISD::SRL)
41793 return SDValue();
41794
41795 // A lambda checking the given SDValue is a constant vector and each element
41796 // is in the range [Min, Max].
41797 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
41798 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
41799 if (!BV || !BV->isConstant())
41800 return false;
41801 for (SDValue Op : V->ops()) {
41802 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
41803 if (!C)
41804 return false;
41805 const APInt &Val = C->getAPIntValue();
41806 if (Val.ult(Min) || Val.ugt(Max))
41807 return false;
41808 }
41809 return true;
41810 };
41811
41812 // Check if each element of the vector is right-shifted by one.
41813 auto LHS = In.getOperand(0);
41814 auto RHS = In.getOperand(1);
41815 if (!IsConstVectorInRange(RHS, 1, 1))
41816 return SDValue();
41817 if (LHS.getOpcode() != ISD::ADD)
41818 return SDValue();
41819
41820 // Detect a pattern of a + b + 1 where the order doesn't matter.
41821 SDValue Operands[3];
41822 Operands[0] = LHS.getOperand(0);
41823 Operands[1] = LHS.getOperand(1);
41824
41825 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
41826 ArrayRef<SDValue> Ops) {
41827 return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
41828 };
41829
41830 // Take care of the case when one of the operands is a constant vector whose
41831 // element is in the range [1, 256].
41832 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
41833 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
41834 Operands[0].getOperand(0).getValueType() == VT) {
41835 // The pattern is detected. Subtract one from the constant vector, then
41836 // demote it and emit X86ISD::AVG instruction.
41837 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
41838 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
41839 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
41840 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
41841 { Operands[0].getOperand(0), Operands[1] },
41842 AVGBuilder);
41843 }
41844
41845 // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
41846 // Match the or case only if it's 'add-like', i.e. it can be replaced by an add.
41847 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
41848 if (ISD::ADD == V.getOpcode()) {
41849 Op0 = V.getOperand(0);
41850 Op1 = V.getOperand(1);
41851 return true;
41852 }
41853 if (ISD::ZERO_EXTEND != V.getOpcode())
41854 return false;
41855 V = V.getOperand(0);
41856 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
41857 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
41858 return false;
41859 Op0 = V.getOperand(0);
41860 Op1 = V.getOperand(1);
41861 return true;
41862 };
41863
41864 SDValue Op0, Op1;
41865 if (FindAddLike(Operands[0], Op0, Op1))
41866 std::swap(Operands[0], Operands[1]);
41867 else if (!FindAddLike(Operands[1], Op0, Op1))
41868 return SDValue();
41869 Operands[2] = Op0;
41870 Operands[1] = Op1;
41871
41872 // Now we have three operands of two additions. Check that one of them is a
41873 // constant vector with ones, and the other two can be promoted from i8/i16.
41874 for (int i = 0; i < 3; ++i) {
41875 if (!IsConstVectorInRange(Operands[i], 1, 1))
41876 continue;
41877 std::swap(Operands[i], Operands[2]);
41878
41879 // Check if Operands[0] and Operands[1] are results of type promotion.
41880 for (int j = 0; j < 2; ++j)
41881 if (Operands[j].getValueType() != VT) {
41882 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
41883 Operands[j].getOperand(0).getValueType() != VT)
41884 return SDValue();
41885 Operands[j] = Operands[j].getOperand(0);
41886 }
41887
41888 // The pattern is detected, emit X86ISD::AVG instruction(s).
41889 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]},
41890 AVGBuilder);
41891 }
41892
41893 return SDValue();
41894}
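// [Editorial sketch, not part of X86ISelLowering.cpp] Scalar model of the AVG
// pattern above for two u8 values: the rounding average computed in a wider
// type, which X86ISD::AVG (PAVGB/PAVGW) produces without the explicit
// widening. The helper name is hypothetical.
static inline unsigned char avgReference(unsigned char A, unsigned char B) {
  unsigned Wide = (unsigned)A + (unsigned)B + 1; // zext, add, add 1
  return (unsigned char)(Wide >> 1);             // lshr by 1, then trunc
}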
41895
41896static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
41897 TargetLowering::DAGCombinerInfo &DCI,
41898 const X86Subtarget &Subtarget) {
41899 LoadSDNode *Ld = cast<LoadSDNode>(N);
41900 EVT RegVT = Ld->getValueType(0);
41901 EVT MemVT = Ld->getMemoryVT();
41902 SDLoc dl(Ld);
41903 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41904
41905 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
41906 // into two 16-byte operations. Also split non-temporal aligned loads on
41907 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
41908 ISD::LoadExtType Ext = Ld->getExtensionType();
41909 bool Fast;
41910 unsigned Alignment = Ld->getAlignment();
41911 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
41912 Ext == ISD::NON_EXTLOAD &&
41913 ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
41914 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
41915 *Ld->getMemOperand(), &Fast) &&
41916 !Fast))) {
41917 unsigned NumElems = RegVT.getVectorNumElements();
41918 if (NumElems < 2)
41919 return SDValue();
41920
41921 unsigned HalfAlign = 16;
41922 SDValue Ptr1 = Ld->getBasePtr();
41923 SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl);
41924 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
41925 NumElems / 2);
41926 SDValue Load1 =
41927 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
41928 Alignment, Ld->getMemOperand()->getFlags());
41929 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
41930 Ld->getPointerInfo().getWithOffset(HalfAlign),
41931 MinAlign(Alignment, HalfAlign),
41932 Ld->getMemOperand()->getFlags());
41933 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
41934 Load1.getValue(1), Load2.getValue(1));
41935
41936 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
41937 return DCI.CombineTo(N, NewVec, TF, true);
41938 }
41939
41940 // Bool vector load - attempt to cast to an integer, as we have good
41941 // (vXiY *ext(vXi1 bitcast(iX))) handling.
41942 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
41943 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
41944 unsigned NumElts = RegVT.getVectorNumElements();
41945 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
41946 if (TLI.isTypeLegal(IntVT)) {
41947 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
41948 Ld->getPointerInfo(), Alignment,
41949 Ld->getMemOperand()->getFlags());
41950 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
41951 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
41952 }
41953 }
41954
41955 return SDValue();
41956}
41957
41958/// If V is a build vector of boolean constants and exactly one of those
41959/// constants is true, return the operand index of that true element.
41960/// Otherwise, return -1.
41961static int getOneTrueElt(SDValue V) {
41962 // This needs to be a build vector of booleans.
41963 // TODO: Checking for the i1 type matches the IR definition for the mask,
41964 // but the mask check could be loosened to i8 or other types. That might
41965 // also require checking more than 'allOnesValue'; eg, the x86 HW
41966 // instructions only require that the MSB is set for each mask element.
41967 // The ISD::MSTORE comments/definition do not specify how the mask operand
41968 // is formatted.
41969 auto *BV = dyn_cast<BuildVectorSDNode>(V);
41970 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
41971 return -1;
41972
41973 int TrueIndex = -1;
41974 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
41975 for (unsigned i = 0; i < NumElts; ++i) {
41976 const SDValue &Op = BV->getOperand(i);
41977 if (Op.isUndef())
41978 continue;
41979 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
41980 if (!ConstNode)
41981 return -1;
41982 if (ConstNode->getAPIntValue().isAllOnesValue()) {
41983 // If we already found a one, this is too many.
41984 if (TrueIndex >= 0)
41985 return -1;
41986 TrueIndex = i;
41987 }
41988 }
41989 return TrueIndex;
41990}
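// [Editorial sketch, not part of X86ISelLowering.cpp] Plain-array model of the
// helper above: return the index of the single true mask element, or -1 if
// there is none or more than one. The helper name is hypothetical.
static inline int oneTrueEltReference(const bool *Mask, unsigned NumElts) {
  int TrueIndex = -1;
  for (unsigned I = 0; I != NumElts; ++I) {
    if (!Mask[I])
      continue;
    if (TrueIndex >= 0)
      return -1; // more than one true element
    TrueIndex = (int)I;
  }
  return TrueIndex;
}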
41991
41992/// Given a masked memory load/store operation, return true if it has one mask
41993/// bit set. If it has one mask bit set, then also return the memory address of
41994/// the scalar element to load/store, the vector index to insert/extract that
41995/// scalar element, and the alignment for the scalar memory access.
41996static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
41997 SelectionDAG &DAG, SDValue &Addr,
41998 SDValue &Index, unsigned &Alignment) {
41999 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
42000 if (TrueMaskElt < 0)
42001 return false;
42002
42003 // Get the address of the one scalar element that is specified by the mask
42004 // using the appropriate offset from the base pointer.
42005 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
42006 Addr = MaskedOp->getBasePtr();
42007 if (TrueMaskElt != 0) {
42008 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
42009 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
42010 }
42011
42012 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
42013 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
42014 return true;
42015}
42016
42017/// If exactly one element of the mask is set for a non-extending masked load,
42018/// it is a scalar load and vector insert.
42019/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
42020/// mask have already been optimized in IR, so we don't bother with those here.
42021static SDValue
42022reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
42023 TargetLowering::DAGCombinerInfo &DCI) {
42024 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
42025 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
42026 // However, some target hooks may need to be added to know when the transform
42027 // is profitable. Endianness would also have to be considered.
42028
42029 SDValue Addr, VecIndex;
42030 unsigned Alignment;
42031 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
42032 return SDValue();
42033
42034 // Load the one scalar element that is specified by the mask using the
42035 // appropriate offset from the base pointer.
42036 SDLoc DL(ML);
42037 EVT VT = ML->getValueType(0);
42038 EVT EltVT = VT.getVectorElementType();
42039 SDValue Load =
42040 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
42041 Alignment, ML->getMemOperand()->getFlags());
42042
42043 // Insert the loaded element into the appropriate place in the vector.
42044 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
42045 ML->getPassThru(), Load, VecIndex);
42046 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
42047}
42048
42049static SDValue
42050combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
42051 TargetLowering::DAGCombinerInfo &DCI) {
42052 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
42053 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
42054 return SDValue();
42055
42056 SDLoc DL(ML);
42057 EVT VT = ML->getValueType(0);
42058
42059 // If we are loading the first and last elements of a vector, it is safe and
42060 // always faster to load the whole vector. Replace the masked load with a
42061 // vector load and select.
42062 unsigned NumElts = VT.getVectorNumElements();
42063 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
42064 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
42065 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
42066 if (LoadFirstElt && LoadLastElt) {
42067 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
42068 ML->getMemOperand());
42069 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
42070 ML->getPassThru());
42071 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
42072 }
42073
42074 // Convert a masked load with a constant mask into a masked load and a select.
42075 // This allows the select operation to use a faster kind of select instruction
42076 // (for example, vblendvps -> vblendps).
42077
42078 // Don't try this if the pass-through operand is already undefined. That would
42079 // cause an infinite loop because that's what we're about to create.
42080 if (ML->getPassThru().isUndef())
42081 return SDValue();
42082
42083 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
42084 return SDValue();
42085
42086 // The new masked load has an undef pass-through operand. The select uses the
42087 // original pass-through operand.
42088 SDValue NewML = DAG.getMaskedLoad(
42089 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
42090 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
42091 ML->getAddressingMode(), ML->getExtensionType());
42092 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
42093 ML->getPassThru());
42094
42095 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
42096}
42097
42098static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
42099 TargetLowering::DAGCombinerInfo &DCI,
42100 const X86Subtarget &Subtarget) {
42101 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
42102
42103 // TODO: Expanding load with constant mask may be optimized as well.
42104 if (Mld->isExpandingLoad())
42105 return SDValue();
42106
42107 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
42108 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
42109 return ScalarLoad;
42110 // TODO: Do some AVX512 subsets benefit from this transform?
42111 if (!Subtarget.hasAVX512())
42112 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
42113 return Blend;
42114 }
42115
42116 return SDValue();
42117}
42118
42119/// If exactly one element of the mask is set for a non-truncating masked store,
42120/// it is a vector extract and scalar store.
42121/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
42122/// mask have already been optimized in IR, so we don't bother with those here.
42123static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
42124 SelectionDAG &DAG) {
42125 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
42126 // However, some target hooks may need to be added to know when the transform
42127 // is profitable. Endianness would also have to be considered.
42128
42129 SDValue Addr, VecIndex;
42130 unsigned Alignment;
42131 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
42132 return SDValue();
42133
42134 // Extract the one scalar element that is actually being stored.
42135 SDLoc DL(MS);
42136 EVT VT = MS->getValue().getValueType();
42137 EVT EltVT = VT.getVectorElementType();
42138 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
42139 MS->getValue(), VecIndex);
42140
42141 // Store that element at the appropriate offset from the base pointer.
42142 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
42143 Alignment, MS->getMemOperand()->getFlags());
42144}
42145
42146static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
42147 TargetLowering::DAGCombinerInfo &DCI,
42148 const X86Subtarget &Subtarget) {
42149 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
42150 if (Mst->isCompressingStore())
42151 return SDValue();
42152
42153 EVT VT = Mst->getValue().getValueType();
42154 SDLoc dl(Mst);
42155 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42156
42157 if (Mst->isTruncatingStore())
42158 return SDValue();
42159
42160 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
42161 return ScalarStore;
42162
42163 // If the mask value has been legalized to a non-boolean vector, try to
42164 // simplify ops leading up to it. We only demand the MSB of each lane.
42165 SDValue Mask = Mst->getMask();
42166 if (Mask.getScalarValueSizeInBits() != 1) {
42167 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
42168 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
42169 DCI.AddToWorklist(N);
42170 return SDValue(N, 0);
42171 }
42172 if (SDValue NewMask =
42173 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
42174 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
42175 Mst->getBasePtr(), Mst->getOffset(), NewMask,
42176 Mst->getMemoryVT(), Mst->getMemOperand(),
42177 Mst->getAddressingMode());
42178 }
42179
42180 SDValue Value = Mst->getValue();
42181 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
42182 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
42183 Mst->getMemoryVT())) {
42184 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
42185 Mst->getBasePtr(), Mst->getOffset(), Mask,
42186 Mst->getMemoryVT(), Mst->getMemOperand(),
42187 Mst->getAddressingMode(), true);
42188 }
42189
42190 return SDValue();
42191}
42192
42193static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
42194 TargetLowering::DAGCombinerInfo &DCI,
42195 const X86Subtarget &Subtarget) {
42196 StoreSDNode *St = cast<StoreSDNode>(N);
42197 EVT StVT = St->getMemoryVT();
42198 SDLoc dl(St);
42199 unsigned Alignment = St->getAlignment();
42200 SDValue StoredVal = St->getValue();
42201 EVT VT = StoredVal.getValueType();
42202 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42203
42204 // Convert a store of vXi1 into a store of iX and a bitcast.
42205 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
42206 VT.getVectorElementType() == MVT::i1) {
42207
42208 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
42209 StoredVal = DAG.getBitcast(NewVT, StoredVal);
42210
42211 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
42212 St->getPointerInfo(), St->getAlignment(),
42213 St->getMemOperand()->getFlags());
42214 }
42215
42216 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
42217 // This will avoid a copy to k-register.
42218 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
42219 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42220 StoredVal.getOperand(0).getValueType() == MVT::i8) {
42221 return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
42222 St->getBasePtr(), St->getPointerInfo(),
42223 St->getAlignment(), St->getMemOperand()->getFlags());
42224 }
42225
42226 // Widen v2i1/v4i1 stores to v8i1.
42227 if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
42228 Subtarget.hasAVX512()) {
42229 unsigned NumConcats = 8 / VT.getVectorNumElements();
42230 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
42231 Ops[0] = StoredVal;
42232 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
42233 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
42234 St->getPointerInfo(), St->getAlignment(),
42235 St->getMemOperand()->getFlags());
42236 }
42237
42238 // Turn vXi1 stores of constants into a scalar store.
42239 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
42240 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
42241 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
42242 // If it's a v64i1 store without 64-bit support, we need two stores.
42243 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
42244 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
42245 StoredVal->ops().slice(0, 32));
42246 Lo = combinevXi1ConstantToInteger(Lo, DAG);
42247 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
42248 StoredVal->ops().slice(32, 32));
42249 Hi = combinevXi1ConstantToInteger(Hi, DAG);
42250
42251 SDValue Ptr0 = St->getBasePtr();
42252 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
42253
42254 SDValue Ch0 =
42255 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
42256 Alignment, St->getMemOperand()->getFlags());
42257 SDValue Ch1 =
42258 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
42259 St->getPointerInfo().getWithOffset(4),
42260 MinAlign(Alignment, 4U),
42261 St->getMemOperand()->getFlags());
42262 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
42263 }
42264
42265 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
42266 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
42267 St->getPointerInfo(), St->getAlignment(),
42268 St->getMemOperand()->getFlags());
42269 }
42270
42271 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
42272 // Sandy Bridge, perform two 16-byte stores.
42273 bool Fast;
42274 if (VT.is256BitVector() && StVT == VT &&
42275 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
42276 *St->getMemOperand(), &Fast) &&
42277 !Fast) {
42278 unsigned NumElems = VT.getVectorNumElements();
42279 if (NumElems < 2)
42280 return SDValue();
42281
42282 return splitVectorStore(St, DAG);
42283 }
42284
42285 // Split under-aligned vector non-temporal stores.
42286 if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) {
42287 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
42288 // vectors or the legalizer can scalarize it to use MOVNTI.
42289 if (VT.is256BitVector() || VT.is512BitVector()) {
42290 unsigned NumElems = VT.getVectorNumElements();
42291 if (NumElems < 2)
42292 return SDValue();
42293 return splitVectorStore(St, DAG);
42294 }
42295
42296 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
42297 // to use MOVNTI.
42298 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
42299 MVT NTVT = Subtarget.hasSSE4A()
42300 ? MVT::v2f64
42301 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
42302 return scalarizeVectorStore(St, NTVT, DAG);
42303 }
42304 }
42305
42306 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
42307 // supported but AVX512F is, by extending to v16i32 and truncating.
42308 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
42309 St->getValue().getOpcode() == ISD::TRUNCATE &&
42310 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
42311 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
42312 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
42313 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
42314 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
42315 MVT::v16i8, St->getMemOperand());
42316 }
42317
42318 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
42319 if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
42320 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
42321 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
42322 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
42323 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
42324 return EmitTruncSStore(IsSigned, St->getChain(),
42325 dl, StoredVal.getOperand(0), St->getBasePtr(),
42326 VT, St->getMemOperand(), DAG);
42327 }
42328
42329 // Optimize trunc store (of multiple scalars) to shuffle and store.
42330 // First, pack all of the elements in one place. Next, store to memory
42331 // in fewer chunks.
42332 if (St->isTruncatingStore() && VT.isVector()) {
42333 // Check if we can detect an AVG pattern from the truncation. If yes,
42334 // replace the trunc store by a normal store with the result of X86ISD::AVG
42335 // instruction.
42336 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
42337 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
42338 Subtarget, dl))
42339 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
42340 St->getPointerInfo(), St->getAlignment(),
42341 St->getMemOperand()->getFlags());
42342
42343 if (TLI.isTruncStoreLegal(VT, StVT)) {
42344 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
42345 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
42346 dl, Val, St->getBasePtr(),
42347 St->getMemoryVT(), St->getMemOperand(), DAG);
42348 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
42349 DAG, dl))
42350 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
42351 dl, Val, St->getBasePtr(),
42352 St->getMemoryVT(), St->getMemOperand(), DAG);
42353 }
42354
42355 return SDValue();
42356 }
42357
42358 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
42359 // the FP state in cases where an emms may be missing.
42360 // A preferable solution to the general problem is to figure out the right
42361 // places to insert EMMS. This qualifies as a quick hack.
42362
42363 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
42364 if (VT.getSizeInBits() != 64)
42365 return SDValue();
42366
42367 const Function &F = DAG.getMachineFunction().getFunction();
42368 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
42369 bool F64IsLegal =
42370 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
42371 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
42372 isa<LoadSDNode>(St->getValue()) &&
42373 cast<LoadSDNode>(St->getValue())->isSimple() &&
42374 St->getChain().hasOneUse() && St->isSimple()) {
42375 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
42376
42377 if (!ISD::isNormalLoad(Ld))
42378 return SDValue();
42379
42380 // Avoid the transformation if there are multiple uses of the loaded value.
42381 if (!Ld->hasNUsesOfValue(1, 0))
42382 return SDValue();
42383
42384 SDLoc LdDL(Ld);
42385 SDLoc StDL(N);
42386 // Lower to a single movq load/store pair.
42387 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
42388 Ld->getBasePtr(), Ld->getMemOperand());
42389
42390 // Make sure new load is placed in same chain order.
42391 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
42392 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
42393 St->getMemOperand());
42394 }
42395
42396 // This is similar to the above case, but here we handle a scalar 64-bit
42397 // integer store that is extracted from a vector on a 32-bit target.
42398 // If we have SSE2, then we can treat it like a floating-point double
42399 // to get past legalization. The execution dependencies fixup pass will
42400 // choose the optimal machine instruction for the store if this really is
42401 // an integer or v2f32 rather than an f64.
42402 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
42403 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
42404 SDValue OldExtract = St->getOperand(1);
42405 SDValue ExtOp0 = OldExtract.getOperand(0);
42406 unsigned VecSize = ExtOp0.getValueSizeInBits();
42407 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
42408 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
42409 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
42410 BitCast, OldExtract.getOperand(1));
42411 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
42412 St->getPointerInfo(), St->getAlignment(),
42413 St->getMemOperand()->getFlags());
42414 }
42415
42416 return SDValue();
42417}
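
A minimal standalone sketch (plain C++, not LLVM code; the variables are illustrative) of why the i64 load/store pair above can be routed through an f64/SSE register on 32-bit targets: an 8-byte copy through a double-typed temporary preserves the exact bit pattern, so one movq-style load/store replaces two 32-bit GPR transfers.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint64_t Src = 0x0123456789abcdefULL, Dst = 0;
  double Tmp; // stands in for the 64-bit XMM register used by movq/movsd
  std::memcpy(&Tmp, &Src, sizeof(Tmp)); // the "f64 load" half of the pair
  std::memcpy(&Dst, &Tmp, sizeof(Dst)); // the "f64 store" half of the pair
  assert(Dst == Src);                   // bit pattern is preserved exactly
  return 0;
}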
42418
42419static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
42420 TargetLowering::DAGCombinerInfo &DCI,
42421 const X86Subtarget &Subtarget) {
42422 auto *St = cast<MemIntrinsicSDNode>(N);
42423
42424 SDValue StoredVal = N->getOperand(1);
42425 MVT VT = StoredVal.getSimpleValueType();
42426 EVT MemVT = St->getMemoryVT();
42427
42428 // Figure out which elements we demand.
42429 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
42430 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
42431
42432 APInt KnownUndef, KnownZero;
42433 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42434 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
42435 KnownZero, DCI)) {
42436 DCI.AddToWorklist(N);
42437 return SDValue(N, 0);
42438 }
42439
42440 return SDValue();
42441}
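
A small standalone sketch (plain C++, not the LLVM APInt API; the numbers are hypothetical) of the demanded-element arithmetic above: a v4i32 stored value whose memory type covers only 64 bits demands just its low two elements.

#include <cassert>
#include <cstdint>

int main() {
  unsigned MemBits = 64;    // size of the memory type actually written
  unsigned ScalarBits = 32; // element width of the stored vector value
  unsigned NumElts = 4;     // v4i32
  unsigned StElts = MemBits / ScalarBits;       // 2 elements reach memory
  uint64_t DemandedElts = (1ull << StElts) - 1; // low-bits mask: 0b0011
  assert(StElts == 2 && DemandedElts == 0x3 && NumElts == 4);
  return 0;
}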
42442
42443/// Return 'true' if this vector operation is "horizontal"
42444/// and return the operands for the horizontal operation in LHS and RHS. A
42445/// horizontal operation performs the binary operation on successive elements
42446/// of its first operand, then on successive elements of its second operand,
42447/// returning the resulting values in a vector. For example, if
42448/// A = < float a0, float a1, float a2, float a3 >
42449/// and
42450/// B = < float b0, float b1, float b2, float b3 >
42451/// then the result of doing a horizontal operation on A and B is
42452/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
42453/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
42454/// A horizontal-op B, for some already available A and B, and if so then LHS is
42455/// set to A, RHS to B, and the routine returns 'true'.
42456static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
42457 const X86Subtarget &Subtarget,
42458 bool IsCommutative) {
42459 // If either operand is undef, bail out. The binop should be simplified.
42460 if (LHS.isUndef() || RHS.isUndef())
42461 return false;
42462
42463 // Look for the following pattern:
42464 // A = < float a0, float a1, float a2, float a3 >
42465 // B = < float b0, float b1, float b2, float b3 >
42466 // and
42467 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
42468 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
42469 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
42470 // which is A horizontal-op B.
42471
42472 MVT VT = LHS.getSimpleValueType();
42473 assert((VT.is128BitVector() || VT.is256BitVector()) &&
42474 "Unsupported vector type for horizontal add/sub");
42475 unsigned NumElts = VT.getVectorNumElements();
42476
42477 // TODO - can we make a general helper method that does all of this for us?
42478 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
42479 SmallVectorImpl<int> &ShuffleMask) {
42480 if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
42481 if (!Op.getOperand(0).isUndef())
42482 N0 = Op.getOperand(0);
42483 if (!Op.getOperand(1).isUndef())
42484 N1 = Op.getOperand(1);
42485 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
42486 ShuffleMask.append(Mask.begin(), Mask.end());
42487 return;
42488 }
42489 bool UseSubVector = false;
42490 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42491 Op.getOperand(0).getValueType().is256BitVector() &&
42492 llvm::isNullConstant(Op.getOperand(1))) {
42493 Op = Op.getOperand(0);
42494 UseSubVector = true;
42495 }
42496 bool IsUnary;
42497 SmallVector<SDValue, 2> SrcOps;
42498 SmallVector<int, 16> SrcShuffleMask;
42499 SDValue BC = peekThroughBitcasts(Op);
42500 if (isTargetShuffle(BC.getOpcode()) &&
42501 getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
42502 SrcOps, SrcShuffleMask, IsUnary)) {
42503 if (!UseSubVector && SrcShuffleMask.size() == NumElts &&
42504 SrcOps.size() <= 2) {
42505 N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
42506 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
42507 ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
42508 }
42509 if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) &&
42510 SrcOps.size() == 1) {
42511 N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op));
42512 N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op));
42513 ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts);
42514 ShuffleMask.append(Mask.begin(), Mask.end());
42515 }
42516 }
42517 };
42518
42519 // View LHS in the form
42520 // LHS = VECTOR_SHUFFLE A, B, LMask
42521 // If LHS is not a shuffle, then pretend it is the identity shuffle:
42522 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
42523 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
42524 SDValue A, B;
42525 SmallVector<int, 16> LMask;
42526 GetShuffle(LHS, A, B, LMask);
42527
42528 // Likewise, view RHS in the form
42529 // RHS = VECTOR_SHUFFLE C, D, RMask
42530 SDValue C, D;
42531 SmallVector<int, 16> RMask;
42532 GetShuffle(RHS, C, D, RMask);
42533
42534 // At least one of the operands should be a vector shuffle.
42535 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
42536 if (NumShuffles == 0)
42537 return false;
42538
42539 if (LMask.empty()) {
42540 A = LHS;
42541 for (unsigned i = 0; i != NumElts; ++i)
42542 LMask.push_back(i);
42543 }
42544
42545 if (RMask.empty()) {
42546 C = RHS;
42547 for (unsigned i = 0; i != NumElts; ++i)
42548 RMask.push_back(i);
42549 }
42550
42551 // If A and B occur in reverse order in RHS, then canonicalize by commuting
42552 // RHS operands and shuffle mask.
42553 if (A != C) {
42554 std::swap(C, D);
42555 ShuffleVectorSDNode::commuteMask(RMask);
42556 }
42557 // Check that the shuffles are both shuffling the same vectors.
42558 if (!(A == C && B == D))
42559 return false;
42560
42561 // LHS and RHS are now:
42562 // LHS = shuffle A, B, LMask
42563 // RHS = shuffle A, B, RMask
42564 // Check that the masks correspond to performing a horizontal operation.
42565 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
42566 // so we just repeat the inner loop if this is a 256-bit op.
42567 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
42568 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
42569 assert((NumEltsPer128BitChunk % 2 == 0) &&
42570 "Vector type should have an even number of elements in each lane");
42571 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
42572 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
42573 // Ignore undefined components.
42574 int LIdx = LMask[i + j], RIdx = RMask[i + j];
42575 if (LIdx < 0 || RIdx < 0 ||
42576 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
42577 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
42578 continue;
42579
42580 // The low half of the 128-bit result must choose from A.
42581 // The high half of the 128-bit result must choose from B,
42582 // unless B is undef. In that case, we are always choosing from A.
42583 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
42584 unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
42585
42586 // Check that successive elements are being operated on. If not, this is
42587 // not a horizontal operation.
42588 int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
42589 if (!(LIdx == Index && RIdx == Index + 1) &&
42590 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
42591 return false;
42592 }
42593 }
42594
42595 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
42596 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
42597
42598 if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))
42599 return false;
42600
42601 LHS = DAG.getBitcast(VT, LHS);
42602 RHS = DAG.getBitcast(VT, RHS);
42603 return true;
42604}
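
An illustrative scalar model (plain C++, not part of the lowering) of the element pairing that isHorizontalBinOp matches: for 4-element inputs A and B, a horizontal add produces <a0+a1, a2+a3, b0+b1, b2+b3>.

#include <array>
#include <cstdio>

static std::array<float, 4> hadd(const std::array<float, 4> &A,
                                 const std::array<float, 4> &B) {
  // Successive elements of A, then of B, are combined pairwise.
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

int main() {
  std::array<float, 4> A{1, 2, 3, 4}, B{10, 20, 30, 40};
  std::array<float, 4> R = hadd(A, B);
  std::printf("%g %g %g %g\n", R[0], R[1], R[2], R[3]); // 3 7 30 70
  return 0;
}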
42605
42606/// Do target-specific dag combines on floating-point adds/subs.
42607static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
42608 const X86Subtarget &Subtarget) {
42609 EVT VT = N->getValueType(0);
42610 SDValue LHS = N->getOperand(0);
42611 SDValue RHS = N->getOperand(1);
42612 bool IsFadd = N->getOpcode() == ISD::FADD;
42613 auto HorizOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
42614 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
42615
42616 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
42617 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
42618 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
42619 isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd))
42620 return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
42621
42622 return SDValue();
42623}
42624
42625/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
42626/// the codegen.
42627/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
42628/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
42629/// anything that is guaranteed to be transformed by DAGCombiner.
42630static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
42631 const X86Subtarget &Subtarget,
42632 const SDLoc &DL) {
42633 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
42634 SDValue Src = N->getOperand(0);
42635 unsigned SrcOpcode = Src.getOpcode();
42636 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42637
42638 EVT VT = N->getValueType(0);
42639 EVT SrcVT = Src.getValueType();
42640
42641 auto IsFreeTruncation = [VT](SDValue Op) {
42642 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
42643
42644 // See if this has been extended from a smaller/equal size to
42645 // the truncation size, allowing a truncation to combine with the extend.
42646 unsigned Opcode = Op.getOpcode();
42647 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
42648 Opcode == ISD::ZERO_EXTEND) &&
42649 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
42650 return true;
42651
42652 // See if this is a single use constant which can be constant folded.
42653 // NOTE: We don't peek through bitcasts here because there is currently
42654 // no support for constant folding truncate+bitcast+vector_of_constants. So
42655 // we'll just end up with a truncate on both operands which will
42656 // get turned back into (truncate (binop)) causing an infinite loop.
42657 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
42658 };
42659
42660 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
42661 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
42662 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
42663 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
42664 };
42665
42666 // Don't combine if the operation has other uses.
42667 if (!Src.hasOneUse())
42668 return SDValue();
42669
42670 // Only support vector truncation for now.
42671 // TODO: i64 scalar math would benefit as well.
42672 if (!VT.isVector())
42673 return SDValue();
42674
42675 // In most cases it's only worth pre-truncating if we're only facing the cost
42676 // of one truncation.
42677 // i.e. if one of the inputs will constant fold or the input is repeated.
42678 switch (SrcOpcode) {
42679 case ISD::AND:
42680 case ISD::XOR:
42681 case ISD::OR: {
42682 SDValue Op0 = Src.getOperand(0);
42683 SDValue Op1 = Src.getOperand(1);
42684 if (TLI.isOperationLegalOrPromote(SrcOpcode, VT) &&
42685 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
42686 return TruncateArithmetic(Op0, Op1);
42687 break;
42688 }
42689
42690 case ISD::MUL:
42691 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
42692 // better to truncate if we have the chance.
42693 if (SrcVT.getScalarType() == MVT::i64 &&
42694 TLI.isOperationLegal(SrcOpcode, VT) &&
42695 !TLI.isOperationLegal(SrcOpcode, SrcVT))
42696 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
42697 LLVM_FALLTHROUGH;
42698 case ISD::ADD: {
42699 SDValue Op0 = Src.getOperand(0);
42700 SDValue Op1 = Src.getOperand(1);
42701 if (TLI.isOperationLegal(SrcOpcode, VT) &&
42702 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
42703 return TruncateArithmetic(Op0, Op1);
42704 break;
42705 }
42706 case ISD::SUB: {
42707 // TODO: ISD::SUB We are conservative and require both sides to be freely
42708 // truncatable to avoid interfering with combineSubToSubus.
42709 SDValue Op0 = Src.getOperand(0);
42710 SDValue Op1 = Src.getOperand(1);
42711 if (TLI.isOperationLegal(SrcOpcode, VT) &&
42712 (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
42713 return TruncateArithmetic(Op0, Op1);
42714 break;
42715 }
42716 }
42717
42718 return SDValue();
42719}
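
A quick standalone check (plain C++) of the identity the pre-truncation above relies on: for wrap-around integer arithmetic, trunc(binop(x, y)) equals binop(trunc(x), trunc(y)) because truncation only discards high bits.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x1234567890abcdefULL, Y = 0x0fedcba098765432ULL;
  uint32_t TruncOfAdd = static_cast<uint32_t>(X + Y);      // trunc(binop)
  uint32_t AddOfTrunc =
      static_cast<uint32_t>(X) + static_cast<uint32_t>(Y); // binop(trunc)
  assert(TruncOfAdd == AddOfTrunc);
  return 0;
}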
42720
42721/// Truncate using ISD::AND mask and X86ISD::PACKUS.
42722/// e.g. trunc <8 x i32> X to <8 x i16> -->
42723/// MaskX = X & 0xffff (clear high bits to prevent saturation)
42724/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
42725static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
42726 const X86Subtarget &Subtarget,
42727 SelectionDAG &DAG) {
42728 SDValue In = N->getOperand(0);
42729 EVT InVT = In.getValueType();
42730 EVT OutVT = N->getValueType(0);
42731
42732 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
42733 OutVT.getScalarSizeInBits());
42734 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
42735 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
42736}
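
A scalar model (plain C++, not LLVM code) of the AND+PACKUS idea above: clearing the high bits first means the unsigned-saturating pack can never saturate, so per lane it behaves as a plain truncation.

#include <cassert>
#include <cstdint>

static uint16_t packus_lane(uint32_t V) {
  // Unsigned saturation of one 32-bit lane down to 16 bits.
  return V > 0xffff ? 0xffff : static_cast<uint16_t>(V);
}

int main() {
  uint32_t X = 0xdeadbeef;
  uint32_t Masked = X & 0xffff; // the "clear high bits" step
  assert(packus_lane(Masked) == static_cast<uint16_t>(X)); // == plain trunc
  return 0;
}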
42737
42738/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
42739static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
42740 const X86Subtarget &Subtarget,
42741 SelectionDAG &DAG) {
42742 SDValue In = N->getOperand(0);
42743 EVT InVT = In.getValueType();
42744 EVT OutVT = N->getValueType(0);
42745 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
42746 DAG.getValueType(OutVT));
42747 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
42748}
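
A companion sketch (plain C++) for the PACKSS path: sign-extending the low 16 bits in place guarantees the value fits the signed 16-bit range, so the signed-saturating pack likewise reduces to a truncation.

#include <cassert>
#include <cstdint>

static int16_t packss_lane(int32_t V) {
  // Signed saturation of one 32-bit lane down to 16 bits.
  if (V > INT16_MAX) return INT16_MAX;
  if (V < INT16_MIN) return INT16_MIN;
  return static_cast<int16_t>(V);
}

int main() {
  int32_t X = 0x12348765;
  int32_t InReg = static_cast<int16_t>(X); // sign_extend_inreg from i16
  assert(packss_lane(InReg) == static_cast<int16_t>(X)); // two's complement
  return 0;
}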
42749
42750/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
42751/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
42752/// legalization the truncation will be translated into a BUILD_VECTOR with each
42753/// element that is extracted from a vector and then truncated, and it is
42754/// difficult to do this optimization based on them.
42755static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
42756 const X86Subtarget &Subtarget) {
42757 EVT OutVT = N->getValueType(0);
42758 if (!OutVT.isVector())
42759 return SDValue();
42760
42761 SDValue In = N->getOperand(0);
42762 if (!In.getValueType().isSimple())
42763 return SDValue();
42764
42765 EVT InVT = In.getValueType();
42766 unsigned NumElems = OutVT.getVectorNumElements();
42767
42768 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
42769 // SSE2, and we need to take care of it specially.
42770 // AVX512 provides vpmovdb.
42771 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
42772 return SDValue();
42773
42774 EVT OutSVT = OutVT.getVectorElementType();
42775 EVT InSVT = InVT.getVectorElementType();
42776 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
42777 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
42778 NumElems >= 8))
42779 return SDValue();
42780
42781 // SSSE3's pshufb results in fewer instructions in the cases below.
42782 if (Subtarget.hasSSSE3() && NumElems == 8 &&
42783 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
42784 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
42785 return SDValue();
42786
42787 SDLoc DL(N);
42788 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
42789 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
42790 // truncate 2 x v4i32 to v8i16.
42791 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
42792 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
42793 if (InSVT == MVT::i32)
42794 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
42795
42796 return SDValue();
42797}
42798
42799/// This function transforms vector truncation of 'extended sign-bits' or
42800/// 'extended zero-bits' values.
42801/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
42802static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
42803 SelectionDAG &DAG,
42804 const X86Subtarget &Subtarget) {
42805 // Requires SSE2.
42806 if (!Subtarget.hasSSE2())
42807 return SDValue();
42808
42809 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
42810 return SDValue();
42811
42812 SDValue In = N->getOperand(0);
42813 if (!In.getValueType().isSimple())
42814 return SDValue();
42815
42816 MVT VT = N->getValueType(0).getSimpleVT();
42817 MVT SVT = VT.getScalarType();
42818
42819 MVT InVT = In.getValueType().getSimpleVT();
42820 MVT InSVT = InVT.getScalarType();
42821
42822 // Check we have a truncation suited for PACKSS/PACKUS.
42823 if (!VT.is128BitVector() && !VT.is256BitVector())
42824 return SDValue();
42825 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
42826 return SDValue();
42827 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
42828 return SDValue();
42829
42830 // AVX512 has fast truncate, but if the input is already going to be split,
42831 // there's no harm in trying pack.
42832 if (Subtarget.hasAVX512() &&
42833 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
42834 InVT.is512BitVector()))
42835 return SDValue();
42836
42837 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
42838 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
42839
42840 // Use PACKUS if the input has zero-bits that extend all the way to the
42841 // packed/truncated value. e.g. masks, zext_in_reg, etc.
42842 KnownBits Known = DAG.computeKnownBits(In);
42843 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
42844 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
42845 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
42846
42847 // Use PACKSS if the input has sign-bits that extend all the way to the
42848 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
42849 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
42850 if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
42851 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
42852
42853 return SDValue();
42854}
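
A rough standalone model (plain C++) of the two tests above for an i32 -> i16 pack, assuming SSE4.1 so the zero-bit threshold matches the sign-bit one: at least 16 known leading zero bits makes the value PACKUS-safe, and more than 16 sign bits makes it PACKSS-safe.

#include <cassert>
#include <cstdint>

static unsigned countLeadingZeros32(uint32_t V) {
  unsigned N = 0;
  for (uint32_t Bit = 0x80000000u; Bit && !(V & Bit); Bit >>= 1)
    ++N;
  return N;
}

static unsigned numSignBits32(int32_t V) {
  // Number of leading bits equal to the sign bit, including the sign bit.
  uint32_t U = static_cast<uint32_t>(V < 0 ? ~V : V);
  return countLeadingZeros32(U);
}

int main() {
  uint32_t ZextLike = 0x0000beef; // e.g. a zero-extended / masked value
  int32_t SextLike = -1;          // e.g. an all-ones comparison result
  assert(countLeadingZeros32(ZextLike) >= 32u - 16u); // PACKUS-safe
  assert(numSignBits32(SextLike) > 32u - 16u);        // PACKSS-safe
  return 0;
}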
42855
42856// Try to form a MULHU or MULHS node by looking for
42857// (trunc (srl (mul ext, ext), 16))
42858// TODO: This is X86 specific because we want to be able to handle wide types
42859// before type legalization. But we can only do it if the vector will be
42860// legalized via widening/splitting. Type legalization can't handle promotion
42861// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
42862// combiner.
42863static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
42864 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
42865 // First instruction should be a right shift of a multiply.
42866 if (Src.getOpcode() != ISD::SRL ||
42867 Src.getOperand(0).getOpcode() != ISD::MUL)
42868 return SDValue();
42869
42870 if (!Subtarget.hasSSE2())
42871 return SDValue();
42872
42873 // Only handle vXi16 types that are at least 128-bits unless they will be
42874 // widened.
42875 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
42876 return SDValue();
42877
42878 // Input type should be vXi32.
42879 EVT InVT = Src.getValueType();
42880 if (InVT.getVectorElementType() != MVT::i32)
42881 return SDValue();
42882
42883 // Need a shift by 16.
42884 APInt ShiftAmt;
42885 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
42886 ShiftAmt != 16)
42887 return SDValue();
42888
42889 SDValue LHS = Src.getOperand(0).getOperand(0);
42890 SDValue RHS = Src.getOperand(0).getOperand(1);
42891
42892 unsigned ExtOpc = LHS.getOpcode();
42893 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
42894 RHS.getOpcode() != ExtOpc)
42895 return SDValue();
42896
42897 // Peek through the extends.
42898 LHS = LHS.getOperand(0);
42899 RHS = RHS.getOperand(0);
42900
42901 // Ensure the input types match.
42902 if (LHS.getValueType() != VT || RHS.getValueType() != VT)
42903 return SDValue();
42904
42905 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
42906 return DAG.getNode(Opc, DL, VT, LHS, RHS);
42907}
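
A standalone sanity check (plain C++, not LLVM) of the identity behind the match above: for 16-bit inputs, (zext(a) * zext(b)) >> 16 is exactly the per-lane unsigned high-half multiply that PMULHUW computes, and the signed variant corresponds to PMULHW.

#include <cassert>
#include <cstdint>

static uint16_t mulhu16(uint16_t A, uint16_t B) {
  // Widen, multiply, shift right by 16, truncate: the matched DAG pattern.
  return static_cast<uint16_t>((static_cast<uint32_t>(A) * B) >> 16);
}

int main() {
  assert(mulhu16(0xffff, 0xffff) == 0xfffe); // high half of 0xfffe0001
  assert(mulhu16(0x8000, 2) == 1);           // high half of 0x00010000
  return 0;
}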
42908
42909// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
42910// from one vector with signed bytes from another vector, adds together
42911// adjacent pairs of 16-bit products, and saturates the result before
42912// truncating to 16-bits.
42913//
42914// Which looks something like this:
42915// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
42916// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
42917static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
42918 const X86Subtarget &Subtarget,
42919 const SDLoc &DL) {
42920 if (!VT.isVector() || !Subtarget.hasSSSE3())
42921 return SDValue();
42922
42923 unsigned NumElems = VT.getVectorNumElements();
42924 EVT ScalarVT = VT.getVectorElementType();
42925 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
42926 return SDValue();
42927
42928 SDValue SSatVal = detectSSatPattern(In, VT);
42929 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
42930 return SDValue();
42931
42932 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
42933 // of multiplies from even/odd elements.
42934 SDValue N0 = SSatVal.getOperand(0);
42935 SDValue N1 = SSatVal.getOperand(1);
42936
42937 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
42938 return SDValue();
42939
42940 SDValue N00 = N0.getOperand(0);
42941 SDValue N01 = N0.getOperand(1);
42942 SDValue N10 = N1.getOperand(0);
42943 SDValue N11 = N1.getOperand(1);
42944
42945 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
42946 // Canonicalize zero_extend to LHS.
42947 if (N01.getOpcode() == ISD::ZERO_EXTEND)
42948 std::swap(N00, N01);
42949 if (N11.getOpcode() == ISD::ZERO_EXTEND)
42950 std::swap(N10, N11);
42951
42952 // Ensure we have a zero_extend and a sign_extend.
42953 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
42954 N01.getOpcode() != ISD::SIGN_EXTEND ||
42955 N10.getOpcode() != ISD::ZERO_EXTEND ||
42956 N11.getOpcode() != ISD::SIGN_EXTEND)
42957 return SDValue();
42958
42959 // Peek through the extends.
42960 N00 = N00.getOperand(0);
42961 N01 = N01.getOperand(0);
42962 N10 = N10.getOperand(0);
42963 N11 = N11.getOperand(0);
42964
42965 // Ensure the extend is from vXi8.
42966 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
42967 N01.getValueType().getVectorElementType() != MVT::i8 ||
42968 N10.getValueType().getVectorElementType() != MVT::i8 ||
42969 N11.getValueType().getVectorElementType() != MVT::i8)
42970 return SDValue();
42971
42972 // All inputs should be build_vectors.
42973 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
42974 N01.getOpcode() != ISD::BUILD_VECTOR ||
42975 N10.getOpcode() != ISD::BUILD_VECTOR ||
42976 N11.getOpcode() != ISD::BUILD_VECTOR)
42977 return SDValue();
42978
42979 // N00/N10 are zero extended. N01/N11 are sign extended.
42980
42981 // For each element, we need to ensure we have an odd element from one vector
42982 // multiplied by the odd element of another vector and the even element from
42983 // one of the same vectors being multiplied by the even element from the
42984 // other vector. So we need to make sure for each element i, this operation
42985 // is being performed:
42986 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
42987 SDValue ZExtIn, SExtIn;
42988 for (unsigned i = 0; i != NumElems; ++i) {
42989 SDValue N00Elt = N00.getOperand(i);
42990 SDValue N01Elt = N01.getOperand(i);
42991 SDValue N10Elt = N10.getOperand(i);
42992 SDValue N11Elt = N11.getOperand(i);
42993 // TODO: Be more tolerant to undefs.
42994 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
42995 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
42996 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
42997 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
42998 return SDValue();
42999 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
43000 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
43001 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
43002 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
43003 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
43004 return SDValue();
43005 unsigned IdxN00 = ConstN00Elt->getZExtValue();
43006 unsigned IdxN01 = ConstN01Elt->getZExtValue();
43007 unsigned IdxN10 = ConstN10Elt->getZExtValue();
43008 unsigned IdxN11 = ConstN11Elt->getZExtValue();
43009 // Add is commutative so indices can be reordered.
43010 if (IdxN00 > IdxN10) {
43011 std::swap(IdxN00, IdxN10);
43012 std::swap(IdxN01, IdxN11);
43013 }
43014 // N0 indices must be the even element. N1 indices must be the next odd element.
43015 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
43016 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
43017 return SDValue();
43018 SDValue N00In = N00Elt.getOperand(0);
43019 SDValue N01In = N01Elt.getOperand(0);
43020 SDValue N10In = N10Elt.getOperand(0);
43021 SDValue N11In = N11Elt.getOperand(0);
43022 // First time we find an input capture it.
43023 if (!ZExtIn) {
43024 ZExtIn = N00In;
43025 SExtIn = N01In;
43026 }
43027 if (ZExtIn != N00In || SExtIn != N01In ||
43028 ZExtIn != N10In || SExtIn != N11In)
43029 return SDValue();
43030 }
43031
43032 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43033 ArrayRef<SDValue> Ops) {
43034 // Shrink by adding truncate nodes and let DAGCombine fold with the
43035 // sources.
43036 EVT InVT = Ops[0].getValueType();
43037 assert(InVT.getScalarType() == MVT::i8 &&
43038 "Unexpected scalar element type");
43039 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
43040 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
43041 InVT.getVectorNumElements() / 2);
43042 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
43043 };
43044 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
43045 PMADDBuilder);
43046}
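
A per-lane reference model (plain C++, not the DAG code) of the computation detectPMADDUBSW matches: unsigned bytes from one input times signed bytes from the other, adjacent products summed and saturated to a signed 16-bit result.

#include <cassert>
#include <cstdint>

static int16_t pmaddubsw_lane(uint8_t A0, uint8_t A1, int8_t B0, int8_t B1) {
  // zext(A) * sext(B) for an even/odd pair, then signed 16-bit saturation.
  int32_t Sum = static_cast<int32_t>(A0) * B0 + static_cast<int32_t>(A1) * B1;
  if (Sum > INT16_MAX) return INT16_MAX;
  if (Sum < INT16_MIN) return INT16_MIN;
  return static_cast<int16_t>(Sum);
}

int main() {
  assert(pmaddubsw_lane(255, 255, 127, 127) == INT16_MAX); // 64770 saturates
  assert(pmaddubsw_lane(10, 20, -3, 4) == 50);             // -30 + 80
  return 0;
}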
43047
43048static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
43049 const X86Subtarget &Subtarget) {
43050 EVT VT = N->getValueType(0);
43051 SDValue Src = N->getOperand(0);
43052 SDLoc DL(N);
43053
43054 // Attempt to pre-truncate inputs to arithmetic ops instead.
43055 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
43056 return V;
43057
43058 // Try to detect AVG pattern first.
43059 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
43060 return Avg;
43061
43062 // Try to detect PMADD
43063 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
43064 return PMAdd;
43065
43066 // Try to combine truncation with signed/unsigned saturation.
43067 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
43068 return Val;
43069
43070 // Try to combine PMULHUW/PMULHW for vXi16.
43071 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
43072 return V;
43073
43074 // The bitcast source is a direct mmx result.
43075 // Detect bitcasts between i32 to x86mmx
43076 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
43077 SDValue BCSrc = Src.getOperand(0);
43078 if (BCSrc.getValueType() == MVT::x86mmx)
43079 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
43080 }
43081
43082 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
43083 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
43084 return V;
43085
43086 return combineVectorTruncation(N, DAG, Subtarget);
43087}
43088
43089static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) {
43090 EVT VT = N->getValueType(0);
43091 SDValue In = N->getOperand(0);
43092 SDLoc DL(N);
43093
43094 if (auto SSatVal = detectSSatPattern(In, VT))
43095 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
43096 if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
43097 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
43098
43099 return SDValue();
43100}
43101
43102/// Returns the negated value if the node \p N flips sign of FP value.
43103///
43104/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
43105/// or FSUB(0, x)
43106/// AVX512F does not have FXOR, so FNEG is lowered as
43107/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
43108 /// In this case we go through all bitcasts.
43109/// This also recognizes splat of a negated value and returns the splat of that
43110/// value.
43111static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
43112 if (N->getOpcode() == ISD::FNEG)
43113 return N->getOperand(0);
43114
43115 // Don't recurse exponentially.
43116 if (Depth > SelectionDAG::MaxRecursionDepth)
43117 return SDValue();
43118
43119 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
43120
43121 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
43122 EVT VT = Op->getValueType(0);
43123
43124 // Make sure the element size doesn't change.
43125 if (VT.getScalarSizeInBits() != ScalarSize)
43126 return SDValue();
43127
43128 unsigned Opc = Op.getOpcode();
43129 switch (Opc) {
43130 case ISD::VECTOR_SHUFFLE: {
43131 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
43132 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
43133 if (!Op.getOperand(1).isUndef())
43134 return SDValue();
43135 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
43136 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
43137 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
43138 cast<ShuffleVectorSDNode>(Op)->getMask());
43139 break;
43140 }
43141 case ISD::INSERT_VECTOR_ELT: {
43142 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
43143 // -V, INDEX).
43144 SDValue InsVector = Op.getOperand(0);
43145 SDValue InsVal = Op.getOperand(1);
43146 if (!InsVector.isUndef())
43147 return SDValue();
43148 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
43149 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
43150 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
43151 NegInsVal, Op.getOperand(2));
43152 break;
43153 }
43154 case ISD::FSUB:
43155 case ISD::XOR:
43156 case X86ISD::FXOR: {
43157 SDValue Op1 = Op.getOperand(1);
43158 SDValue Op0 = Op.getOperand(0);
43159
43160 // For XOR and FXOR, we want to check if constant
43161 // bits of Op1 are sign bit masks. For FSUB, we
43162 // have to check if constant bits of Op0 are sign
43163 // bit masks and hence we swap the operands.
43164 if (Opc == ISD::FSUB)
43165 std::swap(Op0, Op1);
43166
43167 APInt UndefElts;
43168 SmallVector<APInt, 16> EltBits;
43169 // Extract constant bits and see if they are all
43170 // sign bit masks. Ignore the undef elements.
43171 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
43172 /* AllowWholeUndefs */ true,
43173 /* AllowPartialUndefs */ false)) {
43174 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
43175 if (!UndefElts[I] && !EltBits[I].isSignMask())
43176 return SDValue();
43177
43178 return peekThroughBitcasts(Op0);
43179 }
43180 }
43181 }
43182
43183 return SDValue();
43184}
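
A minimal standalone demonstration (plain C++) of the sign-bit-mask case handled above: XORing an IEEE-754 value's bits with a mask that has only the sign bit set flips the sign, which is why (xor x, signmask) and its FXOR/FSUB forms are recognized as FNEG.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  float X = 1.5f;
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u; // flip only the sign bit (the "sign bit mask")
  float NegX;
  std::memcpy(&NegX, &Bits, sizeof(NegX));
  assert(NegX == -1.5f);
  return 0;
}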
43185
43186static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
43187 bool NegRes) {
43188 if (NegMul) {
43189 switch (Opcode) {
43190 default: llvm_unreachable("Unexpected opcode");
43191 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
43192 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
43193 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
43194 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
43195 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
43196 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
43197 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
43198 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
43199 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
43200 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
43201 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
43202 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
43203 }
43204 }
43205
43206 if (NegAcc) {
43207 switch (Opcode) {
43208 default: llvm_unreachable("Unexpected opcode");
43209 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
43210 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
43211 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
43212 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
43213 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
43214 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
43215 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
43216 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
43217 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
43218 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
43219 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
43220 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
43221 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
43222 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
43223 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
43224 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
43225 }
43226 }
43227
43228 if (NegRes) {
43229 switch (Opcode) {
43230 // For accuracy reasons, we never combine fneg and fma under strict FP.
43231 default: llvm_unreachable("Unexpected opcode");
43232 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
43233 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
43234 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
43235 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
43236 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
43237 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
43238 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
43239 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
43240 }
43241 }
43242
43243 return Opcode;
43244}
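
A numeric sanity check (plain C++, using std::fma) of the opcode algebra implemented above: negating the multiply turns FMADD into FNMADD, negating the accumulator turns FMADD into FMSUB, and negating the whole FMADD result yields FNMSUB.

#include <cassert>
#include <cmath>

int main() {
  double A = 2.0, B = 3.0, C = 5.0;
  double FMadd = std::fma(A, B, C);    //  a*b + c = 11
  double FNMadd = std::fma(-A, B, C);  // -a*b + c = -1   (NegMul)
  double FMsub = std::fma(A, B, -C);   //  a*b - c = 1    (NegAcc)
  double FNMsub = std::fma(-A, B, -C); // -a*b - c = -11  (NegMul + NegAcc)
  assert(FNMadd == -(A * B) + C);
  assert(FMsub == A * B - C);
  assert(FNMsub == -FMadd);            // NegRes: FMADD maps to FNMSUB
  return 0;
}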
43245
43246/// Do target-specific dag combines on floating point negations.
43247static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
43248 TargetLowering::DAGCombinerInfo &DCI,
43249 const X86Subtarget &Subtarget) {
43250 EVT OrigVT = N->getValueType(0);
43251 SDValue Arg = isFNEG(DAG, N);
43252 if (!Arg)
43253 return SDValue();
43254
43255 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43256 EVT VT = Arg.getValueType();
43257 EVT SVT = VT.getScalarType();
43258 SDLoc DL(N);
43259
43260 // Let legalize expand this if it isn't a legal type yet.
43261 if (!TLI.isTypeLegal(VT))
43262 return SDValue();
43263
43264 // If we're negating a FMUL node on a target with FMA, then we can avoid the
43265 // use of a constant by performing (-0 - A*B) instead.
43266 // FIXME: Check rounding control flags as well once it becomes available.
43267 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
43268 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
43269 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
43270 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
43271 Arg.getOperand(1), Zero);
43272 return DAG.getBitcast(OrigVT, NewNode);
43273 }
43274
43275 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
43276 bool LegalOperations = !DCI.isBeforeLegalizeOps();
43277 if (TLI.getNegatibleCost(Arg, DAG, LegalOperations, CodeSize) !=
43278 TargetLowering::NegatibleCost::Expensive)
43279 return DAG.getBitcast(
43280 OrigVT, TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize));
43281
43282 return SDValue();
43283}
43284
43285TargetLowering::NegatibleCost
43286X86TargetLowering::getNegatibleCost(SDValue Op, SelectionDAG &DAG,
43287 bool LegalOperations, bool ForCodeSize,
43288 unsigned Depth) const {
43289 // fneg patterns are removable even if they have multiple uses.
43290 if (isFNEG(DAG, Op.getNode(), Depth))
43291 return NegatibleCost::Cheaper;
43292
43293 // Don't recurse exponentially.
43294 if (Depth > SelectionDAG::MaxRecursionDepth)
43295 return NegatibleCost::Expensive;
43296
43297 EVT VT = Op.getValueType();
43298 EVT SVT = VT.getScalarType();
43299 switch (Op.getOpcode()) {
43300 case ISD::FMA:
43301 case X86ISD::FMSUB:
43302 case X86ISD::FNMADD:
43303 case X86ISD::FNMSUB:
43304 case X86ISD::FMADD_RND:
43305 case X86ISD::FMSUB_RND:
43306 case X86ISD::FNMADD_RND:
43307 case X86ISD::FNMSUB_RND: {
43308 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
43309 !(SVT == MVT::f32 || SVT == MVT::f64) ||
43310 !isOperationLegal(ISD::FMA, VT))
43311 break;
43312
43313 // This is always negatible for free but we might be able to remove some
43314 // extra operand negations as well.
43315 for (int i = 0; i != 3; ++i) {
43316 NegatibleCost V = getNegatibleCost(Op.getOperand(i), DAG, LegalOperations,
43317 ForCodeSize, Depth + 1);
43318 if (V == NegatibleCost::Cheaper)
43319 return V;
43320 }
43321 return NegatibleCost::Neutral;
43322 }
43323 case X86ISD::FRCP:
43324 return getNegatibleCost(Op.getOperand(0), DAG, LegalOperations, ForCodeSize,
43325 Depth + 1);
43326 }
43327
43328 return TargetLowering::getNegatibleCost(Op, DAG, LegalOperations, ForCodeSize,
43329 Depth);
43330}
43331
43332SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
43333 bool LegalOperations,
43334 bool ForCodeSize,
43335 unsigned Depth) const {
43336 // fneg patterns are removable even if they have multiple uses.
43337 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth))
43338 return DAG.getBitcast(Op.getValueType(), Arg);
43339
43340 EVT VT = Op.getValueType();
43341 EVT SVT = VT.getScalarType();
43342 unsigned Opc = Op.getOpcode();
43343 switch (Opc) {
43344 case ISD::FMA:
43345 case X86ISD::FMSUB:
43346 case X86ISD::FNMADD:
43347 case X86ISD::FNMSUB:
43348 case X86ISD::FMADD_RND:
43349 case X86ISD::FMSUB_RND:
43350 case X86ISD::FNMADD_RND:
43351 case X86ISD::FNMSUB_RND: {
43352 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
43353 !(SVT == MVT::f32 || SVT == MVT::f64) ||
43354 !isOperationLegal(ISD::FMA, VT))
43355 break;
43356
43357 // This is always negatible for free but we might be able to remove some
43358 // extra operand negations as well.
43359 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
43360 for (int i = 0; i != 3; ++i) {
43361 NegatibleCost V = getNegatibleCost(Op.getOperand(i), DAG, LegalOperations,
43362 ForCodeSize, Depth + 1);
43363 if (V == NegatibleCost::Cheaper)
43364 NewOps[i] = getNegatedExpression(Op.getOperand(i), DAG, LegalOperations,
43365 ForCodeSize, Depth + 1);
43366 }
43367
43368 bool NegA = !!NewOps[0];
43369 bool NegB = !!NewOps[1];
43370 bool NegC = !!NewOps[2];
43371 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
43372
43373 // Fill in the non-negated ops with the original values.
43374 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
43375 if (!NewOps[i])
43376 NewOps[i] = Op.getOperand(i);
43377 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
43378 }
43379 case X86ISD::FRCP:
43380 return DAG.getNode(Opc, SDLoc(Op), VT,
43381 getNegatedExpression(Op.getOperand(0), DAG,
43382 LegalOperations, ForCodeSize,
43383 Depth + 1));
43384 }
43385
43386 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
43387 ForCodeSize, Depth);
43388}
43389
43390static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
43391 const X86Subtarget &Subtarget) {
43392 MVT VT = N->getSimpleValueType(0);
43393 // If we have integer vector types available, use the integer opcodes.
43394 if (!VT.isVector() || !Subtarget.hasSSE2())
43395 return SDValue();
43396
43397 SDLoc dl(N);
43398
43399 unsigned IntBits = VT.getScalarSizeInBits();
43400 MVT IntSVT = MVT::getIntegerVT(IntBits);
43401 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
43402
43403 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
43404 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
43405 unsigned IntOpcode;
43406 switch (N->getOpcode()) {
43407 default: llvm_unreachable("Unexpected FP logic op");
43408 case X86ISD::FOR: IntOpcode = ISD::OR; break;
43409 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
43410 case X86ISD::FAND: IntOpcode = ISD::AND; break;
43411 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
43412 }
43413 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
43414 return DAG.getBitcast(VT, IntOp);
43415}
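
A standalone illustration (plain C++, not the DAG API) of the bitcast trick above: a bitwise operation performed on the integer representation of a float is exactly the corresponding FP-domain logic op; here an AND with ~signbit implements fabs.

#include <cassert>
#include <cstdint>
#include <cstring>

static float fp_and(float A, uint32_t Mask) {
  uint32_t Bits;
  std::memcpy(&Bits, &A, sizeof(Bits)); // "bitcast" float -> i32
  Bits &= Mask;                         // integer AND on the FP bit pattern
  float R;
  std::memcpy(&R, &Bits, sizeof(R));    // "bitcast" i32 -> float
  return R;
}

int main() {
  assert(fp_and(-2.25f, 0x7fffffffu) == 2.25f); // clearing the sign bit == fabs
  return 0;
}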
43416
43417
43418/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
43419static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
43420 if (N->getOpcode() != ISD::XOR)
43421 return SDValue();
43422
43423 SDValue LHS = N->getOperand(0);
43424 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
43425 return SDValue();
43426
43427 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
43428 X86::CondCode(LHS->getConstantOperandVal(0)));
43429 SDLoc DL(N);
43430 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
43431}
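
A small standalone model (plain C++) of the fold above: XORing a boolean compare result with 1 gives the same value as testing the inverted condition.

#include <cassert>

int main() {
  for (int A = -2; A <= 2; ++A)
    for (int B = -2; B <= 2; ++B) {
      int SetCC = (A < B);             // setcc cond
      assert((SetCC ^ 1) == (A >= B)); // xor(setcc cond, 1) == setcc !cond
    }
  return 0;
}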
43432
43433static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
43434 TargetLowering::DAGCombinerInfo &DCI,
43435 const X86Subtarget &Subtarget) {
43436 // If this is SSE1 only convert to FXOR to avoid scalarization.
43437 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
43438 N->getValueType(0) == MVT::v4i32) {
43439 return DAG.getBitcast(
43440 MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
43441 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
43442 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
43443 }
43444
43445 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
43446 return Cmp;
43447
43448 if (DCI.isBeforeLegalizeOps())
43449 return SDValue();
43450
43451 if (SDValue SetCC = foldXor1SetCC(N, DAG))
43452 return SetCC;
43453
43454 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
43455 return RV;
43456
43457 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
43458 return FPLogic;
43459
43460 return combineFneg(N, DAG, DCI, Subtarget);
43461}
43462
43463static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
43464 TargetLowering::DAGCombinerInfo &DCI,
43465 const X86Subtarget &Subtarget) {
43466 EVT VT = N->getValueType(0);
43467 unsigned NumBits = VT.getSizeInBits();
43468
43469 // TODO - Constant Folding.
43470
43471 // Simplify the inputs.
43472 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43473 APInt DemandedMask(APInt::getAllOnesValue(NumBits));
43474 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
43475 return SDValue(N, 0);
43476
43477 return SDValue();
43478}
43479
43480static bool isNullFPScalarOrVectorConst(SDValue V) {
43481 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
43482}
43483
43484/// If a value is a scalar FP zero or a vector FP zero (potentially including
43485/// undefined elements), return a zero constant that may be used to fold away
43486/// that value. In the case of a vector, the returned constant will not contain
43487/// undefined elements even if the input parameter does. This makes it suitable
43488/// to be used as a replacement operand with operations (eg, bitwise-and) where
43489/// an undef should not propagate.
43490static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
43491 const X86Subtarget &Subtarget) {
43492 if (!isNullFPScalarOrVectorConst(V))
43493 return SDValue();
43494
43495 if (V.getValueType().isVector())
43496 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
43497
43498 return V;
43499}
43500
43501static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
43502 const X86Subtarget &Subtarget) {
43503 SDValue N0 = N->getOperand(0);
43504 SDValue N1 = N->getOperand(1);
43505 EVT VT = N->getValueType(0);
43506 SDLoc DL(N);
43507
43508 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
43509 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
43510 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
43511 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
43512 return SDValue();
43513
43514 auto isAllOnesConstantFP = [](SDValue V) {
43515 if (V.getSimpleValueType().isVector())
43516 return ISD::isBuildVectorAllOnes(V.getNode());
43517 auto *C = dyn_cast<ConstantFPSDNode>(V);
43518 return C && C->getConstantFPValue()->isAllOnesValue();
43519 };
43520
43521 // fand (fxor X, -1), Y --> fandn X, Y
43522 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
43523 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
43524
43525 // fand X, (fxor Y, -1) --> fandn Y, X
43526 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
43527 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
43528
43529 return SDValue();
43530}
43531
43532/// Do target-specific dag combines on X86ISD::FAND nodes.
43533static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
43534 const X86Subtarget &Subtarget) {
43535 // FAND(0.0, x) -> 0.0
43536 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
43537 return V;
43538
43539 // FAND(x, 0.0) -> 0.0
43540 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
43541 return V;
43542
43543 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
43544 return V;
43545
43546 return lowerX86FPLogicOp(N, DAG, Subtarget);
43547}
43548
43549/// Do target-specific dag combines on X86ISD::FANDN nodes.
43550static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
43551 const X86Subtarget &Subtarget) {
43552 // FANDN(0.0, x) -> x
43553 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
43554 return N->getOperand(1);
43555
43556 // FANDN(x, 0.0) -> 0.0
43557 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
43558 return V;
43559
43560 return lowerX86FPLogicOp(N, DAG, Subtarget);
43561}
43562
43563/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
43564static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
43565 TargetLowering::DAGCombinerInfo &DCI,
43566 const X86Subtarget &Subtarget) {
43567 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
43568
43569 // F[X]OR(0.0, x) -> x
43570 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
43571 return N->getOperand(1);
43572
43573 // F[X]OR(x, 0.0) -> x
43574 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
43575 return N->getOperand(0);
43576
43577 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
43578 return NewVal;
43579
43580 return lowerX86FPLogicOp(N, DAG, Subtarget);
43581}
43582
43583/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
43584static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
43585 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
43586
43587 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
43588 if (!DAG.getTarget().Options.NoNaNsFPMath ||
43589 !DAG.getTarget().Options.NoSignedZerosFPMath)
43590 return SDValue();
43591
43592 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
43593 // into FMAXC and FMINC, which are commutative operations.
43594 unsigned NewOp = 0;
43595 switch (N->getOpcode()) {
43596 default: llvm_unreachable("unknown opcode");
43597 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
43598 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
43599 }
43600
43601 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
43602 N->getOperand(0), N->getOperand(1));
43603}
43604
43605static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
43606 const X86Subtarget &Subtarget) {
43607 if (Subtarget.useSoftFloat())
43608 return SDValue();
43609
43610 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43611
43612 EVT VT = N->getValueType(0);
43613 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
43614 (Subtarget.hasSSE2() && VT == MVT::f64) ||
43615 (VT.isVector() && TLI.isTypeLegal(VT))))
43616 return SDValue();
43617
43618 SDValue Op0 = N->getOperand(0);
43619 SDValue Op1 = N->getOperand(1);
43620 SDLoc DL(N);
43621 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
43622
43623 // If we don't have to respect NaN inputs, this is a direct translation to x86
43624 // min/max instructions.
43625 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
43626 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
43627
43628 // If one of the operands is known non-NaN use the native min/max instructions
43629 // with the non-NaN input as second operand.
43630 if (DAG.isKnownNeverNaN(Op1))
43631 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
43632 if (DAG.isKnownNeverNaN(Op0))
43633 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
43634
43635 // If we have to respect NaN inputs, this takes at least 3 instructions.
43636 // Favor a library call when operating on a scalar and minimizing code size.
43637 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
43638 return SDValue();
43639
43640 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
43641 VT);
43642
43643 // There are 4 possibilities involving NaN inputs, and these are the required
43644 // outputs:
43645 // Op1
43646 // Num NaN
43647 // ----------------
43648 // Num | Max | Op0 |
43649 // Op0 ----------------
43650 // NaN | Op1 | NaN |
43651 // ----------------
43652 //
43653 // The SSE FP max/min instructions were not designed for this case, but rather
43654 // to implement:
43655 // Min = Op1 < Op0 ? Op1 : Op0
43656 // Max = Op1 > Op0 ? Op1 : Op0
43657 //
43658 // So they always return Op0 if either input is a NaN. However, we can still
43659 // use those instructions for fmaxnum by selecting away a NaN input.
43660
43661 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
43662 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
43663 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
43664
43665 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
43666 // are NaN, the NaN value of Op1 is the result.
43667 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
43668}
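
A behavioral sketch (standalone C++, not the actual lowering; sse_max is a simplified stand-in for the SSE max semantics described in the comments above) showing how the operand swap plus the NaN select reproduces fmaxnum for every row of the Num/NaN table:

#include <cassert>
#include <cmath>

static double sse_max(double A, double B) {
  // Modeled maxsd behavior: the second source is returned whenever the
  // greater-than comparison fails, including when either input is NaN.
  return A > B ? A : B;
}

static double fmaxnum_lowered(double Op0, double Op1) {
  double MinOrMax = sse_max(Op1, Op0);     // NaN in either input yields Op0
  return std::isnan(Op0) ? Op1 : MinOrMax; // select Op1 when Op0 is NaN
}

int main() {
  assert(fmaxnum_lowered(1.0, 2.0) == 2.0);          // Num/Num -> max
  assert(fmaxnum_lowered(2.0, std::nan("")) == 2.0); // NaN Op1 -> Op0
  assert(fmaxnum_lowered(std::nan(""), 2.0) == 2.0); // NaN Op0 -> Op1
  return 0;
}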
43669
43670static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
43671 TargetLowering::DAGCombinerInfo &DCI) {
43672 EVT VT = N->getValueType(0);
43673 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43674
43675 APInt KnownUndef, KnownZero;
43676 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
43677 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
43678 KnownZero, DCI))
43679 return SDValue(N, 0);
43680
43681 // Convert a full vector load into vzload when not all bits are needed.
43682 SDValue In = N->getOperand(0);
43683 MVT InVT = In.getSimpleValueType();
43684 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
43685 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
43686 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
43687 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
43688 // Unless the load is volatile or atomic.
43689 if (LN->isSimple()) {
43690 SDLoc dl(N);
43691 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
43692 MVT MemVT = MVT::getIntegerVT(NumBits);
43693 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
43694 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
43695 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
43696 SDValue VZLoad =
43697 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
43698 LN->getPointerInfo(),
43699 LN->getAlignment(),
43700 LN->getMemOperand()->getFlags());
43701 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
43702 DAG.getBitcast(InVT, VZLoad));
43703 DCI.CombineTo(N, Convert);
43704 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
43705 return SDValue(N, 0);
43706 }
43707 }
43708
43709 return SDValue();
43710}
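// [Illustration, not part of the source file] The vzload narrowing above, for
// an example X86ISD::CVTSI2P that converts only the low half of a 128-bit load
// (types chosen for illustration):
//   %v   = load <4 x i32>, %p                 // normal load, single use
//   %cvt = X86ISD::CVTSI2P %v  ->  v2f64      // only the low 2 x i32 are used
// becomes roughly:
//   %vz  = X86ISD::VZEXT_LOAD %p, i64 mem     // load just the needed 64 bits
//   %cvt = X86ISD::CVTSI2P (bitcast %vz to v4i32)
// with users of the old load's chain rerouted to the new VZEXT_LOAD.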
43711
43712static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
43713 TargetLowering::DAGCombinerInfo &DCI) {
43714 // FIXME: Handle strict fp nodes.
43715 EVT VT = N->getValueType(0);
43716
43717 // Convert a full vector load into vzload when not all bits are needed.
43718 SDValue In = N->getOperand(0);
43719 MVT InVT = In.getSimpleValueType();
43720 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
43721 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
43722 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
43723 LoadSDNode *LN = cast<LoadSDNode>(In);
43724 // Unless the load is volatile or atomic.
43725 if (LN->isSimple()) {
43726 SDLoc dl(N);
43727 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
43728 MVT MemVT = MVT::getFloatingPointVT(NumBits);
43729 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
43730 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
43731 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
43732 SDValue VZLoad =
43733 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
43734 LN->getPointerInfo(),
43735 LN->getAlignment(),
43736 LN->getMemOperand()->getFlags());
43737 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
43738 DAG.getBitcast(InVT, VZLoad));
43739 DCI.CombineTo(N, Convert);
43740 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
43741 return SDValue(N, 0);
43742 }
43743 }
43744
43745 return SDValue();
43746}
43747
43748/// Do target-specific dag combines on X86ISD::ANDNP nodes.
43749static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
43750 TargetLowering::DAGCombinerInfo &DCI,
43751 const X86Subtarget &Subtarget) {
43752 MVT VT = N->getSimpleValueType(0);
43753
43754 // ANDNP(0, x) -> x
43755 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
43756 return N->getOperand(1);
43757
43758 // ANDNP(x, 0) -> 0
43759 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
43760 return DAG.getConstant(0, SDLoc(N), VT);
43761
43762 // Turn ANDNP back to AND if input is inverted.
43763 if (SDValue Not = IsNOT(N->getOperand(0), DAG))
43764 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
43765 N->getOperand(1));
43766
43767 // Attempt to recursively combine a bitmask ANDNP with shuffles.
43768 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
43769 SDValue Op(N, 0);
43770 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43771 return Res;
43772 }
43773
43774 return SDValue();
43775}
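// [Illustration, not part of the source file] The ANDNP folds above written
// out, using ANDNP(a, b) == (~a & b):
//   ANDNP(0, x)      -> x           // ~0 & x  == x
//   ANDNP(x, 0)      -> 0           // ~x & 0  == 0
//   ANDNP(NOT(y), x) -> AND(y, x)   // ~(~y) & x == y & x
// plus a recursive shuffle combine when the ANDNP acts as a constant bitmask.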
43776
43777static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
43778 TargetLowering::DAGCombinerInfo &DCI) {
43779 SDValue N1 = N->getOperand(1);
43780
43781 // BT ignores high bits in the bit index operand.
43782 unsigned BitWidth = N1.getValueSizeInBits();
43783 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
43784 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
43785 DCI.AddToWorklist(N);
43786 return SDValue(N, 0);
43787 }
43788
43789 return SDValue();
43790}
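// [Illustration, not part of the source file] Why only the low log2(width)
// bits of the BT index are demanded: the bit-test uses the index modulo the
// operand width, so for a 32-bit BT
//   BT x, (and idx, 31)   ==   BT x, idx
// and SimplifyDemandedBits can strip the explicit masking of the index.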
43791
43792static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
43793 TargetLowering::DAGCombinerInfo &DCI) {
43794 SDValue Src = N->getOperand(0);
43795
43796 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
43797 APInt KnownUndef, KnownZero;
43798 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43799 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
43800 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
43801 DCI)) {
43802 DCI.AddToWorklist(N);
43803 return SDValue(N, 0);
43804 }
43805
43806 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
43807 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
43808 // Unless the load is volatile or atomic.
43809 if (LN->isSimple()) {
43810 SDLoc dl(N);
43811 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
43812 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
43813 SDValue VZLoad =
43814 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MVT::i64,
43815 LN->getPointerInfo(),
43816 LN->getAlignment(),
43817 LN->getMemOperand()->getFlags());
43818 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
43819 DAG.getBitcast(MVT::v8i16, VZLoad));
43820 DCI.CombineTo(N, Convert);
43821 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
43822 return SDValue(N, 0);
43823 }
43824 }
43825 }
43826
43827 return SDValue();
43828}
43829
43830// Try to combine sext_in_reg of a cmov of constants by extending the constants.
43831static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
43832 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
43833
43834 EVT DstVT = N->getValueType(0);
43835
43836 SDValue N0 = N->getOperand(0);
43837 SDValue N1 = N->getOperand(1);
43838 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
43839
43840 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
43841 return SDValue();
43842
43843 // Look through single use any_extends / truncs.
43844 SDValue IntermediateBitwidthOp;
43845 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
43846 N0.hasOneUse()) {
43847 IntermediateBitwidthOp = N0;
43848 N0 = N0.getOperand(0);
43849 }
43850
43851 // See if we have a single use cmov.
43852 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
43853 return SDValue();
43854
43855 SDValue CMovOp0 = N0.getOperand(0);
43856 SDValue CMovOp1 = N0.getOperand(1);
43857
43858 // Make sure both operands are constants.
43859 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
43860 !isa<ConstantSDNode>(CMovOp1.getNode()))
43861 return SDValue();
43862
43863 SDLoc DL(N);
43864
43865 // If we looked through an any_extend/trunc above, apply it to the constants too.
43866 if (IntermediateBitwidthOp) {
43867 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
43868 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
43869 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
43870 }
43871
43872 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
43873 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
43874
43875 EVT CMovVT = DstVT;
43876 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
43877 if (DstVT == MVT::i16) {
43878 CMovVT = MVT::i32;
43879 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
43880 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
43881 }
43882
43883 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
43884 N0.getOperand(2), N0.getOperand(3));
43885
43886 if (CMovVT != DstVT)
43887 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
43888
43889 return CMov;
43890}
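// [Illustration, not part of the source file] Example of the fold above for a
// sign_extend_inreg from i8 of an i32 CMOV of constants (values are example
// choices):
//   t = X86ISD::CMOV 200, 100, cond, eflags
//   r = sign_extend_inreg t, i8
// becomes
//   r = X86ISD::CMOV -56, 100, cond, eflags   // constants pre-extended
// since 200 truncated to i8 is 0xC8, which sign-extends to -56. For an i16
// result the CMOV is additionally widened to i32 and truncated afterwards.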
43891
43892static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
43893 const X86Subtarget &Subtarget) {
43894 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
43895
43896 if (SDValue V = combineSextInRegCmov(N, DAG))
43897 return V;
43898
43899 EVT VT = N->getValueType(0);
43900 SDValue N0 = N->getOperand(0);
43901 SDValue N1 = N->getOperand(1);
43902 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
43903 SDLoc dl(N);
43904
43905 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
43906 // both SSE and AVX2 since there is no sign-extended shift right
43907 // operation on a vector with 64-bit elements.
43908 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
43909 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
43910 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
43911 N0.getOpcode() == ISD::SIGN_EXTEND)) {
43912 SDValue N00 = N0.getOperand(0);
43913
43914 // EXTLOAD has a better solution on AVX2:
43915 // it may be replaced with an X86ISD::VSEXT node.
43916 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
43917 if (!ISD::isNormalLoad(N00.getNode()))
43918 return SDValue();
43919
43920 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
43921 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
43922 N00, N1);
43923 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
43924 }
43925 }
43926 return SDValue();
43927}
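// [Illustration, not part of the source file] The v4i64 sext_in_reg split
// above, e.g. with ExtraVT = v4i8:
//   t = any_extend v4i32 %x to v4i64
//   r = sign_extend_inreg t, v4i8
// becomes
//   s = sign_extend_inreg v4i32 %x, v4i8   // 32-bit shifts, cheap on SSE/AVX2
//   r = sign_extend s to v4i64
// avoiding a sign-extending right shift on 64-bit vector elements, which
// neither SSE nor AVX2 provides directly.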
43928
43929/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
43930/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
43931/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
43932/// opportunities to combine math ops, use an LEA, or use a complex addressing
43933/// mode. This can eliminate extend, add, and shift instructions.
43934static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
43935 const X86Subtarget &Subtarget) {
43936 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
43937 Ext->getOpcode() != ISD::ZERO_EXTEND)
43938 return SDValue();
43939
43940 // TODO: This should be valid for other integer types.
43941 EVT VT = Ext->getValueType(0);
43942 if (VT != MVT::i64)
43943 return SDValue();
43944
43945 SDValue Add = Ext->getOperand(0);
43946 if (Add.getOpcode() != ISD::ADD)
43947 return SDValue();
43948
43949 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
43950 bool NSW = Add->getFlags().hasNoSignedWrap();
43951 bool NUW = Add->getFlags().hasNoUnsignedWrap();
43952
43953 // We need an 'add nsw' feeding into the 'sext' or an 'add nuw' feeding
43954 // into the 'zext'.
43955 if ((Sext && !NSW) || (!Sext && !NUW))
43956 return SDValue();
43957
43958 // Having a constant operand to the 'add' ensures that we are not increasing
43959 // the instruction count because the constant is extended for free below.
43960 // A constant operand can also become the displacement field of an LEA.
43961 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
43962 if (!AddOp1)
43963 return SDValue();
43964
43965 // Don't make the 'add' bigger if there's no hope of combining it with some
43966 // other 'add' or 'shl' instruction.
43967 // TODO: It may be profitable to generate simpler LEA instructions in place
43968 // of single 'add' instructions, but the cost model for selecting an LEA
43969 // currently has a high threshold.
43970 bool HasLEAPotential = false;
43971 for (auto *User : Ext->uses()) {
43972 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
43973 HasLEAPotential = true;
43974 break;
43975 }
43976 }
43977 if (!HasLEAPotential)
43978 return SDValue();
43979
43980 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
43981 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
43982 SDValue AddOp0 = Add.getOperand(0);
43983 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
43984 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
43985
43986 // The wider add is guaranteed to not wrap because both operands are
43987 // sign-extended.
43988 SDNodeFlags Flags;
43989 Flags.setNoSignedWrap(NSW);
43990 Flags.setNoUnsignedWrap(NUW);
43991 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
43992}
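// [Illustration, not part of the source file] The ext/add reordering above,
// for a sign extension whose result feeds address arithmetic (names are
// example choices):
//   t = add nsw i32 %x, 5
//   a = sext t to i64
//   p = add i64 %base, a
// becomes
//   a = sext i32 %x to i64
//   t = add nsw i64 a, 5          // the constant is re-extended for free
//   p = add i64 %base, t
// which lets the inner add and its constant fold into an LEA or a complex
// addressing mode, as the header comment describes.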
43993
43994// If an {ANY,SIGN,ZERO}_EXTEND is applied to a CMOV with constant operands
43995// and the result of the CMOV is not used anywhere else, promote the CMOV
43996// itself instead of promoting its result. This could be beneficial, because:
43997// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
43998// (or more) pseudo-CMOVs only when they go one-after-another and
43999// getting rid of result extension code after CMOV will help that.
44000// 2) Promotion of constant CMOV arguments is free, hence the
44001// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
44002// 3) A 16-bit CMOV encoding is 4 bytes and a 32-bit CMOV is 3 bytes, so this
44003// promotion is also good in terms of code-size.
44004// (A 64-bit CMOV is 4 bytes, which is why we don't do a 32-bit => 64-bit
44005// promotion.)
44006static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
44007 SDValue CMovN = Extend->getOperand(0);
44008 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
44009 return SDValue();
44010
44011 EVT TargetVT = Extend->getValueType(0);
44012 unsigned ExtendOpcode = Extend->getOpcode();
44013 SDLoc DL(Extend);
44014
44015 EVT VT = CMovN.getValueType();
44016 SDValue CMovOp0 = CMovN.getOperand(0);
44017 SDValue CMovOp1 = CMovN.getOperand(1);
44018
44019 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
44020 !isa<ConstantSDNode>(CMovOp1.getNode()))
44021 return SDValue();
44022
44023 // Only extend to i32 or i64.
44024 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
44025 return SDValue();
44026
44027 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
44028 // are free.
44029 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
44030 return SDValue();
44031
44032 // If this is a zero extend to i64, we should only extend to i32 and use a free
44033 // zero extend to finish.
44034 EVT ExtendVT = TargetVT;
44035 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
44036 ExtendVT = MVT::i32;
44037
44038 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
44039 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
44040
44041 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
44042 CMovN.getOperand(2), CMovN.getOperand(3));
44043
44044 // Finish extending if needed.
44045 if (ExtendVT != TargetVT)
44046 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
44047
44048 return Res;
44049}
44050
44051// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
44052// This is more or less the reverse of combineBitcastvxi1.
44053static SDValue
44054combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
44055 TargetLowering::DAGCombinerInfo &DCI,
44056 const X86Subtarget &Subtarget) {
44057 unsigned Opcode = N->getOpcode();
44058 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
44059 Opcode != ISD::ANY_EXTEND)
44060 return SDValue();
44061 if (!DCI.isBeforeLegalizeOps())
44062 return SDValue();
44063 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
44064 return SDValue();
44065
44066 SDValue N0 = N->getOperand(0);
44067 EVT VT = N->getValueType(0);
44068 EVT SVT = VT.getScalarType();
44069 EVT InSVT = N0.getValueType().getScalarType();
44070 unsigned EltSizeInBits = SVT.getSizeInBits();
44071
44072 // Input type must be extending a bool vector (bit-casted from a scalar
44073 // integer) to legal integer types.
44074 if (!VT.isVector())
44075 return SDValue();
44076 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
44077 return SDValue();
44078 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
44079 return SDValue();
44080
44081 SDValue N00 = N0.getOperand(0);
44082 EVT SclVT = N0.getOperand(0).getValueType();
44083 if (!SclVT.isScalarInteger())
44084 return SDValue();
44085
44086 SDLoc DL(N);
44087 SDValue Vec;
44088 SmallVector<int, 32> ShuffleMask;
44089 unsigned NumElts = VT.getVectorNumElements();
44090 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
44091
44092 // Broadcast the scalar integer to the vector elements.
44093 if (NumElts > EltSizeInBits) {
44094 // If the scalar integer is greater than the vector element size, then we
44095 // must split it down into sub-sections for broadcasting. For example:
44096 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
44097 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
44098 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
44099 unsigned Scale = NumElts / EltSizeInBits;
44100 EVT BroadcastVT =
44101 EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
44102 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
44103 Vec = DAG.getBitcast(VT, Vec);
44104
44105 for (unsigned i = 0; i != Scale; ++i)
44106 ShuffleMask.append(EltSizeInBits, i);
44107 } else {
44108 // For a smaller scalar integer, we can simply any-extend it to the vector
44109 // element size (we don't care about the upper bits) and broadcast it to all
44110 // elements.
44111 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
44112 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
44113 ShuffleMask.append(NumElts, 0);
44114 }
44115 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
44116
44117 // Now, mask the relevant bit in each element.
44118 SmallVector<SDValue, 32> Bits;
44119 for (unsigned i = 0; i != NumElts; ++i) {
44120 int BitIdx = (i % EltSizeInBits);
44121 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
44122 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
44123 }
44124 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
44125 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
44126
44127 // Compare against the bitmask and extend the result.
44128 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
44129 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
44130 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
44131
44132 // For SEXT, this is now done, otherwise shift the result down for
44133 // zero-extension.
44134 if (Opcode == ISD::SIGN_EXTEND)
44135 return Vec;
44136 return DAG.getNode(ISD::SRL, DL, VT, Vec,
44137 DAG.getConstant(EltSizeInBits - 1, DL, VT));
44138}
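// [Illustration, not part of the source file] For the bool-vector extension
// above, e.g. zext (v8i1 bitcast (i8 %mask)) to v8i16, the sequence built is
// roughly:
//   vec  = broadcast %mask to all 8 lanes   // scalar_to_vector + splat shuffle
//   bits = <1, 2, 4, 8, 16, 32, 64, 128>    // per-lane bit selectors
//   sel  = and vec, bits
//   res  = sext (setcc sel, bits, seteq)    // lane i = all-ones iff bit i set
// followed, for the zero-extend case, by a per-lane logical shift right by 15
// to turn the all-ones lanes into 1.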
44139
44140// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
44141// result type.
44142static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
44143 const X86Subtarget &Subtarget) {
44144 SDValue N0 = N->getOperand(0);
44145 EVT VT = N->getValueType(0);
44146 SDLoc dl(N);
44147
44148 // Only do this combine with AVX512 for vector extends.
44149 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
44150 return SDValue();
44151
44152 // Only combine legal element types.
44153 EVT SVT = VT.getVectorElementType();
44154 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
44155 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
44156 return SDValue();
44157
44158 // We can only do this if the vector size is 256 bits or less.
44159 unsigned Size = VT.getSizeInBits();
44160 if (Size > 256)
44161 return SDValue();
44162
44163 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
44164 // those are the only integer compares we have.
44165 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
44166 if (ISD::isUnsignedIntSetCC(CC))
44167 return SDValue();
44168
44169 // Only do this combine if the extension will be fully consumed by the setcc.
44170 EVT N00VT = N0.getOperand(0).getValueType();
44171 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
44172 if (Size != MatchingVecType.getSizeInBits())
44173 return SDValue();
44174
44175 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
44176
44177 if (N->getOpcode() == ISD::ZERO_EXTEND)
44178 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType());
44179
44180 return Res;
44181}
44182
44183static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
44184 TargetLowering::DAGCombinerInfo &DCI,
44185 const X86Subtarget &Subtarget) {
44186 SDValue N0 = N->getOperand(0);
44187 EVT VT = N->getValueType(0);
44188 EVT InVT = N0.getValueType();
44189 SDLoc DL(N);
44190
44191 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
44192 if (!DCI.isBeforeLegalizeOps() &&
44193 N0.getOpcode() == X86ISD::SETCC_CARRY) {
44194 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
44195 N0->getOperand(1));
44196 bool ReplaceOtherUses = !N0.hasOneUse();
44197 DCI.CombineTo(N, Setcc);
44198 // Replace other uses with a truncate of the widened setcc_carry.
44199 if (ReplaceOtherUses) {
44200 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
44201 N0.getValueType(), Setcc);
44202 DCI.CombineTo(N0.getNode(), Trunc);
44203 }
44204
44205 return SDValue(N, 0);
44206 }
44207
44208 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
44209 return NewCMov;
44210
44211 if (!DCI.isBeforeLegalizeOps())
44212 return SDValue();
44213
44214 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
44215 return V;
44216
44217 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
44218 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
44219 // Inverting and sign-extending a boolean is the same as zero-extending and
44220 // subtracting 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
44221 // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
44222 // sext (xor Bool, -1) --> sub (zext Bool), 1
44223 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
44224 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
44225 }
44226
44227 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
44228 return V;
44229
44230 if (VT.isVector())
44231 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
44232 return R;
44233
44234 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
44235 return NewAdd;
44236
44237 return SDValue();
44238}
44239
44240static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
44241 TargetLowering::DAGCombinerInfo &DCI,
44242 const X86Subtarget &Subtarget) {
44243 SDLoc dl(N);
44244 EVT VT = N->getValueType(0);
44245 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
44246
44247 // Let legalize expand this if it isn't a legal type yet.
44248 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44249 if (!TLI.isTypeLegal(VT))
44250 return SDValue();
44251
44252 EVT ScalarVT = VT.getScalarType();
44253 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
44254 return SDValue();
44255
44256 SDValue A = N->getOperand(IsStrict ? 1 : 0);
44257 SDValue B = N->getOperand(IsStrict ? 2 : 1);
44258 SDValue C = N->getOperand(IsStrict ? 3 : 2);
44259
44260 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
44261 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
44262 bool LegalOperations = !DCI.isBeforeLegalizeOps();
44263 if (TLI.getNegatibleCost(V, DAG, LegalOperations, CodeSize) ==
44264 TargetLowering::NegatibleCost::Cheaper) {
44265 V = TLI.getNegatedExpression(V, DAG, LegalOperations, CodeSize);
44266 return true;
44267 }
44268 // Look through extract_vector_elts. If it comes from an FNEG, create a
44269 // new extract from the FNEG input.
44270 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44271 isNullConstant(V.getOperand(1))) {
44272 SDValue Vec = V.getOperand(0);
44273 if (TLI.getNegatibleCost(Vec, DAG, LegalOperations, CodeSize) ==
44274 TargetLowering::NegatibleCost::Cheaper) {
44275 SDValue NegVal =
44276 TLI.getNegatedExpression(Vec, DAG, LegalOperations, CodeSize);
44277 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
44278 NegVal, V.getOperand(1));
44279 return true;
44280 }
44281 }
44282
44283 return false;
44284 };
44285
44286 // Do not convert the passthru input of scalar intrinsics.
44287 // FIXME: We could allow negations of the lower element only.
44288 bool NegA = invertIfNegative(A);
44289 bool NegB = invertIfNegative(B);
44290 bool NegC = invertIfNegative(C);
44291
44292 if (!NegA && !NegB && !NegC)
44293 return SDValue();
44294
44295 unsigned NewOpcode =
44296 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
44297
44298 if (IsStrict) {
44299 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
44300 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
44301 {N->getOperand(0), A, B, C});
44302 } else {
44303 if (N->getNumOperands() == 4)
44304 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
44305 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
44306 }
44307}
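// [Illustration, not part of the source file] The FMA negation folding above
// rewrites, for example:
//   fma (fneg %a), %b, %c          ->  X86ISD::FNMADD %a, %b, %c
//   fma %a, %b, (fneg %c)          ->  X86ISD::FMSUB  %a, %b, %c
//   fma (fneg %a), %b, (fneg %c)   ->  X86ISD::FNMSUB %a, %b, %c
// but only when getNegatibleCost reports the negated form as strictly cheaper,
// so no extra instructions are created to materialize the negations.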
44308
44309// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
44310// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
44311static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
44312 TargetLowering::DAGCombinerInfo &DCI) {
44313 SDLoc dl(N);
44314 EVT VT = N->getValueType(0);
44315 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44316 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
44317 bool LegalOperations = !DCI.isBeforeLegalizeOps();
44318
44319 SDValue N2 = N->getOperand(2);
44320 if (TLI.getNegatibleCost(N2, DAG, LegalOperations, CodeSize) !=
44321 TargetLowering::NegatibleCost::Cheaper)
44322 return SDValue();
44323
44324 SDValue NegN2 = TLI.getNegatedExpression(N2, DAG, LegalOperations, CodeSize);
44325 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
44326
44327 if (N->getNumOperands() == 4)
44328 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
44329 NegN2, N->getOperand(3));
44330 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
44331 NegN2);
44332}
44333
44334static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
44335 TargetLowering::DAGCombinerInfo &DCI,
44336 const X86Subtarget &Subtarget) {
44337 SDLoc dl(N);
44338 SDValue N0 = N->getOperand(0);
44339 EVT VT = N->getValueType(0);
44340
44341 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
44342 // FIXME: Is this needed? We don't seem to have any tests for it.
44343 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
44344 N0.getOpcode() == X86ISD::SETCC_CARRY) {
44345 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
44346 N0->getOperand(1));
44347 bool ReplaceOtherUses = !N0.hasOneUse();
44348 DCI.CombineTo(N, Setcc);
44349 // Replace other uses with a truncate of the widened setcc_carry.
44350 if (ReplaceOtherUses) {
44351 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
44352 N0.getValueType(), Setcc);
44353 DCI.CombineTo(N0.getNode(), Trunc);
44354 }
44355
44356 return SDValue(N, 0);
44357 }
44358
44359 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
44360 return NewCMov;
44361
44362 if (DCI.isBeforeLegalizeOps())
44363 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
44364 return V;
44365
44366 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
44367 return V;
44368
44369 if (VT.isVector())
44370 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
44371 return R;
44372
44373 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
44374 return NewAdd;
44375
44376 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
44377 return R;
44378
44379 // TODO: Combine with any target/faux shuffle.
44380 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
44381 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
44382 SDValue N00 = N0.getOperand(0);
44383 SDValue N01 = N0.getOperand(1);
44384 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
44385 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
44386 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
44387 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
44388 return concatSubVectors(N00, N01, DAG, dl);
44389 }
44390 }
44391
44392 return SDValue();
44393}
44394
44395/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
44396/// recognizable memcmp expansion.
44397static bool isOrXorXorTree(SDValue X, bool Root = true) {
44398 if (X.getOpcode() == ISD::OR)
44399 return isOrXorXorTree(X.getOperand(0), false) &&
44400 isOrXorXorTree(X.getOperand(1), false);
44401 if (Root)
44402 return false;
44403 return X.getOpcode() == ISD::XOR;
44404}
44405
44406/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
44407/// expansion.
44408template<typename F>
44409static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
44410 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
44411 SDValue Op0 = X.getOperand(0);
44412 SDValue Op1 = X.getOperand(1);
44413 if (X.getOpcode() == ISD::OR) {
44414 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
44415 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
44416 if (VecVT != CmpVT)
44417 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
44418 if (HasPT)
44419 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
44420 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
44421 } else if (X.getOpcode() == ISD::XOR) {
44422 SDValue A = SToV(Op0);
44423 SDValue B = SToV(Op1);
44424 if (VecVT != CmpVT)
44425 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
44426 if (HasPT)
44427 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
44428 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
44429 }
44430 llvm_unreachable("Impossible");
44431}
44432
44433/// Try to map a 128-bit or larger integer comparison to vector instructions
44434/// before type legalization splits it up into chunks.
44435static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
44436 const X86Subtarget &Subtarget) {
44437 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
44438 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
44439
44440 // We're looking for an oversized integer equality comparison.
44441 SDValue X = SetCC->getOperand(0);
44442 SDValue Y = SetCC->getOperand(1);
44443 EVT OpVT = X.getValueType();
44444 unsigned OpSize = OpVT.getSizeInBits();
44445 if (!OpVT.isScalarInteger() || OpSize < 128)
44446 return SDValue();
44447
44448 // Ignore a comparison with zero because that gets special treatment in
44449 // EmitTest(). But make an exception for the special case of a pair of
44450 // logically-combined vector-sized operands compared to zero. This pattern may
44451 // be generated by the memcmp expansion pass with oversized integer compares
44452 // (see PR33325).
44453 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
44454 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
44455 return SDValue();
44456
44457 // Don't perform this combine if constructing the vector will be expensive.
44458 auto IsVectorBitCastCheap = [](SDValue X) {
44459 X = peekThroughBitcasts(X);
44460 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
44461 X.getOpcode() == ISD::LOAD;
44462 };
44463 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
44464 !IsOrXorXorTreeCCZero)
44465 return SDValue();
44466
44467 EVT VT = SetCC->getValueType(0);
44468 SDLoc DL(SetCC);
44469 bool HasAVX = Subtarget.hasAVX();
44470
44471 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
44472 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
44473 // Otherwise use PCMPEQ (plus AND) and mask testing.
44474 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
44475 (OpSize == 256 && HasAVX) ||
44476 (OpSize == 512 && Subtarget.useAVX512Regs())) {
44477 bool HasPT = Subtarget.hasSSE41();
44478
44479 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and widened
44480 // vector registers are essentially free. (Technically, widening registers
44481 // prevents load folding, but the tradeoff is worth it.)
44482 bool PreferKOT = Subtarget.preferMaskRegisters();
44483 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
44484
44485 EVT VecVT = MVT::v16i8;
44486 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
44487 if (OpSize == 256) {
44488 VecVT = MVT::v32i8;
44489 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
44490 }
44491 EVT CastVT = VecVT;
44492 bool NeedsAVX512FCast = false;
44493 if (OpSize == 512 || NeedZExt) {
44494 if (Subtarget.hasBWI()) {
44495 VecVT = MVT::v64i8;
44496 CmpVT = MVT::v64i1;
44497 if (OpSize == 512)
44498 CastVT = VecVT;
44499 } else {
44500 VecVT = MVT::v16i32;
44501 CmpVT = MVT::v16i1;
44502 CastVT = OpSize == 512 ? VecVT :
44503 OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
44504 NeedsAVX512FCast = true;
44505 }
44506 }
44507
44508 auto ScalarToVector = [&](SDValue X) -> SDValue {
44509 bool TmpZext = false;
44510 EVT TmpCastVT = CastVT;
44511 if (X.getOpcode() == ISD::ZERO_EXTEND) {
44512 SDValue OrigX = X.getOperand(0);
44513 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
44514 if (OrigSize < OpSize) {
44515 if (OrigSize == 128) {
44516 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
44517 X = OrigX;
44518 TmpZext = true;
44519 } else if (OrigSize == 256) {
44520 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
44521 X = OrigX;
44522 TmpZext = true;
44523 }
44524 }
44525 }
44526 X = DAG.getBitcast(TmpCastVT, X);
44527 if (!NeedZExt && !TmpZext)
44528 return X;
44529 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
44530 DAG.getConstant(0, DL, VecVT), X,
44531 DAG.getVectorIdxConstant(0, DL));
44532 };
44533
44534 SDValue Cmp;
44535 if (IsOrXorXorTreeCCZero) {
44536 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
44537 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
44538 // Use 2 vector equality compares and 'and' the results before doing a
44539 // MOVMSK.
44540 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
44541 } else {
44542 SDValue VecX = ScalarToVector(X);
44543 SDValue VecY = ScalarToVector(Y);
44544 if (VecVT != CmpVT) {
44545 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
44546 } else if (HasPT) {
44547 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
44548 } else {
44549 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
44550 }
44551 }
44552 // AVX512 should emit a setcc that will lower to kortest.
44553 if (VecVT != CmpVT) {
44554 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
44555 CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
44556 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
44557 DAG.getConstant(0, DL, KRegVT), CC);
44558 }
44559 if (HasPT) {
44560 SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
44561 Cmp);
44562 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
44563 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
44564 SDValue SetCC = getSETCC(X86CC, PT, DL, DAG);
44565 return DAG.getNode(ISD::TRUNCATE, DL, VT, SetCC.getValue(0));
44566 }
44567 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
44568 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
44569 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
44570 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
44571 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
44572 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
44573 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
44574 MVT::i32);
44575 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
44576 }
44577
44578 return SDValue();
44579}
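// [Illustration, not part of the source file] Typical result of the combine
// above for a 128-bit equality test on an SSE2-only target (the MOVMSK path):
//   setcc i128 %x, %y, eq
// becomes roughly:
//   vx  = bitcast %x to v16i8
//   vy  = bitcast %y to v16i8
//   cmp = pcmpeqb vx, vy
//   msk = X86ISD::MOVMSK cmp           // i32 bitmask, one bit per byte
//   res = setcc msk, 0xFFFF, eq        // all 16 bytes matched
// With SSE4.1 the XOR + PTEST path is used instead, and with AVX512 a mask
// compare followed by KORTEST.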
44580
44581static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
44582 const X86Subtarget &Subtarget) {
44583 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
44584 const SDValue LHS = N->getOperand(0);
44585 const SDValue RHS = N->getOperand(1);
44586 EVT VT = N->getValueType(0);
44587 EVT OpVT = LHS.getValueType();
44588 SDLoc DL(N);
44589
44590 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
44591 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
44592 return V;
44593 }
44594
44595 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44596 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
44597 // Using temporaries to avoid messing up operand ordering for later
44598 // transformations if this doesn't work.
44599 SDValue Op0 = LHS;
44600 SDValue Op1 = RHS;
44601 ISD::CondCode TmpCC = CC;
44602 // Put build_vector on the right.
44603 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
44604 std::swap(Op0, Op1);
44605 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
44606 }
44607
44608 bool IsSEXT0 =
44609 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
44610 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
44611 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
44612
44613 if (IsSEXT0 && IsVZero1) {
44614 assert(VT == Op0.getOperand(0).getValueType() &&
44615 "Uexpected operand type");
44616 if (TmpCC == ISD::SETGT)
44617 return DAG.getConstant(0, DL, VT);
44618 if (TmpCC == ISD::SETLE)
44619 return DAG.getConstant(1, DL, VT);
44620 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
44621 return DAG.getNOT(DL, Op0.getOperand(0), VT);
44622
44623 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
44624 "Unexpected condition code!");
44625 return Op0.getOperand(0);
44626 }
44627 }
44628
44629 // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
44630 // pre-promote its result type since vXi1 vectors don't get promoted
44631 // during type legalization.
44632 // NOTE: The element count check is to ignore operand types that need to
44633 // go through type promotion to a 128-bit vector.
44634 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
44635 VT.getVectorElementType() == MVT::i1 &&
44636 (OpVT.getVectorElementType() == MVT::i8 ||
44637 OpVT.getVectorElementType() == MVT::i16)) {
44638 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
44639 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
44640 }
44641
44642 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
44643 // to avoid scalarization via legalization because v4i32 is not a legal type.
44644 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
44645 LHS.getValueType() == MVT::v4f32)
44646 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
44647
44648 return SDValue();
44649}
44650
44651static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
44652 TargetLowering::DAGCombinerInfo &DCI,
44653 const X86Subtarget &Subtarget) {
44654 SDValue Src = N->getOperand(0);
44655 MVT SrcVT = Src.getSimpleValueType();
44656 MVT VT = N->getSimpleValueType(0);
44657 unsigned NumBits = VT.getScalarSizeInBits();
44658 unsigned NumElts = SrcVT.getVectorNumElements();
44659
44660 // Perform constant folding.
44661 if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
44662 assert(VT == MVT::i32 && "Unexpected result type");
44663 APInt Imm(32, 0);
44664 for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
44665 if (!Src.getOperand(Idx).isUndef() &&
44666 Src.getConstantOperandAPInt(Idx).isNegative())
44667 Imm.setBit(Idx);
44668 }
44669 return DAG.getConstant(Imm, SDLoc(N), VT);
44670 }
44671
44672 // Look through int->fp bitcasts that don't change the element width.
44673 unsigned EltWidth = SrcVT.getScalarSizeInBits();
44674 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
44675 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
44676 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
44677
44678 // Fold movmsk(not(x)) -> not(movmsk) to improve folding of movmsk results
44679 // with scalar comparisons.
44680 if (SDValue NotSrc = IsNOT(Src, DAG)) {
44681 SDLoc DL(N);
44682 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
44683 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
44684 return DAG.getNode(ISD::XOR, DL, VT,
44685 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
44686 DAG.getConstant(NotMask, DL, VT));
44687 }
44688
44689 // Simplify the inputs.
44690 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44691 APInt DemandedMask(APInt::getAllOnesValue(NumBits));
44692 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
44693 return SDValue(N, 0);
44694
44695 return SDValue();
44696}
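// [Illustration, not part of the source file] The movmsk(not(x)) fold above,
// for a v4i32 source:
//   m = X86ISD::MOVMSK (xor %x, all-ones)
// becomes
//   m = xor (X86ISD::MOVMSK %x), 0xF    // flip only the 4 valid mask bits
// i.e. the vector-wide NOT is replaced by inverting the NumElts low bits of
// the scalar result, which folds better into scalar comparisons.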
44697
44698static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
44699 TargetLowering::DAGCombinerInfo &DCI) {
44700 // With vector masks we only demand the upper bit of the mask.
44701 SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
44702 if (Mask.getScalarValueSizeInBits() != 1) {
44703 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44704 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
44705 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
44706 DCI.AddToWorklist(N);
44707 return SDValue(N, 0);
44708 }
44709 }
44710
44711 return SDValue();
44712}
44713
44714static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
44715 SDValue Index, SDValue Base, SDValue Scale,
44716 SelectionDAG &DAG) {
44717 SDLoc DL(GorS);
44718
44719 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
44720 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
44721 Gather->getMask(), Base, Index, Scale } ;
44722 return DAG.getMaskedGather(Gather->getVTList(),
44723 Gather->getMemoryVT(), DL, Ops,
44724 Gather->getMemOperand(),
44725 Gather->getIndexType());
44726 }
44727 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
44728 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
44729 Scatter->getMask(), Base, Index, Scale };
44730 return DAG.getMaskedScatter(Scatter->getVTList(),
44731 Scatter->getMemoryVT(), DL,
44732 Ops, Scatter->getMemOperand(),
44733 Scatter->getIndexType());
44734}
44735
44736static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
44737 TargetLowering::DAGCombinerInfo &DCI) {
44738 SDLoc DL(N);
44739 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
44740 SDValue Index = GorS->getIndex();
44741 SDValue Base = GorS->getBasePtr();
44742 SDValue Scale = GorS->getScale();
44743
44744 if (DCI.isBeforeLegalize()) {
44745 unsigned IndexWidth = Index.getScalarValueSizeInBits();
44746
44747 // Shrink constant indices if they are larger than 32-bits.
44748 // Only do this before legalize types since v2i64 could become v2i32.
44749 // FIXME: We could check that the type is legal if we're after legalize
44750 // types, but then we would need to construct test cases where that happens.
44751 // FIXME: We could support more than just constant vectors, but we need to
44752 // careful with costing. A truncate that can be optimized out would be fine.
44753 // Otherwise we might only want to create a truncate if it avoids a split.
44754 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
44755 if (BV->isConstant() && IndexWidth > 32 &&
44756 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
44757 unsigned NumElts = Index.getValueType().getVectorNumElements();
44758 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
44759 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
44760 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
44761 }
44762 }
44763
44764 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
44765 // there are sufficient sign bits. Only do this before legalize types to
44766 // avoid creating illegal types in truncate.
44767 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
44768 Index.getOpcode() == ISD::ZERO_EXTEND) &&
44769 IndexWidth > 32 &&
44770 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
44771 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
44772 unsigned NumElts = Index.getValueType().getVectorNumElements();
44773 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
44774 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
44775 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
44776 }
44777 }
44778
44779 if (DCI.isBeforeLegalizeOps()) {
44780 unsigned IndexWidth = Index.getScalarValueSizeInBits();
44781
44782 // Make sure the index is either i32 or i64
44783 if (IndexWidth != 32 && IndexWidth != 64) {
44784 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
44785 EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
44786 Index.getValueType().getVectorNumElements());
44787 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
44788 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
44789 }
44790 }
44791
44792 // With vector masks we only demand the upper bit of the mask.
44793 SDValue Mask = GorS->getMask();
44794 if (Mask.getScalarValueSizeInBits() != 1) {
44795 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44796 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
44797 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
44798 DCI.AddToWorklist(N);
44799 return SDValue(N, 0);
44800 }
44801 }
44802
44803 return SDValue();
44804}
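// [Illustration, not part of the source file] The index shrinking above, for a
// gather whose 64-bit index is a sign-extended 32-bit value:
//   idx = sign_extend v2i32 %i to v2i64
//   g   = masked_gather ..., base, idx, scale
// becomes
//   idx = truncate (sign_extend %i) to v2i32   // later folds back to %i
//   g   = masked_gather ..., base, idx, scale
// so a 32-bit index form can be used. The mask operand is also reduced to its
// sign bit, since only that bit is consumed by the hardware gather/scatter.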
44805
44806// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
44807static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
44808 const X86Subtarget &Subtarget) {
44809 SDLoc DL(N);
44810 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
44811 SDValue EFLAGS = N->getOperand(1);
44812
44813 // Try to simplify the EFLAGS and condition code operands.
44814 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
44815 return getSETCC(CC, Flags, DL, DAG);
44816
44817 return SDValue();
44818}
44819
44820/// Optimize branch condition evaluation.
44821static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
44822 const X86Subtarget &Subtarget) {
44823 SDLoc DL(N);
44824 SDValue EFLAGS = N->getOperand(3);
44825 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
44826
44827 // Try to simplify the EFLAGS and condition code operands.
44828 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
44829 // RAUW them under us.
44830 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
44831 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
44832 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
44833 N->getOperand(1), Cond, Flags);
44834 }
44835
44836 return SDValue();
44837}
44838
44839static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
44840 SelectionDAG &DAG) {
44841 // Take advantage of vector comparisons producing 0 or -1 in each lane to
44842 // optimize away operation when it's from a constant.
44843 //
44844 // The general transformation is:
44845 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
44846 // AND(VECTOR_CMP(x,y), constant2)
44847 // constant2 = UNARYOP(constant)
44848
44849 // Early exit if this isn't a vector operation, the operand of the
44850 // unary operation isn't a bitwise AND, or if the sizes of the operations
44851 // aren't the same.
44852 EVT VT = N->getValueType(0);
44853 bool IsStrict = N->isStrictFPOpcode();
44854 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
44855 if (!VT.isVector() || Op0->getOpcode() != ISD::AND ||
44856 Op0->getOperand(0)->getOpcode() != ISD::SETCC ||
44857 VT.getSizeInBits() != Op0.getValueSizeInBits())
44858 return SDValue();
44859
44860 // Now check that the other operand of the AND is a constant. We could
44861 // make the transformation for non-constant splats as well, but it's unclear
44862 // that would be a benefit as it would not eliminate any operations, just
44863 // perform one more step in scalar code before moving to the vector unit.
44864 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
44865 // Bail out if the vector isn't a constant.
44866 if (!BV->isConstant())
44867 return SDValue();
44868
44869 // Everything checks out. Build up the new and improved node.
44870 SDLoc DL(N);
44871 EVT IntVT = BV->getValueType(0);
44872 // Create a new constant of the appropriate type for the transformed
44873 // DAG.
44874 SDValue SourceConst;
44875 if (IsStrict)
44876 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
44877 {N->getOperand(0), SDValue(BV, 0)});
44878 else
44879 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
44880 // The AND node needs bitcasts to/from an integer vector type around it.
44881 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
44882 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
44883 MaskConst);
44884 SDValue Res = DAG.getBitcast(VT, NewAnd);
44885 if (IsStrict)
44886 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
44887 return Res;
44888 }
44889
44890 return SDValue();
44891}
44892
44893/// If we are converting a value to floating-point, try to replace scalar
44894/// truncate of an extracted vector element with a bitcast. This tries to keep
44895/// the sequence on XMM registers rather than moving between vector and GPRs.
44896static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
44897 // TODO: This is currently only used by combineSIntToFP, but it is generalized
44898 // to allow being called by any similar cast opcode.
44899 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
44900 SDValue Trunc = N->getOperand(0);
44901 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
44902 return SDValue();
44903
44904 SDValue ExtElt = Trunc.getOperand(0);
44905 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
44906 !isNullConstant(ExtElt.getOperand(1)))
44907 return SDValue();
44908
44909 EVT TruncVT = Trunc.getValueType();
44910 EVT SrcVT = ExtElt.getValueType();
44911 unsigned DestWidth = TruncVT.getSizeInBits();
44912 unsigned SrcWidth = SrcVT.getSizeInBits();
44913 if (SrcWidth % DestWidth != 0)
44914 return SDValue();
44915
44916 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
44917 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
44918 unsigned VecWidth = SrcVecVT.getSizeInBits();
44919 unsigned NumElts = VecWidth / DestWidth;
44920 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
44921 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
44922 SDLoc DL(N);
44923 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
44924 BitcastVec, ExtElt.getOperand(1));
44925 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
44926}
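// [Illustration, not part of the source file] Example of the trunc-of-extract
// fold above (types are example choices):
//   e = extract_vector_elt v2i64 %v, 0
//   t = truncate i64 e to i32
//   f = sint_to_fp t to f64
// becomes
//   b = bitcast %v to v4i32
//   e = extract_vector_elt b, 0      // same low 32 bits on little-endian x86
//   f = sint_to_fp e to f64
// keeping the value in an XMM register instead of bouncing through a GPR.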
44927
44928static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
44929 const X86Subtarget &Subtarget) {
44930 bool IsStrict = N->isStrictFPOpcode();
44931 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
44932 EVT VT = N->getValueType(0);
44933 EVT InVT = Op0.getValueType();
44934
44935 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
44936 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
44937 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
44938 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
44939 SDLoc dl(N);
44940 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
44941 InVT.getVectorNumElements());
44942 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
44943
44944 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
44945 if (IsStrict)
44946 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
44947 {N->getOperand(0), P});
44948 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
44949 }
44950
44951 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
44952 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
44953 // the optimization here.
44954 if (DAG.SignBitIsZero(Op0)) {
44955 if (IsStrict)
44956 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
44957 {N->getOperand(0), Op0});
44958 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
44959 }
44960
44961 return SDValue();
44962}
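// [Illustration, not part of the source file] The two uint_to_fp rewrites
// above, spelled out:
//   uint_to_fp vXi16 %v  ->  sint_to_fp (zext %v to vXi32)
//     // the zero-extended value fits in 31 bits, so the signed convert is exact
//   uint_to_fp %x        ->  sint_to_fp %x, when the sign bit of %x is known zero
// both avoid the more expensive unsigned-conversion expansions used when
// UINT_TO_FP cannot be mapped directly to hardware.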
44963
44964static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
44965 TargetLowering::DAGCombinerInfo &DCI,
44966 const X86Subtarget &Subtarget) {
44967 // First try to optimize away the conversion entirely when it's
44968 // conditionally from a constant. Vectors only.
44969 bool IsStrict = N->isStrictFPOpcode();
44970 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
44971 return Res;
44972
44973 // Now move on to more general possibilities.
44974 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
44975 EVT VT = N->getValueType(0);
44976 EVT InVT = Op0.getValueType();
44977
44978 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
44979 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
44980 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
44981 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
44982 SDLoc dl(N);
44983 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
44984 InVT.getVectorNumElements());
44985 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
44986 if (IsStrict)
44987 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
44988 {N->getOperand(0), P});
44989 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
44990 }
44991
44992 // Without AVX512DQ we only support i64 to float scalar conversion. For both
44993 // vectors and scalars, see if we know that the upper bits are all the sign
44994 // bit, in which case we can truncate the input to i32 and convert from that.
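// For example (illustrative), if Op0 is an i64 value that was sign extended
// from i32, ComputeNumSignBits reports at least 33 sign bits, which satisfies
// the (BitWidth - 31) threshold below, so truncating to i32 before the
// conversion loses no information.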
44995 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
44996 unsigned BitWidth = InVT.getScalarSizeInBits();
44997 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
44998 if (NumSignBits >= (BitWidth - 31)) {
44999 EVT TruncVT = MVT::i32;
45000 if (InVT.isVector())
45001 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
45002 InVT.getVectorNumElements());
45003 SDLoc dl(N);
45004 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
45005 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
45006 if (IsStrict)
45007 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
45008 {N->getOperand(0), Trunc});
45009 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
45010 }
45011 // If we're after legalize and the type is v2i32 we need to shuffle and
45012 // use CVTSI2P.
45013 assert(InVT == MVT::v2i64 && "Unexpected VT!");
45014 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
45015 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
45016 { 0, 2, -1, -1 });
45017 if (IsStrict)
45018 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
45019 {N->getOperand(0), Shuf});
45020 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
45021 }
45022 }
45023
45024 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
45025 // a 32-bit target where SSE doesn't support i64->FP operations.
45026 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
45027 Op0.getOpcode() == ISD::LOAD) {
45028 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
45029 EVT LdVT = Ld->getValueType(0);
45030
45031 // This transformation is not supported if the result type is f16 or f128.
45032 if (VT == MVT::f16 || VT == MVT::f128)
45033 return SDValue();
45034
45035 // If we have AVX512DQ we can use packed conversion instructions unless
45036 // the VT is f80.
45037 if (Subtarget.hasDQI() && VT != MVT::f80)
45038 return SDValue();
45039
45040 if (Ld->isSimple() && !VT.isVector() &&
45041 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
45042 !Subtarget.is64Bit() && LdVT == MVT::i64) {
45043 std::pair<SDValue, SDValue> Tmp = Subtarget.getTargetLowering()->BuildFILD(
45044 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
45045 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
45046 return Tmp.first;
45047 }
45048 }
45049
45050 if (IsStrict)
45051 return SDValue();
45052
45053 if (SDValue V = combineToFPTruncExtElt(N, DAG))
45054 return V;
45055
45056 return SDValue();
45057}
45058
45059static bool needCarryOrOverflowFlag(SDValue Flags) {
45060 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
45061
45062 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
45063 UI != UE; ++UI) {
45064 SDNode *User = *UI;
45065
45066 X86::CondCode CC;
45067 switch (User->getOpcode()) {
45068 default:
45069 // Be conservative.
45070 return true;
45071 case X86ISD::SETCC:
45072 case X86ISD::SETCC_CARRY:
45073 CC = (X86::CondCode)User->getConstantOperandVal(0);
45074 break;
45075 case X86ISD::BRCOND:
45076 CC = (X86::CondCode)User->getConstantOperandVal(2);
45077 break;
45078 case X86ISD::CMOV:
45079 CC = (X86::CondCode)User->getConstantOperandVal(2);
45080 break;
45081 }
45082
45083 switch (CC) {
45084 default: break;
45085 case X86::COND_A: case X86::COND_AE:
45086 case X86::COND_B: case X86::COND_BE:
45087 case X86::COND_O: case X86::COND_NO:
45088 case X86::COND_G: case X86::COND_GE:
45089 case X86::COND_L: case X86::COND_LE:
45090 return true;
45091 }
45092 }
45093
45094 return false;
45095}
45096
45097static bool onlyZeroFlagUsed(SDValue Flags) {
45098 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
45099
45100 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
45101 UI != UE; ++UI) {
45102 SDNode *User = *UI;
45103
45104 unsigned CCOpNo;
45105 switch (User->getOpcode()) {
45106 default:
45107 // Be conservative.
45108 return false;
45109 case X86ISD::SETCC: CCOpNo = 0; break;
45110 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
45111 case X86ISD::BRCOND: CCOpNo = 2; break;
45112 case X86ISD::CMOV: CCOpNo = 2; break;
45113 }
45114
45115 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
45116 if (CC != X86::COND_E && CC != X86::COND_NE)
45117 return false;
45118 }
45119
45120 return true;
45121}
45122
45123static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
45124 // Only handle test patterns.
45125 if (!isNullConstant(N->getOperand(1)))
45126 return SDValue();
45127
45128 // If we have a CMP of a truncated binop, see if we can make a smaller binop
45129 // and use its flags directly.
45130 // TODO: Maybe we should try promoting compares that only use the zero flag
45131 // first if we can prove the upper bits with computeKnownBits?
45132 SDLoc dl(N);
45133 SDValue Op = N->getOperand(0);
45134 EVT VT = Op.getValueType();
45135
45136 // If we have a constant logical shift that's only used in a comparison
45137 // against zero, turn it into an equivalent AND. This allows turning it into
45138 // a TEST instruction later.
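// For example (assuming an i32 value X), (srl X, 8) == 0 becomes
//   cmp (and X, 0xFFFFFF00), 0
// and (shl X, 8) == 0 becomes cmp (and X, 0x00FFFFFF), 0; isel can then
// select each as a single TEST against the mask.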
45139 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
45140 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
45141 onlyZeroFlagUsed(SDValue(N, 0))) {
45142 unsigned BitWidth = VT.getSizeInBits();
45143 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
45144 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
45145 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
45146 APInt Mask = Op.getOpcode() == ISD::SRL
45147 ? APInt::getHighBitsSet(BitWidth, MaskBits)
45148 : APInt::getLowBitsSet(BitWidth, MaskBits);
45149 if (Mask.isSignedIntN(32)) {
45150 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
45151 DAG.getConstant(Mask, dl, VT));
45152 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
45153 DAG.getConstant(0, dl, VT));
45154 }
45155 }
45156 }
45157
45158 // Look for a truncate with a single use.
45159 if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse())
45160 return SDValue();
45161
45162 Op = Op.getOperand(0);
45163
45164 // Arithmetic op can only have one use.
45165 if (!Op.hasOneUse())
45166 return SDValue();
45167
45168 unsigned NewOpc;
45169 switch (Op.getOpcode()) {
45170 default: return SDValue();
45171 case ISD::AND:
45172 // Skip AND with a constant. We have special handling for AND with immediate
45173 // during isel to generate test instructions.
45174 if (isa<ConstantSDNode>(Op.getOperand(1)))
45175 return SDValue();
45176 NewOpc = X86ISD::AND;
45177 break;
45178 case ISD::OR: NewOpc = X86ISD::OR; break;
45179 case ISD::XOR: NewOpc = X86ISD::XOR; break;
45180 case ISD::ADD:
45181 // If the carry or overflow flag is used, we can't truncate.
45182 if (needCarryOrOverflowFlag(SDValue(N, 0)))
45183 return SDValue();
45184 NewOpc = X86ISD::ADD;
45185 break;
45186 case ISD::SUB:
45187 // If the carry or overflow flag is used, we can't truncate.
45188 if (needCarryOrOverflowFlag(SDValue(N, 0)))
45189 return SDValue();
45190 NewOpc = X86ISD::SUB;
45191 break;
45192 }
45193
45194 // We found an op we can narrow. Truncate its inputs.
45195 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
45196 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
45197
45198 // Use an X86 specific opcode to avoid DAG combine messing with it.
45199 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
45200 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
45201
45202 // For AND, keep a CMP so that we can match the test pattern.
45203 if (NewOpc == X86ISD::AND)
45204 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
45205 DAG.getConstant(0, dl, VT));
45206
45207 // Return the flags.
45208 return Op.getValue(1);
45209}
45210
45211static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
45212 TargetLowering::DAGCombinerInfo &DCI) {
45213 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
45214        "Expected X86ISD::ADD or X86ISD::SUB");
45215
45216 SDLoc DL(N);
45217 SDValue LHS = N->getOperand(0);
45218 SDValue RHS = N->getOperand(1);
45219 MVT VT = LHS.getSimpleValueType();
45220 unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
45221
45222 // If we don't use the flag result, simplify back to a generic ADD/SUB.
45223 if (!N->hasAnyUseOfValue(1)) {
45224 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
45225 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
45226 }
45227
45228 // Fold any similar generic ADD/SUB opcodes to reuse this node.
45229 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
45230 SDValue Ops[] = {N0, N1};
45231 SDVTList VTs = DAG.getVTList(N->getValueType(0));
45232 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
45233 SDValue Op(N, 0);
45234 if (Negate)
45235 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
45236 DCI.CombineTo(GenericAddSub, Op);
45237 }
45238 };
45239 MatchGeneric(LHS, RHS, false);
45240 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
45241
45242 return SDValue();
45243}
45244
45245static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
45246 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
45247 MVT VT = N->getSimpleValueType(0);
45248 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
45249 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
45250 N->getOperand(0), N->getOperand(1),
45251 Flags);
45252 }
45253
45254 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
45255 // iff the flag result is dead.
45256 SDValue Op0 = N->getOperand(0);
45257 SDValue Op1 = N->getOperand(1);
45258 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
45259 !N->hasAnyUseOfValue(1))
45260 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
45261 Op0.getOperand(1), N->getOperand(2));
45262
45263 return SDValue();
45264}
45265
45266// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
45267static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
45268 TargetLowering::DAGCombinerInfo &DCI) {
45269 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
45270 // the result is either zero or one (depending on the input carry bit).
45271 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
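// For example (illustrative): when both inputs are zero, adc 0, 0, CF just
// materializes CF, so it is rewritten as (setcc_carry COND_B, CF) & 1 with a
// known-zero carry-out.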
45272 if (X86::isZeroNode(N->getOperand(0)) &&
45273 X86::isZeroNode(N->getOperand(1)) &&
45274 // We don't have a good way to replace an EFLAGS use, so only do this when
45275 // dead right now.
45276 SDValue(N, 1).use_empty()) {
45277 SDLoc DL(N);
45278 EVT VT = N->getValueType(0);
45279 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
45280 SDValue Res1 =
45281 DAG.getNode(ISD::AND, DL, VT,
45282 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
45283 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
45284 N->getOperand(2)),
45285 DAG.getConstant(1, DL, VT));
45286 return DCI.CombineTo(N, Res1, CarryOut);
45287 }
45288
45289 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
45290 MVT VT = N->getSimpleValueType(0);
45291 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
45292 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
45293 N->getOperand(0), N->getOperand(1),
45294 Flags);
45295 }
45296
45297 return SDValue();
45298}
45299
45300/// If this is an add or subtract where one operand is produced by a cmp+setcc,
45301/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
45302/// with CMP+{ADC, SBB}.
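/// For example (an illustrative C-level view with unsigned 32-bit values):
///   r = x + (a < b);  // cmp a, b ; adc x, 0
///   r = x - (a < b);  // cmp a, b ; sbb x, 0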
45303static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
45304 bool IsSub = N->getOpcode() == ISD::SUB;
45305 SDValue X = N->getOperand(0);
45306 SDValue Y = N->getOperand(1);
45307
45308 // If this is an add, canonicalize a zext operand to the RHS.
45309 // TODO: Incomplete? What if both sides are zexts?
45310 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
45311 Y.getOpcode() != ISD::ZERO_EXTEND)
45312 std::swap(X, Y);
45313
45314 // Look through a one-use zext.
45315 bool PeekedThroughZext = false;
45316 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
45317 Y = Y.getOperand(0);
45318 PeekedThroughZext = true;
45319 }
45320
45321 // If this is an add, canonicalize a setcc operand to the RHS.
45322 // TODO: Incomplete? What if both sides are setcc?
45323 // TODO: Should we allow peeking through a zext of the other operand?
45324 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
45325 Y.getOpcode() != X86ISD::SETCC)
45326 std::swap(X, Y);
45327
45328 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
45329 return SDValue();
45330
45331 SDLoc DL(N);
45332 EVT VT = N->getValueType(0);
45333 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
45334
45335 // If X is -1 or 0, then we have an opportunity to avoid constants required in
45336 // the general case below.
45337 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
45338 if (ConstantX) {
45339 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
45340 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
45341 // This is a complicated way to get -1 or 0 from the carry flag:
45342 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
45343 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
45344 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
45345 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
45346 Y.getOperand(1));
45347 }
45348
45349 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
45350 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
45351 SDValue EFLAGS = Y->getOperand(1);
45352 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
45353 EFLAGS.getValueType().isInteger() &&
45354 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
45355 // Swap the operands of a SUB, and we have the same pattern as above.
45356 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
45357 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
45358 SDValue NewSub = DAG.getNode(
45359 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
45360 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
45361 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
45362 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
45363 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
45364 NewEFLAGS);
45365 }
45366 }
45367 }
45368
45369 if (CC == X86::COND_B) {
45370 // X + SETB Z --> adc X, 0
45371 // X - SETB Z --> sbb X, 0
45372 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
45373 DAG.getVTList(VT, MVT::i32), X,
45374 DAG.getConstant(0, DL, VT), Y.getOperand(1));
45375 }
45376
45377 if (CC == X86::COND_A) {
45378 SDValue EFLAGS = Y->getOperand(1);
45379 // Try to convert COND_A into COND_B in an attempt to facilitate
45380 // materializing "setb reg".
45381 //
45382 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
45383 // cannot take an immediate as its first operand.
45384 //
45385 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
45386 EFLAGS.getValueType().isInteger() &&
45387 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
45388 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
45389 EFLAGS.getNode()->getVTList(),
45390 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
45391 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
45392 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
45393 DAG.getVTList(VT, MVT::i32), X,
45394 DAG.getConstant(0, DL, VT), NewEFLAGS);
45395 }
45396 }
45397
45398 if (CC != X86::COND_E && CC != X86::COND_NE)
45399 return SDValue();
45400
45401 SDValue Cmp = Y.getOperand(1);
45402 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
45403 !X86::isZeroNode(Cmp.getOperand(1)) ||
45404 !Cmp.getOperand(0).getValueType().isInteger())
45405 return SDValue();
45406
45407 SDValue Z = Cmp.getOperand(0);
45408 EVT ZVT = Z.getValueType();
45409
45410 // If X is -1 or 0, then we have an opportunity to avoid constants required in
45411 // the general case below.
45412 if (ConstantX) {
45413 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
45414 // fake operands:
45415 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
45416 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
45417 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
45418 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
45419 SDValue Zero = DAG.getConstant(0, DL, ZVT);
45420 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
45421 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
45422 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
45423 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
45424 SDValue(Neg.getNode(), 1));
45425 }
45426
45427 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
45428 // with fake operands:
45429 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
45430 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
45431 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
45432 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
45433 SDValue One = DAG.getConstant(1, DL, ZVT);
45434 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
45435 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
45436 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
45437 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
45438 Cmp1.getValue(1));
45439 }
45440 }
45441
45442 // (cmp Z, 1) sets the carry flag if Z is 0.
45443 SDValue One = DAG.getConstant(1, DL, ZVT);
45444 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
45445 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
45446
45447 // Add the flags type for ADC/SBB nodes.
45448 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
45449
45450 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
45451 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
45452 if (CC == X86::COND_NE)
45453 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
45454 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
45455
45456 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
45457 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
45458 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
45459 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
45460}
45461
45462static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
45463 const X86Subtarget &Subtarget) {
45464 if (!Subtarget.hasSSE2())
45465 return SDValue();
45466
45467 EVT VT = N->getValueType(0);
45468
45469 // If the vector size is less than 128, or greater than the supported RegSize,
45470 // do not use PMADD.
45471 if (!VT.isVector() || VT.getVectorNumElements() < 8)
45472 return SDValue();
45473
45474 SDValue Op0 = N->getOperand(0);
45475 SDValue Op1 = N->getOperand(1);
45476
45477 auto UsePMADDWD = [&](SDValue Op) {
45478 ShrinkMode Mode;
45479 return Op.getOpcode() == ISD::MUL &&
45480 canReduceVMulWidth(Op.getNode(), DAG, Mode) &&
45481 Mode != ShrinkMode::MULU16 &&
45482 (!Subtarget.hasSSE41() ||
45483 (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
45484 Op->isOnlyUserOf(Op.getOperand(1).getNode())));
45485 };
45486
45487 SDValue MulOp, OtherOp;
45488 if (UsePMADDWD(Op0)) {
45489 MulOp = Op0;
45490 OtherOp = Op1;
45491 } else if (UsePMADDWD(Op1)) {
45492 MulOp = Op1;
45493 OtherOp = Op0;
45494 } else
45495 return SDValue();
45496
45497 SDLoc DL(N);
45498 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
45499 VT.getVectorNumElements());
45500 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
45501 VT.getVectorNumElements() / 2);
45502
45503 // Shrink the operands of mul.
45504 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
45505 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
45506
45507 // Madd vector size is half of the original vector size
45508 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45509 ArrayRef<SDValue> Ops) {
45510 MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
45511 return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
45512 };
45513 SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
45514 PMADDWDBuilder);
45515 // Fill the rest of the output with 0
45516 SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType());
45517 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
45518
45519 // Preserve the reduction flag on the ADD. We may need to revisit for the
45520 // other operand.
45521 SDNodeFlags Flags;
45522 Flags.setVectorReduction(true);
45523 return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags);
45524}
45525
45526static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
45527 const X86Subtarget &Subtarget) {
45528 if (!Subtarget.hasSSE2())
45529 return SDValue();
45530
45531 SDLoc DL(N);
45532 EVT VT = N->getValueType(0);
45533
45534 // TODO: There's nothing special about i32, any integer type above i16 should
45535 // work just as well.
45536 if (!VT.isVector() || !VT.isSimple() ||
45537 !(VT.getVectorElementType() == MVT::i32))
45538 return SDValue();
45539
45540 unsigned RegSize = 128;
45541 if (Subtarget.useBWIRegs())
45542 RegSize = 512;
45543 else if (Subtarget.hasAVX())
45544 RegSize = 256;
45545
45546 // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
45547 // TODO: We should be able to handle larger vectors by splitting them before
45548 // feeding them into several SADs, and then reducing over those.
45549 if (VT.getSizeInBits() / 4 > RegSize)
45550 return SDValue();
45551
45552 // We know N is a reduction add. To match SAD, we need one of the operands to
45553 // be an ABS.
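// For example (illustrative), a sum-of-absolute-differences loop body:
//   acc = add acc, (abs (sub (zext a), (zext b)))
// where a and b are i8 vectors; the abs-diff part is what maps onto PSADBW
// below.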
45554 SDValue AbsOp = N->getOperand(0);
45555 SDValue OtherOp = N->getOperand(1);
45556 if (AbsOp.getOpcode() != ISD::ABS)
45557 std::swap(AbsOp, OtherOp);
45558 if (AbsOp.getOpcode() != ISD::ABS)
45559 return SDValue();
45560
45561 // Check whether we have an abs-diff pattern feeding into the add.
45562 SDValue SadOp0, SadOp1;
45563 if (!detectZextAbsDiff(AbsOp, SadOp0, SadOp1))
45564 return SDValue();
45565
45566 // SAD pattern detected. Now build a SAD instruction and an addition for
45567 // reduction. Note that the number of elements of the result of SAD is less
45568 // than the number of elements of its input. Therefore, we can only update
45569 // part of the elements in the reduction vector.
45570 SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget);
45571
45572 // The output of PSADBW is a vector of i64.
45573 // We need to turn the vector of i64 into a vector of i32.
45574 // If the reduction vector is at least as wide as the psadbw result, just
45575 // bitcast. If it's narrower which can only occur for v2i32, bits 127:16 of
45576 // the PSADBW will be zero. If we promote/narrow vectors, truncate the v2i64
45577 // result to v2i32 which will be removed by type legalization. If we widen
45578 // narrow vectors then we bitcast to v4i32 and extract v2i32.
45579 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
45580 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
45581
45582 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
45583 // Fill the upper elements with zero to match the add width.
45584 assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs");
45585 unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits();
45586 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, DL, ResVT));
45587 Ops[0] = Sad;
45588 Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
45589 } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) {
45590 Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad,
45591 DAG.getIntPtrConstant(0, DL));
45592 }
45593
45594 // Preserve the reduction flag on the ADD. We may need to revisit for the
45595 // other operand.
45596 SDNodeFlags Flags;
45597 Flags.setVectorReduction(true);
45598 return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags);
45599}
45600
45601static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
45602 const SDLoc &DL, EVT VT,
45603 const X86Subtarget &Subtarget) {
45604 // Example of pattern we try to detect:
45605 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
45606 //(add (build_vector (extract_elt t, 0),
45607 // (extract_elt t, 2),
45608 // (extract_elt t, 4),
45609 // (extract_elt t, 6)),
45610 // (build_vector (extract_elt t, 1),
45611 // (extract_elt t, 3),
45612 // (extract_elt t, 5),
45613 // (extract_elt t, 7)))
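// In scalar terms, each 32-bit lane of the result is
//   x0[2*i] * x1[2*i] + x0[2*i+1] * x1[2*i+1]
// with the i16 products sign extended to i32, which is exactly what a single
// PMADDWD of x0 and x1 computes.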
45614
45615 if (!Subtarget.hasSSE2())
45616 return SDValue();
45617
45618 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
45619 Op1.getOpcode() != ISD::BUILD_VECTOR)
45620 return SDValue();
45621
45622 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
45623 VT.getVectorNumElements() < 4 ||
45624 !isPowerOf2_32(VT.getVectorNumElements()))
45625 return SDValue();
45626
45627 // Check if one of Op0,Op1 is of the form:
45628 // (build_vector (extract_elt Mul, 0),
45629 // (extract_elt Mul, 2),
45630 // (extract_elt Mul, 4),
45631 // ...
45632 // the other is of the form:
45633 // (build_vector (extract_elt Mul, 1),
45634 // (extract_elt Mul, 3),
45635 // (extract_elt Mul, 5),
45636 // ...
45637 // and identify Mul.
45638 SDValue Mul;
45639 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
45640 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
45641 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
45642 // TODO: Be more tolerant to undefs.
45643 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
45644 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
45645 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
45646 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
45647 return SDValue();
45648 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
45649 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
45650 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
45651 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
45652 if (!Const0L || !Const1L || !Const0H || !Const1H)
45653 return SDValue();
45654 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
45655 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
45656 // Commutativity of mul allows factors of a product to reorder.
45657 if (Idx0L > Idx1L)
45658 std::swap(Idx0L, Idx1L);
45659 if (Idx0H > Idx1H)
45660 std::swap(Idx0H, Idx1H);
45661 // Commutativity of add allows pairs of factors to reorder.
45662 if (Idx0L > Idx0H) {
45663 std::swap(Idx0L, Idx0H);
45664 std::swap(Idx1L, Idx1H);
45665 }
45666 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
45667 Idx1H != 2 * i + 3)
45668 return SDValue();
45669 if (!Mul) {
45670 // First time an extract_elt's source vector is visited. It must be a MUL
45671 // with twice the number of vector elements of the BUILD_VECTOR.
45672 // Both extracts must be from the same MUL.
45673 Mul = Op0L->getOperand(0);
45674 if (Mul->getOpcode() != ISD::MUL ||
45675 Mul.getValueType().getVectorNumElements() != 2 * e)
45676 return SDValue();
45677 }
45678 // Check that the extract is from the same MUL previously seen.
45679 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
45680 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
45681 return SDValue();
45682 }
45683
45684 // Check if the Mul source can be safely shrunk.
45685 ShrinkMode Mode;
45686 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
45687 Mode == ShrinkMode::MULU16)
45688 return SDValue();
45689
45690 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45691 ArrayRef<SDValue> Ops) {
45692 // Shrink by adding truncate nodes and let DAGCombine fold with the
45693 // sources.
45694 EVT InVT = Ops[0].getValueType();
45695 assert(InVT.getScalarType() == MVT::i32 &&
45696        "Unexpected scalar element type");
45697 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
45698 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
45699 InVT.getVectorNumElements() / 2);
45700 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
45701 InVT.getVectorNumElements());
45702 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
45703 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
45704 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
45705 };
45706 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
45707 { Mul.getOperand(0), Mul.getOperand(1) },
45708 PMADDBuilder);
45709}
45710
45711// Attempt to turn this pattern into PMADDWD.
45712 // (add (mul (sext (build_vector)), (sext (build_vector))),
45713 //      (mul (sext (build_vector)), (sext (build_vector))))
45714static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
45715 const SDLoc &DL, EVT VT,
45716 const X86Subtarget &Subtarget) {
45717 if (!Subtarget.hasSSE2())
45718 return SDValue();
45719
45720 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
45721 return SDValue();
45722
45723 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
45724 VT.getVectorNumElements() < 4 ||
45725 !isPowerOf2_32(VT.getVectorNumElements()))
45726 return SDValue();
45727
45728 SDValue N00 = N0.getOperand(0);
45729 SDValue N01 = N0.getOperand(1);
45730 SDValue N10 = N1.getOperand(0);
45731 SDValue N11 = N1.getOperand(1);
45732
45733 // All inputs need to be sign extends.
45734 // TODO: Support ZERO_EXTEND from known positive?
45735 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
45736 N01.getOpcode() != ISD::SIGN_EXTEND ||
45737 N10.getOpcode() != ISD::SIGN_EXTEND ||
45738 N11.getOpcode() != ISD::SIGN_EXTEND)
45739 return SDValue();
45740
45741 // Peek through the extends.
45742 N00 = N00.getOperand(0);
45743 N01 = N01.getOperand(0);
45744 N10 = N10.getOperand(0);
45745 N11 = N11.getOperand(0);
45746
45747 // Must be extending from vXi16.
45748 EVT InVT = N00.getValueType();
45749 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
45750 N10.getValueType() != InVT || N11.getValueType() != InVT)
45751 return SDValue();
45752
45753 // All inputs should be build_vectors.
45754 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
45755 N01.getOpcode() != ISD::BUILD_VECTOR ||
45756 N10.getOpcode() != ISD::BUILD_VECTOR ||
45757 N11.getOpcode() != ISD::BUILD_VECTOR)
45758 return SDValue();
45759
45760 // For each element, we need to ensure we have an odd element from one vector
45761 // multiplied by the odd element of another vector and the even element from
45762 // one of the same vectors being multiplied by the even element from the
45763 // other vector. So we need to make sure for each element i, this operator
45764 // is being performed:
45765 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
45766 SDValue In0, In1;
45767 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
45768 SDValue N00Elt = N00.getOperand(i);
45769 SDValue N01Elt = N01.getOperand(i);
45770 SDValue N10Elt = N10.getOperand(i);
45771 SDValue N11Elt = N11.getOperand(i);
45772 // TODO: Be more tolerant to undefs.
45773 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
45774 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
45775 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
45776 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
45777 return SDValue();
45778 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
45779 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
45780 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
45781 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
45782 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
45783 return SDValue();
45784 unsigned IdxN00 = ConstN00Elt->getZExtValue();
45785 unsigned IdxN01 = ConstN01Elt->getZExtValue();
45786 unsigned IdxN10 = ConstN10Elt->getZExtValue();
45787 unsigned IdxN11 = ConstN11Elt->getZExtValue();
45788 // Add is commutative so indices can be reordered.
45789 if (IdxN00 > IdxN10) {
45790 std::swap(IdxN00, IdxN10);
45791 std::swap(IdxN01, IdxN11);
45792 }
45793 // N0 indices must be the even element. N1 indices must be the next odd element.
45794 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
45795 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
45796 return SDValue();
45797 SDValue N00In = N00Elt.getOperand(0);
45798 SDValue N01In = N01Elt.getOperand(0);
45799 SDValue N10In = N10Elt.getOperand(0);
45800 SDValue N11In = N11Elt.getOperand(0);
45801 // First time we find an input, capture it.
45802 if (!In0) {
45803 In0 = N00In;
45804 In1 = N01In;
45805 }
45806 // Mul is commutative so the input vectors can be in any order.
45807 // Canonicalize to make the compares easier.
45808 if (In0 != N00In)
45809 std::swap(N00In, N01In);
45810 if (In0 != N10In)
45811 std::swap(N10In, N11In);
45812 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
45813 return SDValue();
45814 }
45815
45816 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45817 ArrayRef<SDValue> Ops) {
45818 // Shrink by adding truncate nodes and let DAGCombine fold with the
45819 // sources.
45820 EVT OpVT = Ops[0].getValueType();
45821 assert(OpVT.getScalarType() == MVT::i16 &&
45822        "Unexpected scalar element type");
45823 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
45824 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
45825 OpVT.getVectorNumElements() / 2);
45826 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
45827 };
45828 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
45829 PMADDBuilder);
45830}
45831
45832static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
45833 TargetLowering::DAGCombinerInfo &DCI,
45834 const X86Subtarget &Subtarget) {
45835 const SDNodeFlags Flags = N->getFlags();
45836 if (Flags.hasVectorReduction()) {
45837 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
45838 return Sad;
45839 if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
45840 return MAdd;
45841 }
45842 EVT VT = N->getValueType(0);
45843 SDValue Op0 = N->getOperand(0);
45844 SDValue Op1 = N->getOperand(1);
45845
45846 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
45847 return MAdd;
45848 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
45849 return MAdd;
45850
45851 // Try to synthesize horizontal adds from adds of shuffles.
45852 if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
45853 VT == MVT::v8i32) &&
45854 Subtarget.hasSSSE3() &&
45855 isHorizontalBinOp(Op0, Op1, DAG, Subtarget, true)) {
45856 auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45857 ArrayRef<SDValue> Ops) {
45858 return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
45859 };
45860 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
45861 HADDBuilder);
45862 }
45863
45864 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
45865 // (sub Y, (sext (vXi1 X))).
45866 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
45867 // generic DAG combine without a legal type check, but adding this there
45868 // caused regressions.
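// This works because zext(i1) yields 0 or +1 while sext(i1) yields 0 or -1,
// so adding the former is the same as subtracting the latter. For example
// (illustrative), with a legal v16i1 mask M:
//   add (zext M to v16i32), Y  -->  sub Y, (sext M to v16i32)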
45869 if (VT.isVector()) {
45870 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45871 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
45872 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
45873 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
45874 SDLoc DL(N);
45875 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
45876 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
45877 }
45878
45879 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
45880 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
45881 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
45882 SDLoc DL(N);
45883 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
45884 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
45885 }
45886 }
45887
45888 return combineAddOrSubToADCOrSBB(N, DAG);
45889}
45890
45891static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
45892 const X86Subtarget &Subtarget) {
45893 SDValue Op0 = N->getOperand(0);
45894 SDValue Op1 = N->getOperand(1);
45895 EVT VT = N->getValueType(0);
45896
45897 if (!VT.isVector())
45898 return SDValue();
45899
45900 // PSUBUS is supported, starting from SSE2, but truncation for v8i32
45901 // is only worth it with SSSE3 (PSHUFB).
45902 EVT EltVT = VT.getVectorElementType();
45903 if (!(Subtarget.hasSSE2() && (EltVT == MVT::i8 || EltVT == MVT::i16)) &&
45904 !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
45905 !(Subtarget.useBWIRegs() && (VT == MVT::v16i32)))
45906 return SDValue();
45907
45908 SDValue SubusLHS, SubusRHS;
45909 // Try to find umax(a,b) - b or a - umin(a,b) patterns;
45910 // they may be converted to subus(a,b).
45911 // TODO: Need to add IR canonicalization for this code.
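// These identities hold for unsigned values:
//   umax(a,b) - b == usubsat(a,b)   (0 when a <= b, a - b otherwise)
//   a - umin(a,b) == usubsat(a,b)   (0 when a <= b, a - b otherwise)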
45912 if (Op0.getOpcode() == ISD::UMAX) {
45913 SubusRHS = Op1;
45914 SDValue MaxLHS = Op0.getOperand(0);
45915 SDValue MaxRHS = Op0.getOperand(1);
45916 if (MaxLHS == Op1)
45917 SubusLHS = MaxRHS;
45918 else if (MaxRHS == Op1)
45919 SubusLHS = MaxLHS;
45920 else
45921 return SDValue();
45922 } else if (Op1.getOpcode() == ISD::UMIN) {
45923 SubusLHS = Op0;
45924 SDValue MinLHS = Op1.getOperand(0);
45925 SDValue MinRHS = Op1.getOperand(1);
45926 if (MinLHS == Op0)
45927 SubusRHS = MinRHS;
45928 else if (MinRHS == Op0)
45929 SubusRHS = MinLHS;
45930 else
45931 return SDValue();
45932 } else
45933 return SDValue();
45934
45935 // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
45936 // special preprocessing in some cases.
45937 if (EltVT == MVT::i8 || EltVT == MVT::i16)
45938 return DAG.getNode(ISD::USUBSAT, SDLoc(N), VT, SubusLHS, SubusRHS);
45939
45940 assert((VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64) &&
45941        "Unexpected VT!");
45942
45943 // The special preprocessing case can only be applied
45944 // if the value was zero extended from 16 bit,
45945 // so we require the first 16 bits to be zeros for 32 bit
45946 // values, or the first 48 bits for 64 bit values.
45947 KnownBits Known = DAG.computeKnownBits(SubusLHS);
45948 unsigned NumZeros = Known.countMinLeadingZeros();
45949 if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
45950 return SDValue();
45951
45952 EVT ExtType = SubusLHS.getValueType();
45953 EVT ShrinkedType;
45954 if (VT == MVT::v8i32 || VT == MVT::v8i64)
45955 ShrinkedType = MVT::v8i16;
45956 else
45957 ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
45958
45959 // If SubusLHS is zero extended, truncate SubusRHS to its
45960 // size: SubusRHS = umin(0xFFF.., SubusRHS).
45961 SDValue SaturationConst =
45962 DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
45963 ShrinkedType.getScalarSizeInBits()),
45964 SDLoc(SubusLHS), ExtType);
45965 SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
45966 SaturationConst);
45967 SDValue NewSubusLHS =
45968 DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
45969 SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
45970 SDValue Psubus = DAG.getNode(ISD::USUBSAT, SDLoc(N), ShrinkedType,
45971 NewSubusLHS, NewSubusRHS);
45972
45973 // Zero extend the result; it may be used somewhere as 32 bit. If not,
45974 // the zext and the following trunc will be shrunk away.
45975 return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
45976}
45977
45978static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
45979 TargetLowering::DAGCombinerInfo &DCI,
45980 const X86Subtarget &Subtarget) {
45981 SDValue Op0 = N->getOperand(0);
45982 SDValue Op1 = N->getOperand(1);
45983
45984 // X86 can't encode an immediate LHS of a sub. See if we can push the
45985 // negation into a preceding instruction.
45986 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
45987 // If the RHS of the sub is an XOR with one use and a constant, invert the
45988 // immediate. Then add one to the LHS of the sub so we can turn
45989 // X-Y -> X+~Y+1, saving one register.
45990 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
45991 isa<ConstantSDNode>(Op1.getOperand(1))) {
45992 const APInt &XorC = Op1.getConstantOperandAPInt(1);
45993 EVT VT = Op0.getValueType();
45994 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
45995 Op1.getOperand(0),
45996 DAG.getConstant(~XorC, SDLoc(Op1), VT));
45997 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
45998 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
45999 }
46000 }
46001
46002 // Try to synthesize horizontal subs from subs of shuffles.
46003 EVT VT = N->getValueType(0);
46004 if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
46005 VT == MVT::v8i32) &&
46006 Subtarget.hasSSSE3() &&
46007 isHorizontalBinOp(Op0, Op1, DAG, Subtarget, false)) {
46008 auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46009 ArrayRef<SDValue> Ops) {
46010 return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
46011 };
46012 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
46013 HSUBBuilder);
46014 }
46015
46016 // Try to create PSUBUS if SUB's argument is max/min
46017 if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
46018 return V;
46019
46020 return combineAddOrSubToADCOrSBB(N, DAG);
46021}
46022
46023static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
46024 const X86Subtarget &Subtarget) {
46025 MVT VT = N->getSimpleValueType(0);
46026 SDLoc DL(N);
46027
46028 if (N->getOperand(0) == N->getOperand(1)) {
46029 if (N->getOpcode() == X86ISD::PCMPEQ)
46030 return DAG.getConstant(-1, DL, VT);
46031 if (N->getOpcode() == X86ISD::PCMPGT)
46032 return DAG.getConstant(0, DL, VT);
46033 }
46034
46035 return SDValue();
46036}
46037
46038/// Helper that combines an array of subvector ops as if they were the operands
46040 /// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
46040/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
46041static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
46042 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
46043 TargetLowering::DAGCombinerInfo &DCI,
46044 const X86Subtarget &Subtarget) {
46045 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
46046 unsigned EltSizeInBits = VT.getScalarSizeInBits();
46047
46048 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
46049 return DAG.getUNDEF(VT);
46050
46051 if (llvm::all_of(Ops, [](SDValue Op) {
46052 return ISD::isBuildVectorAllZeros(Op.getNode());
46053 }))
46054 return getZeroVector(VT, Subtarget, DAG, DL);
46055
46056 SDValue Op0 = Ops[0];
46057 bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
46058
46059 // Fold subvector loads into one.
46060 // If needed, look through bitcasts to get to the load.
46061 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
46062 bool Fast;
46063 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
46064 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
46065 *FirstLd->getMemOperand(), &Fast) &&
46066 Fast) {
46067 if (SDValue Ld =
46068 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
46069 return Ld;
46070 }
46071 }
46072
46073 // Repeated subvectors.
46074 if (IsSplat) {
46075 // If this broadcast/subv_broadcast is inserted into both halves, use a
46076 // larger broadcast/subv_broadcast.
46077 if (Op0.getOpcode() == X86ISD::VBROADCAST ||
46078 Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
46079 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
46080
46081 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
46082 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
46083 (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
46084 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
46085 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
46086 Op0.getOperand(0),
46087 DAG.getIntPtrConstant(0, DL)));
46088
46089 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
46090 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
46091 (Subtarget.hasAVX2() ||
46092 (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
46093 Op0.getOperand(0).getValueType() == VT.getScalarType())
46094 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
46095 }
46096
46097 // Repeated opcode.
46098 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
46099 // but it currently struggles with different vector widths.
46100 if (llvm::all_of(Ops, [Op0](SDValue Op) {
46101 return Op.getOpcode() == Op0.getOpcode();
46102 })) {
46103 unsigned NumOps = Ops.size();
46104 switch (Op0.getOpcode()) {
46105 case X86ISD::PSHUFHW:
46106 case X86ISD::PSHUFLW:
46107 case X86ISD::PSHUFD:
46108 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
46109 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
46110 SmallVector<SDValue, 2> Src;
46111 for (unsigned i = 0; i != NumOps; ++i)
46112 Src.push_back(Ops[i].getOperand(0));
46113 return DAG.getNode(Op0.getOpcode(), DL, VT,
46114 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
46115 Op0.getOperand(1));
46116 }
46117 LLVM_FALLTHROUGH;
46118 case X86ISD::VPERMILPI:
46119 // TODO - add support for vXf64/vXi64 shuffles.
46120 if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
46121 Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
46122 SmallVector<SDValue, 2> Src;
46123 for (unsigned i = 0; i != NumOps; ++i)
46124 Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0)));
46125 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src);
46126 Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
46127 Op0.getOperand(1));
46128 return DAG.getBitcast(VT, Res);
46129 }
46130 break;
46131 case X86ISD::VSHLI:
46132 case X86ISD::VSRAI:
46133 case X86ISD::VSRLI:
46134 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
46135 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
46136 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
46137 llvm::all_of(Ops, [Op0](SDValue Op) {
46138 return Op0.getOperand(1) == Op.getOperand(1);
46139 })) {
46140 SmallVector<SDValue, 2> Src;
46141 for (unsigned i = 0; i != NumOps; ++i)
46142 Src.push_back(Ops[i].getOperand(0));
46143 return DAG.getNode(Op0.getOpcode(), DL, VT,
46144 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
46145 Op0.getOperand(1));
46146 }
46147 break;
46148 case X86ISD::VPERMI:
46149 case X86ISD::VROTLI:
46150 case X86ISD::VROTRI:
46151 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
46152 llvm::all_of(Ops, [Op0](SDValue Op) {
46153 return Op0.getOperand(1) == Op.getOperand(1);
46154 })) {
46155 SmallVector<SDValue, 2> Src;
46156 for (unsigned i = 0; i != NumOps; ++i)
46157 Src.push_back(Ops[i].getOperand(0));
46158 return DAG.getNode(Op0.getOpcode(), DL, VT,
46159 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
46160 Op0.getOperand(1));
46161 }
46162 break;
46163 case X86ISD::PACKSS:
46164 case X86ISD::PACKUS:
46165 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
46166 Subtarget.hasInt256()) {
46167 SmallVector<SDValue, 2> LHS, RHS;
46168 for (unsigned i = 0; i != NumOps; ++i) {
46169 LHS.push_back(Ops[i].getOperand(0));
46170 RHS.push_back(Ops[i].getOperand(1));
46171 }
46172 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
46173 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
46174 NumOps * SrcVT.getVectorNumElements());
46175 return DAG.getNode(Op0.getOpcode(), DL, VT,
46176 DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
46177 DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
46178 }
46179 break;
46180 }
46181 }
46182
46183 return SDValue();
46184}
46185
46186static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
46187 TargetLowering::DAGCombinerInfo &DCI,
46188 const X86Subtarget &Subtarget) {
46189 EVT VT = N->getValueType(0);
46190 EVT SrcVT = N->getOperand(0).getValueType();
46191 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46192
46193 // Don't do anything for i1 vectors.
46194 if (VT.getVectorElementType() == MVT::i1)
46195 return SDValue();
46196
46197 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
46198 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
46199 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
46200 DCI, Subtarget))
46201 return R;
46202 }
46203
46204 return SDValue();
46205}
46206
46207static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
46208 TargetLowering::DAGCombinerInfo &DCI,
46209 const X86Subtarget &Subtarget) {
46210 if (DCI.isBeforeLegalizeOps())
46211 return SDValue();
46212
46213 MVT OpVT = N->getSimpleValueType(0);
46214
46215 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
46216
46217 SDLoc dl(N);
46218 SDValue Vec = N->getOperand(0);
46219 SDValue SubVec = N->getOperand(1);
46220
46221 uint64_t IdxVal = N->getConstantOperandVal(2);
46222 MVT SubVecVT = SubVec.getSimpleValueType();
46223
46224 if (Vec.isUndef() && SubVec.isUndef())
46225 return DAG.getUNDEF(OpVT);
46226
46227 // Inserting undefs/zeros into zeros/undefs is a zero vector.
46228 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
46229 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
46230 return getZeroVector(OpVT, Subtarget, DAG, dl);
46231
46232 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
46233 // If we're inserting into a zero vector and then into a larger zero vector,
46234 // just insert into the larger zero vector directly.
46235 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
46236 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
46237 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
46238 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
46239 getZeroVector(OpVT, Subtarget, DAG, dl),
46240 SubVec.getOperand(1),
46241 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
46242 }
46243
46244 // If we're inserting into a zero vector and our input was extracted from an
46245 // insert into a zero vector of the same type and the extraction was at
46246 // least as large as the original insertion, just insert the original
46247 // subvector into a zero vector.
46248 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
46249 isNullConstant(SubVec.getOperand(1)) &&
46250 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
46251 SDValue Ins = SubVec.getOperand(0);
46252 if (isNullConstant(Ins.getOperand(2)) &&
46253 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
46254 Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
46255 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
46256 getZeroVector(OpVT, Subtarget, DAG, dl),
46257 Ins.getOperand(1), N->getOperand(2));
46258 }
46259 }
46260
46261 // Stop here if this is an i1 vector.
46262 if (IsI1Vector)
46263 return SDValue();
46264
46265 // If this is an insert of an extract, combine to a shuffle. Don't do this
46266 // if the insert or extract can be represented with a subregister operation.
46267 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
46268 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
46269 (IdxVal != 0 || !Vec.isUndef())) {
46270 int ExtIdxVal = SubVec.getConstantOperandVal(1);
46271 if (ExtIdxVal != 0) {
46272 int VecNumElts = OpVT.getVectorNumElements();
46273 int SubVecNumElts = SubVecVT.getVectorNumElements();
46274 SmallVector<int, 64> Mask(VecNumElts);
46275 // First create an identity shuffle mask.
46276 for (int i = 0; i != VecNumElts; ++i)
46277 Mask[i] = i;
46278 // Now insert the extracted portion.
46279 for (int i = 0; i != SubVecNumElts; ++i)
46280 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
46281
46282 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
46283 }
46284 }
46285
46286 // Match concat_vector style patterns.
46287 SmallVector<SDValue, 2> SubVectorOps;
46288 if (collectConcatOps(N, SubVectorOps)) {
46289 if (SDValue Fold =
46290 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
46291 return Fold;
46292
46293 // If we're inserting all zeros into the upper half, change this to
46294 // a concat with zero. We will match this to a move
46295 // with implicit upper bit zeroing during isel.
46296 // We do this here because we don't want combineConcatVectorOps to
46297 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
46298 if (SubVectorOps.size() == 2 &&
46299 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
46300 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
46301 getZeroVector(OpVT, Subtarget, DAG, dl),
46302 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
46303 }
46304
46305 // If this is a broadcast insert into an upper undef, use a larger broadcast.
46306 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
46307 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
46308
46309 // If this is a broadcast load inserted into an upper undef, use a larger
46310 // broadcast load.
46311 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
46312 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
46313 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
46314 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
46315 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
46316 SDValue BcastLd =
46317 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
46318 MemIntr->getMemoryVT(),
46319 MemIntr->getMemOperand());
46320 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
46321 return BcastLd;
46322 }
46323
46324 return SDValue();
46325}
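// An illustrative instance of the broadcast case handled above, in the same
// pattern notation used elsewhere in this file (the vector types are example
// choices, not taken from this report):
//   insert_subvector undef:v8f32, (X86ISD::VBROADCAST X:f32):v4f32, 4
//     --> (X86ISD::VBROADCAST X):v8f32
// A broadcast inserted into the upper half of an undef vector is simply
// re-emitted at the wider result type.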
46326
46327/// If we are extracting a subvector of a vector select and the select condition
46328/// is composed of concatenated vectors, try to narrow the select width. This
46329/// is a common pattern for AVX1 integer code because 256-bit selects may be
46330/// legal, but there is almost no integer math/logic available for 256-bit.
46331/// This function should only be called with legal types (otherwise, the calls
46332/// to get simple value types will assert).
46333static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
46334 SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
46335 SmallVector<SDValue, 4> CatOps;
46336 if (Sel.getOpcode() != ISD::VSELECT ||
46337 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
46338 return SDValue();
46339
46340 // Note: We assume simple value types because this should only be called with
46341 // legal operations/types.
46342 // TODO: This can be extended to handle extraction to 256-bits.
46343 MVT VT = Ext->getSimpleValueType(0);
46344 if (!VT.is128BitVector())
46345 return SDValue();
46346
46347 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
46348 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
46349 return SDValue();
46350
46351 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
46352 MVT SelVT = Sel.getSimpleValueType();
46353  assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
46354         "Unexpected vector type with legal operations");
46355
46356 unsigned SelElts = SelVT.getVectorNumElements();
46357 unsigned CastedElts = WideVT.getVectorNumElements();
46358 unsigned ExtIdx = cast<ConstantSDNode>(Ext->getOperand(1))->getZExtValue();
46359 if (SelElts % CastedElts == 0) {
46360 // The select has the same or more (narrower) elements than the extract
46361 // operand. The extraction index gets scaled by that factor.
46362 ExtIdx *= (SelElts / CastedElts);
46363 } else if (CastedElts % SelElts == 0) {
46364 // The select has fewer (wider) elements than the extract operand. Make sure
46365 // that the extraction index can be divided evenly.
46366 unsigned IndexDivisor = CastedElts / SelElts;
46367 if (ExtIdx % IndexDivisor != 0)
46368 return SDValue();
46369 ExtIdx /= IndexDivisor;
46370 } else {
46371    llvm_unreachable("Element count of simple vector types are not divisible?");
46372 }
46373
46374 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
46375 unsigned NarrowElts = SelElts / NarrowingFactor;
46376 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
46377 SDLoc DL(Ext);
46378 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
46379 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
46380 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
46381 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
46382 return DAG.getBitcast(VT, NarrowSel);
46383}
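// An illustrative instance of the narrowing above, with example types: when
// the select condition is itself a concatenation,
//   extract_subvector (vselect C:v8i32, X:v8i32, Y:v8i32), 4
//     --> vselect (extract_subvector C, 4),
//                 (extract_subvector X, 4),
//                 (extract_subvector Y, 4)        // all v4i32
// so the 256-bit select is replaced by a 128-bit select on the extracted
// halves.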
46384
46385static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
46386 TargetLowering::DAGCombinerInfo &DCI,
46387 const X86Subtarget &Subtarget) {
46388 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
46389 // eventually get combined/lowered into ANDNP) with a concatenated operand,
46390 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
46391 // We let generic combining take over from there to simplify the
46392 // insert/extract and 'not'.
46393 // This pattern emerges during AVX1 legalization. We handle it before lowering
46394 // to avoid complications like splitting constant vector loads.
46395
46396 // Capture the original wide type in the likely case that we need to bitcast
46397 // back to this type.
46398 if (!N->getValueType(0).isSimple())
46399 return SDValue();
46400
46401 MVT VT = N->getSimpleValueType(0);
46402 SDValue InVec = N->getOperand(0);
46403 SDValue InVecBC = peekThroughBitcasts(InVec);
46404 EVT InVecVT = InVec.getValueType();
46405 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46406
46407 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
46408 TLI.isTypeLegal(InVecVT) &&
46409 InVecVT.getSizeInBits() == 256 && InVecBC.getOpcode() == ISD::AND) {
46410 auto isConcatenatedNot = [] (SDValue V) {
46411 V = peekThroughBitcasts(V);
46412 if (!isBitwiseNot(V))
46413 return false;
46414 SDValue NotOp = V->getOperand(0);
46415 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
46416 };
46417 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
46418 isConcatenatedNot(InVecBC.getOperand(1))) {
46419 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
46420 SDValue Concat = split256IntArith(InVecBC, DAG);
46421 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
46422 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
46423 }
46424 }
46425
46426 if (DCI.isBeforeLegalizeOps())
46427 return SDValue();
46428
46429 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
46430 return V;
46431
46432 unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
46433
46434 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
46435 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
46436
46437 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
46438 if (VT.getScalarType() == MVT::i1)
46439 return DAG.getConstant(1, SDLoc(N), VT);
46440 return getOnesVector(VT, DAG, SDLoc(N));
46441 }
46442
46443 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
46444 return DAG.getBuildVector(
46445 VT, SDLoc(N),
46446 InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
46447
46448  // If we are extracting from an insert into a zero vector, replace with a
46449  // smaller insert into zero, provided the extraction covers at least the
46450  // originally inserted subvector. Don't do this for i1 vectors.
46451 if (VT.getVectorElementType() != MVT::i1 &&
46452 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
46453 InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
46454 ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
46455 InVec.getOperand(1).getValueSizeInBits() <= VT.getSizeInBits()) {
46456 SDLoc DL(N);
46457 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
46458 getZeroVector(VT, Subtarget, DAG, DL),
46459 InVec.getOperand(1), InVec.getOperand(2));
46460 }
46461
46462 // If we're extracting from a broadcast then we're better off just
46463 // broadcasting to the smaller type directly, assuming this is the only use.
46464  // As it's a broadcast we don't care about the extraction index.
46465 if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() &&
46466 InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits())
46467 return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));
46468
46469 if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) {
46470 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
46471 if (MemIntr->getMemoryVT().getSizeInBits() <= VT.getSizeInBits()) {
46472 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
46473 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
46474 SDValue BcastLd =
46475 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
46476 MemIntr->getMemoryVT(),
46477 MemIntr->getMemOperand());
46478 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
46479 return BcastLd;
46480 }
46481 }
46482
46483 // If we're extracting the lowest subvector and we're the only user,
46484 // we may be able to perform this with a smaller vector width.
46485 if (IdxVal == 0 && InVec.hasOneUse()) {
46486 unsigned InOpcode = InVec.getOpcode();
46487 if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
46488 // v2f64 CVTDQ2PD(v4i32).
46489 if (InOpcode == ISD::SINT_TO_FP &&
46490 InVec.getOperand(0).getValueType() == MVT::v4i32) {
46491 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
46492 }
46493 // v2f64 CVTUDQ2PD(v4i32).
46494 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
46495 InVec.getOperand(0).getValueType() == MVT::v4i32) {
46496 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
46497 }
46498 // v2f64 CVTPS2PD(v4f32).
46499 if (InOpcode == ISD::FP_EXTEND &&
46500 InVec.getOperand(0).getValueType() == MVT::v4f32) {
46501 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
46502 }
46503 }
46504 if ((InOpcode == ISD::ANY_EXTEND ||
46505 InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
46506 InOpcode == ISD::ZERO_EXTEND ||
46507 InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
46508 InOpcode == ISD::SIGN_EXTEND ||
46509 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
46510 VT.is128BitVector() &&
46511 InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
46512 unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
46513 return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0));
46514 }
46515 if (InOpcode == ISD::VSELECT &&
46516 InVec.getOperand(0).getValueType().is256BitVector() &&
46517 InVec.getOperand(1).getValueType().is256BitVector() &&
46518 InVec.getOperand(2).getValueType().is256BitVector()) {
46519 SDLoc DL(N);
46520 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
46521 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
46522 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
46523 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
46524 }
46525 }
46526
46527 return SDValue();
46528}
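// An illustrative instance of the "lowest subvector, single use" case above
// (example types only):
//   extract_subvector (v4f64 sint_to_fp X:v4i32), 0
//     --> (v2f64 X86ISD::CVTSI2P X)
// The conversion is re-issued at 128 bits instead of converting the whole
// 256-bit vector and then extracting half of it.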
46529
46530static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
46531 EVT VT = N->getValueType(0);
46532 SDValue Src = N->getOperand(0);
46533 SDLoc DL(N);
46534
46535 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
46536 // This occurs frequently in our masked scalar intrinsic code and our
46537 // floating point select lowering with AVX512.
46538 // TODO: SimplifyDemandedBits instead?
46539 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
46540 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
46541 if (C->getAPIntValue().isOneValue())
46542 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
46543 Src.getOperand(0));
46544
46545 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
46546 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46547 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
46548 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
46549 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
46550 if (C->isNullValue())
46551 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
46552 Src.getOperand(1));
46553
46554 // Reduce v2i64 to v4i32 if we don't need the upper bits.
46555 // TODO: Move to DAGCombine?
46556 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND &&
46557 Src.getValueType() == MVT::i64 && Src.hasOneUse() &&
46558 Src.getOperand(0).getScalarValueSizeInBits() <= 32)
46559 return DAG.getBitcast(
46560 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
46561 DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32)));
46562
46563 return SDValue();
46564}
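// Illustrative instances of the folds above (operand names are placeholders):
//   (v1i1 scalar_to_vector (and X, 1))
//     --> (v1i1 scalar_to_vector X)
//   (v2i64 scalar_to_vector (i64 any_extend Y)), Y at most 32 bits wide
//     --> (v2i64 bitcast (v4i32 scalar_to_vector (i32 any_ext/trunc Y)))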
46565
46566// Simplify PMULDQ and PMULUDQ operations.
46567static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
46568 TargetLowering::DAGCombinerInfo &DCI,
46569 const X86Subtarget &Subtarget) {
46570 SDValue LHS = N->getOperand(0);
46571 SDValue RHS = N->getOperand(1);
46572
46573 // Canonicalize constant to RHS.
46574 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
46575 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
46576 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
46577
46578 // Multiply by zero.
46579 // Don't return RHS as it may contain UNDEFs.
46580 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
46581 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
46582
46583  // PMULDQ/PMULUDQ only use the lower 32 bits of each vector element.
46584 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46585 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
46586 return SDValue(N, 0);
46587
46588 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
46589 // convert it to any_extend_invec, due to the LegalOperations check, do the
46590 // conversion directly to a vector shuffle manually. This exposes combine
46591 // opportunities missed by combineExtInVec not calling
46592 // combineX86ShufflesRecursively on SSE4.1 targets.
46593 // FIXME: This is basically a hack around several other issues related to
46594 // ANY_EXTEND_VECTOR_INREG.
46595 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
46596 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
46597 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
46598 LHS.getOperand(0).getValueType() == MVT::v4i32) {
46599 SDLoc dl(N);
46600 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
46601 LHS.getOperand(0), { 0, -1, 1, -1 });
46602 LHS = DAG.getBitcast(MVT::v2i64, LHS);
46603 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
46604 }
46605 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
46606 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
46607 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
46608 RHS.getOperand(0).getValueType() == MVT::v4i32) {
46609 SDLoc dl(N);
46610 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
46611 RHS.getOperand(0), { 0, -1, 1, -1 });
46612 RHS = DAG.getBitcast(MVT::v2i64, RHS);
46613 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
46614 }
46615
46616 return SDValue();
46617}
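// Illustrative instances of the simplifications above (operands are
// placeholders):
//   (X86ISD::PMULUDQ C, X)         --> (X86ISD::PMULUDQ X, C)  // constant to RHS
//   (X86ISD::PMULUDQ X, all-zeros) --> zero vector constant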
46618
46619static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
46620 TargetLowering::DAGCombinerInfo &DCI,
46621 const X86Subtarget &Subtarget) {
46622 EVT VT = N->getValueType(0);
46623 SDValue In = N->getOperand(0);
46624 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46625
46626 // Try to merge vector loads and extend_inreg to an extload.
46627 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
46628 In.hasOneUse()) {
46629 auto *Ld = cast<LoadSDNode>(In);
46630 if (Ld->isSimple()) {
46631 MVT SVT = In.getSimpleValueType().getVectorElementType();
46632 ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
46633 EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT,
46634 VT.getVectorNumElements());
46635 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
46636 SDValue Load =
46637 DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
46638 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
46639 Ld->getMemOperand()->getFlags());
46640 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
46641 return Load;
46642 }
46643 }
46644 }
46645
46646 // Attempt to combine as a shuffle.
46647 // TODO: SSE41 support
46648 if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) {
46649 SDValue Op(N, 0);
46650 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
46651 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
46652 return Res;
46653 }
46654
46655 return SDValue();
46656}
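// An illustrative instance of the extload merge above, with example types:
//   (v4i32 zero_extend_vector_inreg (v16i8 load p))
//     --> (v4i32 zextload p with memory type v4i8)
// assuming the target reports that extending load as legal.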
46657
46658static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
46659 TargetLowering::DAGCombinerInfo &DCI) {
46660 EVT VT = N->getValueType(0);
46661
46662 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
46663 return DAG.getConstant(0, SDLoc(N), VT);
46664
46665 APInt KnownUndef, KnownZero;
46666 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46667 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
46668 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
46669 KnownZero, DCI))
46670 return SDValue(N, 0);
46671
46672 return SDValue();
46673}
46674
46675// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
46676// Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produce
46677// extra instructions between the conversion due to going to scalar and back.
46678static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
46679 const X86Subtarget &Subtarget) {
46680 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
46681 return SDValue();
46682
46683 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
46684 return SDValue();
46685
46686 if (N->getValueType(0) != MVT::f32 ||
46687 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
46688 return SDValue();
46689
46690 SDLoc dl(N);
46691 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
46692 N->getOperand(0).getOperand(0));
46693 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
46694 DAG.getTargetConstant(4, dl, MVT::i32));
46695 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
46696 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
46697 DAG.getIntPtrConstant(0, dl));
46698}
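// Illustrative shape of the result produced above for a scalar f32 value X
// round-tripped through half precision:
//   (f32 fp16_to_fp (fp_to_fp16 X))
//     --> extract_vector_elt
//           (v4f32 X86ISD::CVTPH2PS
//             (v8i16 X86ISD::CVTPS2PH (v4f32 scalar_to_vector X), 4)), 0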
46699
46700static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
46701 const X86Subtarget &Subtarget) {
46702 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
46703 return SDValue();
46704
46705 EVT VT = N->getValueType(0);
46706 SDValue Src = N->getOperand(0);
46707 EVT SrcVT = Src.getValueType();
46708
46709 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
46710 return SDValue();
46711
46712 if (VT.getVectorElementType() != MVT::f32 &&
46713 VT.getVectorElementType() != MVT::f64)
46714 return SDValue();
46715
46716 unsigned NumElts = VT.getVectorNumElements();
46717 if (NumElts == 1 || !isPowerOf2_32(NumElts))
46718 return SDValue();
46719
46720 SDLoc dl(N);
46721
46722 // Convert the input to vXi16.
46723 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
46724 Src = DAG.getBitcast(IntVT, Src);
46725
46726 // Widen to at least 8 input elements.
46727 if (NumElts < 8) {
46728 unsigned NumConcats = 8 / NumElts;
46729 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
46730 : DAG.getConstant(0, dl, IntVT);
46731 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
46732 Ops[0] = Src;
46733 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
46734 }
46735
46736 // Destination is vXf32 with at least 4 elements.
46737 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
46738 std::max(4U, NumElts));
46739 SDValue Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
46740
46741 if (NumElts < 4) {
46742    assert(NumElts == 2 && "Unexpected size");
46743 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
46744 DAG.getIntPtrConstant(0, dl));
46745 }
46746
46747 // Extend to the original VT if necessary.
46748 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
46749}
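// An illustrative instance of the widening above (example types): a v4f16 to
// v4f32 extend becomes a single CVTPH2PS on an 8-element i16 vector whose
// upper half is undef:
//   (v4f32 fp_extend (v4f16 X))
//     --> (v4f32 X86ISD::CVTPH2PS
//            (v8i16 concat_vectors (v4i16 bitcast X), undef))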
46750
46751static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
46752 const X86Subtarget &Subtarget) {
46753 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
46754 return SDValue();
46755
46756 EVT VT = N->getValueType(0);
46757 SDValue Src = N->getOperand(0);
46758 EVT SrcVT = Src.getValueType();
46759
46760 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
46761 SrcVT.getVectorElementType() != MVT::f32)
46762 return SDValue();
46763
46764 unsigned NumElts = VT.getVectorNumElements();
46765 if (NumElts == 1 || !isPowerOf2_32(NumElts))
46766 return SDValue();
46767
46768 SDLoc dl(N);
46769
46770 // Widen to at least 4 input elements.
46771 if (NumElts < 4)
46772 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
46773 DAG.getConstantFP(0.0, dl, SrcVT));
46774
46775 // Destination is v8i16 with at least 8 elements.
46776 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
46777 std::max(8U, NumElts));
46778 SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
46779 DAG.getTargetConstant(4, dl, MVT::i32));
46780
46781 // Extract down to real number of elements.
46782 if (NumElts < 8) {
46783 EVT IntVT = VT.changeVectorElementTypeToInteger();
46784 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
46785 DAG.getIntPtrConstant(0, dl));
46786 }
46787
46788 return DAG.getBitcast(VT, Cvt);
46789}
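// An illustrative instance of the rounding above (example types):
//   (v4f16 fp_round (v4f32 X))
//     --> (v4f16 bitcast
//            (v4i16 extract_subvector (v8i16 X86ISD::CVTPS2PH X, 4), 0))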
46790
46791SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
46792 DAGCombinerInfo &DCI) const {
46793 SelectionDAG &DAG = DCI.DAG;
46794 switch (N->getOpcode()) {
46795 default: break;
46796 case ISD::SCALAR_TO_VECTOR:
46797 return combineScalarToVector(N, DAG);
46798 case ISD::EXTRACT_VECTOR_ELT:
46799 case X86ISD::PEXTRW:
46800 case X86ISD::PEXTRB:
46801 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
46802 case ISD::CONCAT_VECTORS:
46803 return combineConcatVectors(N, DAG, DCI, Subtarget);
46804 case ISD::INSERT_SUBVECTOR:
46805 return combineInsertSubvector(N, DAG, DCI, Subtarget);
46806 case ISD::EXTRACT_SUBVECTOR:
46807 return combineExtractSubvector(N, DAG, DCI, Subtarget);
46808 case ISD::VSELECT:
46809 case ISD::SELECT:
46810 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
46811 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
46812 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
46813 case X86ISD::CMP: return combineCMP(N, DAG);
46814 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
46815 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
46816 case X86ISD::ADD:
46817 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
46818 case X86ISD::SBB: return combineSBB(N, DAG);
46819 case X86ISD::ADC: return combineADC(N, DAG, DCI);
46820 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
46821 case ISD::SHL: return combineShiftLeft(N, DAG);
46822 case ISD::SRA: return combineShiftRightArithmetic(N, DAG);
46823 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI);
46824 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
46825 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
46826 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
46827 case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
46828 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
46829 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
46830 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
46831 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
46832 case X86ISD::VEXTRACT_STORE:
46833 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
46834 case ISD::SINT_TO_FP:
46835 case ISD::STRICT_SINT_TO_FP:
46836 return combineSIntToFP(N, DAG, DCI, Subtarget);
46837 case ISD::UINT_TO_FP:
46838 case ISD::STRICT_UINT_TO_FP:
46839 return combineUIntToFP(N, DAG, Subtarget);
46840 case ISD::FADD:
46841 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
46842 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
46843 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
46844 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG);
46845 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
46846 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
46847 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
46848 case X86ISD::FXOR:
46849 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
46850 case X86ISD::FMIN:
46851 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
46852 case ISD::FMINNUM:
46853 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
46854 case X86ISD::CVTSI2P:
46855 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
46856 case X86ISD::CVTP2SI:
46857 case X86ISD::CVTP2UI:
46858 case X86ISD::CVTTP2SI:
46859 case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI);
46860 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
46861 case X86ISD::BT: return combineBT(N, DAG, DCI);
46862 case ISD::ANY_EXTEND:
46863 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
46864 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
46865 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
46866 case ISD::ANY_EXTEND_VECTOR_INREG:
46867 case ISD::SIGN_EXTEND_VECTOR_INREG:
46868 case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI,
46869 Subtarget);
46870 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
46871 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
46872 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
46873 case X86ISD::PACKSS:
46874 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
46875 case X86ISD::VSHL:
46876 case X86ISD::VSRA:
46877 case X86ISD::VSRL:
46878 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
46879 case X86ISD::VSHLI:
46880 case X86ISD::VSRAI:
46881 case X86ISD::VSRLI:
46882 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
46883 case X86ISD::PINSRB:
46884 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
46885 case X86ISD::SHUFP: // Handle all target specific shuffles
46886 case X86ISD::INSERTPS:
46887 case X86ISD::EXTRQI:
46888 case X86ISD::INSERTQI:
46889 case X86ISD::PALIGNR:
46890 case X86ISD::VSHLDQ:
46891 case X86ISD::VSRLDQ:
46892 case X86ISD::BLENDI:
46893 case X86ISD::UNPCKH:
46894 case X86ISD::UNPCKL:
46895 case X86ISD::MOVHLPS:
46896 case X86ISD::MOVLHPS:
46897 case X86ISD::PSHUFB:
46898 case X86ISD::PSHUFD:
46899 case X86ISD::PSHUFHW:
46900 case X86ISD::PSHUFLW:
46901 case X86ISD::MOVSHDUP:
46902 case X86ISD::MOVSLDUP:
46903 case X86ISD::MOVDDUP:
46904 case X86ISD::MOVSS:
46905 case X86ISD::MOVSD:
46906 case X86ISD::VBROADCAST:
46907 case X86ISD::VPPERM:
46908 case X86ISD::VPERMI:
46909 case X86ISD::VPERMV:
46910 case X86ISD::VPERMV3:
46911 case X86ISD::VPERMIL2:
46912 case X86ISD::VPERMILPI:
46913 case X86ISD::VPERMILPV:
46914 case X86ISD::VPERM2X128:
46915 case X86ISD::SHUF128:
46916 case X86ISD::VZEXT_MOVL:
46917 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
46918 case X86ISD::FMADD_RND:
46919 case X86ISD::FMSUB:
46920 case X86ISD::STRICT_FMSUB:
46921 case X86ISD::FMSUB_RND:
46922 case X86ISD::FNMADD:
46923 case X86ISD::STRICT_FNMADD:
46924 case X86ISD::FNMADD_RND:
46925 case X86ISD::FNMSUB:
46926 case X86ISD::STRICT_FNMSUB:
46927 case X86ISD::FNMSUB_RND:
46928 case ISD::FMA:
46929 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
46930 case X86ISD::FMADDSUB_RND:
46931 case X86ISD::FMSUBADD_RND:
46932 case X86ISD::FMADDSUB:
46933 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
46934 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
46935 case X86ISD::MGATHER:
46936 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
46937 case ISD::MGATHER:
46938 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
46939 case X86ISD::PCMPEQ:
46940 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
46941 case X86ISD::PMULDQ:
46942 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
46943 case X86ISD::KSHIFTL:
46944 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
46945 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
46946 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
46947 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
46948 }
46949
46950 return SDValue();
46951}
46952
46953bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
46954 if (!isTypeLegal(VT))
46955 return false;
46956
46957 // There are no vXi8 shifts.
46958 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
46959 return false;
46960
46961 // TODO: Almost no 8-bit ops are desirable because they have no actual
46962 // size/speed advantages vs. 32-bit ops, but they do have a major
46963 // potential disadvantage by causing partial register stalls.
46964 //
46965 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
46966 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
46967 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
46968 // check for a constant operand to the multiply.
46969 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
46970 return false;
46971
46972 // i16 instruction encodings are longer and some i16 instructions are slow,
46973 // so those are not desirable.
46974 if (VT == MVT::i16) {
46975 switch (Opc) {
46976 default:
46977 break;
46978 case ISD::LOAD:
46979 case ISD::SIGN_EXTEND:
46980 case ISD::ZERO_EXTEND:
46981 case ISD::ANY_EXTEND:
46982 case ISD::SHL:
46983 case ISD::SRA:
46984 case ISD::SRL:
46985 case ISD::SUB:
46986 case ISD::ADD:
46987 case ISD::MUL:
46988 case ISD::AND:
46989 case ISD::OR:
46990 case ISD::XOR:
46991 return false;
46992 }
46993 }
46994
46995 // Any legal type not explicitly accounted for above here is desirable.
46996 return true;
46997}
46998
46999SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
47000 SDValue Value, SDValue Addr,
47001 SelectionDAG &DAG) const {
47002 const Module *M = DAG.getMachineFunction().getMMI().getModule();
47003 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
47004 if (IsCFProtectionSupported) {
47005 // In case control-flow branch protection is enabled, we need to add
47006 // notrack prefix to the indirect branch.
47007 // In order to do that we create NT_BRIND SDNode.
47008 // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
47009 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
47010 }
47011
47012 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
47013}
47014
47015bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
47016 EVT VT = Op.getValueType();
47017 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
47018 isa<ConstantSDNode>(Op.getOperand(1));
47019
47020 // i16 is legal, but undesirable since i16 instruction encodings are longer
47021 // and some i16 instructions are slow.
47022 // 8-bit multiply-by-constant can usually be expanded to something cheaper
47023 // using LEA and/or other ALU ops.
47024 if (VT != MVT::i16 && !Is8BitMulByConstant)
47025 return false;
47026
47027 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
47028 if (!Op.hasOneUse())
47029 return false;
47030 SDNode *User = *Op->use_begin();
47031 if (!ISD::isNormalStore(User))
47032 return false;
47033 auto *Ld = cast<LoadSDNode>(Load);
47034 auto *St = cast<StoreSDNode>(User);
47035 return Ld->getBasePtr() == St->getBasePtr();
47036 };
47037
47038 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
47039 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
47040 return false;
47041 if (!Op.hasOneUse())
47042 return false;
47043 SDNode *User = *Op->use_begin();
47044 if (User->getOpcode() != ISD::ATOMIC_STORE)
47045 return false;
47046 auto *Ld = cast<AtomicSDNode>(Load);
47047 auto *St = cast<AtomicSDNode>(User);
47048 return Ld->getBasePtr() == St->getBasePtr();
47049 };
47050
47051 bool Commute = false;
47052 switch (Op.getOpcode()) {
47053 default: return false;
47054 case ISD::SIGN_EXTEND:
47055 case ISD::ZERO_EXTEND:
47056 case ISD::ANY_EXTEND:
47057 break;
47058 case ISD::SHL:
47059 case ISD::SRA:
47060 case ISD::SRL: {
47061 SDValue N0 = Op.getOperand(0);
47062 // Look out for (store (shl (load), x)).
47063 if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
47064 return false;
47065 break;
47066 }
47067 case ISD::ADD:
47068 case ISD::MUL:
47069 case ISD::AND:
47070 case ISD::OR:
47071 case ISD::XOR:
47072 Commute = true;
47073    LLVM_FALLTHROUGH;
47074 case ISD::SUB: {
47075 SDValue N0 = Op.getOperand(0);
47076 SDValue N1 = Op.getOperand(1);
47077 // Avoid disabling potential load folding opportunities.
47078 if (MayFoldLoad(N1) &&
47079 (!Commute || !isa<ConstantSDNode>(N0) ||
47080 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
47081 return false;
47082 if (MayFoldLoad(N0) &&
47083 ((Commute && !isa<ConstantSDNode>(N1)) ||
47084 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
47085 return false;
47086 if (IsFoldableAtomicRMW(N0, Op) ||
47087 (Commute && IsFoldableAtomicRMW(N1, Op)))
47088 return false;
47089 }
47090 }
47091
47092 PVT = MVT::i32;
47093 return true;
47094}
47095
47096//===----------------------------------------------------------------------===//
47097// X86 Inline Assembly Support
47098//===----------------------------------------------------------------------===//
47099
47100// Helper to match a string separated by whitespace.
47101static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
47102 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
47103
47104 for (StringRef Piece : Pieces) {
47105 if (!S.startswith(Piece)) // Check if the piece matches.
47106 return false;
47107
47108 S = S.substr(Piece.size());
47109 StringRef::size_type Pos = S.find_first_not_of(" \t");
47110 if (Pos == 0) // We matched a prefix.
47111 return false;
47112
47113 S = S.substr(Pos);
47114 }
47115
47116 return S.empty();
47117}
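// Illustrative uses of the matcher above (the inputs are example strings):
//   matchAsm("bswap $0",    {"bswap", "$0"})  --> true
//   matchAsm("bswapper $0", {"bswap", "$0"})  --> false  // "bswap" only matched
//                                                        // a prefix of a token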
47118
47119static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
47120
47121 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
47122 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
47123 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
47124 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
47125
47126 if (AsmPieces.size() == 3)
47127 return true;
47128 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
47129 return true;
47130 }
47131 }
47132 return false;
47133}
47134
47135bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
47136 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
47137
47138 const std::string &AsmStr = IA->getAsmString();
47139
47140 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
47141 if (!Ty || Ty->getBitWidth() % 16 != 0)
47142 return false;
47143
47144 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
47145 SmallVector<StringRef, 4> AsmPieces;
47146 SplitString(AsmStr, AsmPieces, ";\n");
47147
47148 switch (AsmPieces.size()) {
47149 default: return false;
47150 case 1:
47151 // FIXME: this should verify that we are targeting a 486 or better. If not,
47152 // we will turn this bswap into something that will be lowered to logical
47153 // ops instead of emitting the bswap asm. For now, we don't support 486 or
47154 // lower so don't worry about this.
47155 // bswap $0
47156 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
47157 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
47158 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
47159 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
47160 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
47161 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
47162 // No need to check constraints, nothing other than the equivalent of
47163 // "=r,0" would be valid here.
47164 return IntrinsicLowering::LowerToByteSwap(CI);
47165 }
47166
47167 // rorw $$8, ${0:w} --> llvm.bswap.i16
47168 if (CI->getType()->isIntegerTy(16) &&
47169 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
47170 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
47171 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
47172 AsmPieces.clear();
47173 StringRef ConstraintsStr = IA->getConstraintString();
47174 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
47175 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
47176 if (clobbersFlagRegisters(AsmPieces))
47177 return IntrinsicLowering::LowerToByteSwap(CI);
47178 }
47179 break;
47180 case 3:
47181 if (CI->getType()->isIntegerTy(32) &&
47182 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
47183 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
47184 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
47185 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
47186 AsmPieces.clear();
47187 StringRef ConstraintsStr = IA->getConstraintString();
47188 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
47189 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
47190 if (clobbersFlagRegisters(AsmPieces))
47191 return IntrinsicLowering::LowerToByteSwap(CI);
47192 }
47193
47194 if (CI->getType()->isIntegerTy(64)) {
47195 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
47196 if (Constraints.size() >= 2 &&
47197 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
47198 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
47199 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
47200 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
47201 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
47202 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
47203 return IntrinsicLowering::LowerToByteSwap(CI);
47204 }
47205 }
47206 break;
47207 }
47208 return false;
47209}
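// A sketch of the C-level inline asm this hook recognizes; the function name
// and exact constraints here are illustrative assumptions, not taken from the
// report:
//   unsigned bswap32(unsigned x) {
//     __asm__("bswap %0" : "+r"(x));  // IR asm string "bswap $0", matched above
//     return x;
//   }
// The asm call is replaced by llvm.bswap.i32 via
// IntrinsicLowering::LowerToByteSwap.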
47210
47211static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
47212 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
47213 .Case("{@cca}", X86::COND_A)
47214 .Case("{@ccae}", X86::COND_AE)
47215 .Case("{@ccb}", X86::COND_B)
47216 .Case("{@ccbe}", X86::COND_BE)
47217 .Case("{@ccc}", X86::COND_B)
47218 .Case("{@cce}", X86::COND_E)
47219 .Case("{@ccz}", X86::COND_E)
47220 .Case("{@ccg}", X86::COND_G)
47221 .Case("{@ccge}", X86::COND_GE)
47222 .Case("{@ccl}", X86::COND_L)
47223 .Case("{@ccle}", X86::COND_LE)
47224 .Case("{@ccna}", X86::COND_BE)
47225 .Case("{@ccnae}", X86::COND_B)
47226 .Case("{@ccnb}", X86::COND_AE)
47227 .Case("{@ccnbe}", X86::COND_A)
47228 .Case("{@ccnc}", X86::COND_AE)
47229 .Case("{@ccne}", X86::COND_NE)
47230 .Case("{@ccnz}", X86::COND_NE)
47231 .Case("{@ccng}", X86::COND_LE)
47232 .Case("{@ccnge}", X86::COND_L)
47233 .Case("{@ccnl}", X86::COND_GE)
47234 .Case("{@ccnle}", X86::COND_G)
47235 .Case("{@ccno}", X86::COND_NO)
47236 .Case("{@ccnp}", X86::COND_P)
47237 .Case("{@ccns}", X86::COND_NS)
47238 .Case("{@cco}", X86::COND_O)
47239 .Case("{@ccp}", X86::COND_P)
47240 .Case("{@ccs}", X86::COND_S)
47241 .Default(X86::COND_INVALID);
47242 return Cond;
47243}
47244
47245/// Given a constraint letter, return the type of constraint for this target.
47246X86TargetLowering::ConstraintType
47247X86TargetLowering::getConstraintType(StringRef Constraint) const {
47248 if (Constraint.size() == 1) {
47249 switch (Constraint[0]) {
47250 case 'R':
47251 case 'q':
47252 case 'Q':
47253 case 'f':
47254 case 't':
47255 case 'u':
47256 case 'y':
47257 case 'x':
47258 case 'v':
47259 case 'Y':
47260 case 'l':
47261 case 'k': // AVX512 masking registers.
47262 return C_RegisterClass;
47263 case 'a':
47264 case 'b':
47265 case 'c':
47266 case 'd':
47267 case 'S':
47268 case 'D':
47269 case 'A':
47270 return C_Register;
47271 case 'I':
47272 case 'J':
47273 case 'K':
47274 case 'N':
47275 case 'G':
47276 case 'L':
47277 case 'M':
47278 return C_Immediate;
47279 case 'C':
47280 case 'e':
47281 case 'Z':
47282 return C_Other;
47283 default:
47284 break;
47285 }
47286 }
47287 else if (Constraint.size() == 2) {
47288 switch (Constraint[0]) {
47289 default:
47290 break;
47291 case 'Y':
47292 switch (Constraint[1]) {
47293 default:
47294 break;
47295 case 'z':
47296 case '0':
47297 return C_Register;
47298 case 'i':
47299 case 'm':
47300 case 'k':
47301 case 't':
47302 case '2':
47303 return C_RegisterClass;
47304 }
47305 }
47306 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
47307 return C_Other;
47308 return TargetLowering::getConstraintType(Constraint);
47309}
47310
47311/// Examine constraint type and operand type and determine a weight value.
47312/// This object must already have been set up with the operand type
47313/// and the current alternative constraint selected.
47314TargetLowering::ConstraintWeight
47315 X86TargetLowering::getSingleConstraintMatchWeight(
47316 AsmOperandInfo &info, const char *constraint) const {
47317 ConstraintWeight weight = CW_Invalid;
47318 Value *CallOperandVal = info.CallOperandVal;
47319 // If we don't have a value, we can't do a match,
47320 // but allow it at the lowest weight.
47321 if (!CallOperandVal)
47322 return CW_Default;
47323 Type *type = CallOperandVal->getType();
47324 // Look at the constraint type.
47325 switch (*constraint) {
47326 default:
47327 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
47328    LLVM_FALLTHROUGH;
47329 case 'R':
47330 case 'q':
47331 case 'Q':
47332 case 'a':
47333 case 'b':
47334 case 'c':
47335 case 'd':
47336 case 'S':
47337 case 'D':
47338 case 'A':
47339 if (CallOperandVal->getType()->isIntegerTy())
47340 weight = CW_SpecificReg;
47341 break;
47342 case 'f':
47343 case 't':
47344 case 'u':
47345 if (type->isFloatingPointTy())
47346 weight = CW_SpecificReg;
47347 break;
47348 case 'y':
47349 if (type->isX86_MMXTy() && Subtarget.hasMMX())
47350 weight = CW_SpecificReg;
47351 break;
47352 case 'Y': {
47353 unsigned Size = StringRef(constraint).size();
47354    // When matching a bare 'Y', pick 'i' as the next char since 'Yi' and 'Y' are synonymous.
47355 char NextChar = Size == 2 ? constraint[1] : 'i';
47356 if (Size > 2)
47357 break;
47358 switch (NextChar) {
47359 default:
47360 return CW_Invalid;
47361 // XMM0
47362 case 'z':
47363 case '0':
47364 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
47365 return CW_SpecificReg;
47366 return CW_Invalid;
47367 // Conditional OpMask regs (AVX512)
47368 case 'k':
47369 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
47370 return CW_Register;
47371 return CW_Invalid;
47372 // Any MMX reg
47373 case 'm':
47374 if (type->isX86_MMXTy() && Subtarget.hasMMX())
47375 return weight;
47376 return CW_Invalid;
47377 // Any SSE reg when ISA >= SSE2, same as 'Y'
47378 case 'i':
47379 case 't':
47380 case '2':
47381 if (!Subtarget.hasSSE2())
47382 return CW_Invalid;
47383 break;
47384 }
47385 // Fall through (handle "Y" constraint).
47386    LLVM_FALLTHROUGH;
47387 }
47388 case 'v':
47389 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
47390 weight = CW_Register;
47391    LLVM_FALLTHROUGH;
47392 case 'x':
47393 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
47394 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
47395 weight = CW_Register;
47396 break;
47397 case 'k':
47398 // Enable conditional vector operations using %k<#> registers.
47399 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
47400 weight = CW_Register;
47401 break;
47402 case 'I':
47403 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
47404 if (C->getZExtValue() <= 31)
47405 weight = CW_Constant;
47406 }
47407 break;
47408 case 'J':
47409 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
47410 if (C->getZExtValue() <= 63)
47411 weight = CW_Constant;
47412 }
47413 break;
47414 case 'K':
47415 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
47416 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
47417 weight = CW_Constant;
47418 }
47419 break;
47420 case 'L':
47421 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
47422 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
47423 weight = CW_Constant;
47424 }
47425 break;
47426 case 'M':
47427 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
47428 if (C->getZExtValue() <= 3)
47429 weight = CW_Constant;
47430 }
47431 break;
47432 case 'N':
47433 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
47434 if (C->getZExtValue() <= 0xff)
47435 weight = CW_Constant;
47436 }
47437 break;
47438 case 'G':
47439 case 'C':
47440 if (isa<ConstantFP>(CallOperandVal)) {
47441 weight = CW_Constant;
47442 }
47443 break;
47444 case 'e':
47445 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
47446 if ((C->getSExtValue() >= -0x80000000LL) &&
47447 (C->getSExtValue() <= 0x7fffffffLL))
47448 weight = CW_Constant;
47449 }
47450 break;
47451 case 'Z':
47452 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
47453 if (C->getZExtValue() <= 0xffffffff)
47454 weight = CW_Constant;
47455 }
47456 break;
47457 }
47458 return weight;
47459}
47460
47461/// Try to replace an X constraint, which matches anything, with another that
47462/// has more specific requirements based on the type of the corresponding
47463/// operand.
47464const char *X86TargetLowering::
47465LowerXConstraint(EVT ConstraintVT) const {
47466 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
47467 // 'f' like normal targets.
47468 if (ConstraintVT.isFloatingPoint()) {
47469 if (Subtarget.hasSSE2())
47470 return "Y";
47471 if (Subtarget.hasSSE1())
47472 return "x";
47473 }
47474
47475 return TargetLowering::LowerXConstraint(ConstraintVT);
47476}
47477
47478// Lower @cc targets via setcc.
47479SDValue X86TargetLowering::LowerAsmOutputForConstraint(
47480 SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo,
47481 SelectionDAG &DAG) const {
47482 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
47483 if (Cond == X86::COND_INVALID)
47484 return SDValue();
47485 // Check that return type is valid.
47486 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
47487 OpInfo.ConstraintVT.getSizeInBits() < 8)
47488 report_fatal_error("Flag output operand is of invalid type");
47489
47490 // Get EFLAGS register. Only update chain when copyfrom is glued.
47491 if (Flag.getNode()) {
47492 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
47493 Chain = Flag.getValue(1);
47494 } else
47495 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
47496 // Extract CC code.
47497 SDValue CC = getSETCC(Cond, Flag, DL, DAG);
47498 // Extend to 32-bits
47499 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
47500
47501 return Result;
47502}
47503
47504/// Lower the specified operand into the Ops vector.
47505/// If it is invalid, don't add anything to Ops.
47506void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
47507 std::string &Constraint,
47508 std::vector<SDValue>&Ops,
47509 SelectionDAG &DAG) const {
47510 SDValue Result;
47511
47512 // Only support length 1 constraints for now.
47513 if (Constraint.length() > 1) return;
47514
47515 char ConstraintLetter = Constraint[0];
47516 switch (ConstraintLetter) {
47517 default: break;
47518 case 'I':
47519 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
47520 if (C->getZExtValue() <= 31) {
47521 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
47522 Op.getValueType());
47523 break;
47524 }
47525 }
47526 return;
47527 case 'J':
47528 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
47529 if (C->getZExtValue() <= 63) {
47530 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
47531 Op.getValueType());
47532 break;
47533 }
47534 }
47535 return;
47536 case 'K':
47537 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
47538 if (isInt<8>(C->getSExtValue())) {
47539 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
47540 Op.getValueType());
47541 break;
47542 }
47543 }
47544 return;
47545 case 'L':
47546 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
47547 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
47548 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
47549 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
47550 Op.getValueType());
47551 break;
47552 }
47553 }
47554 return;
47555 case 'M':
47556 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
47557 if (C->getZExtValue() <= 3) {
47558 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
47559 Op.getValueType());
47560 break;
47561 }
47562 }
47563 return;
47564 case 'N':
47565 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
47566 if (C->getZExtValue() <= 255) {
47567 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
47568 Op.getValueType());
47569 break;
47570 }
47571 }
47572 return;
47573 case 'O':
47574 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
47575 if (C->getZExtValue() <= 127) {
47576 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
47577 Op.getValueType());
47578 break;
47579 }
47580 }
47581 return;
47582 case 'e': {
47583 // 32-bit signed value
47584 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
47585 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
47586 C->getSExtValue())) {
47587 // Widen to 64 bits here to get it sign extended.
47588 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
47589 break;
47590 }
47591 // FIXME gcc accepts some relocatable values here too, but only in certain
47592 // memory models; it's complicated.
47593 }
47594 return;
47595 }
47596 case 'Z': {
47597 // 32-bit unsigned value
47598 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
47599 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
47600 C->getZExtValue())) {
47601 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
47602 Op.getValueType());
47603 break;
47604 }
47605 }
47606 // FIXME gcc accepts some relocatable values here too, but only in certain
47607 // memory models; it's complicated.
47608 return;
47609 }
47610 case 'i': {
47611 // Literal immediates are always ok.
47612 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
47613 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
47614 BooleanContent BCont = getBooleanContents(MVT::i64);
47615 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
47616 : ISD::SIGN_EXTEND;
47617 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
47618 : CST->getSExtValue();
47619 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
47620 break;
47621 }
47622
47623 // In any sort of PIC mode addresses need to be computed at runtime by
47624 // adding in a register or some sort of table lookup. These can't
47625 // be used as immediates.
47626 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
47627 return;
47628
47629 // If we are in non-pic codegen mode, we allow the address of a global (with
47630 // an optional displacement) to be used with 'i'.
47631 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
47632 // If we require an extra load to get this address, as in PIC mode, we
47633 // can't accept it.
47634 if (isGlobalStubReference(
47635 Subtarget.classifyGlobalReference(GA->getGlobal())))
47636 return;
47637 break;
47638 }
47639 }
47640
47641 if (Result.getNode()) {
47642 Ops.push_back(Result);
47643 return;
47644 }
47645 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
47646}
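// Illustrative behaviour of the 'I' constraint handling above (the values are
// examples):
//   "I"(3)   --> getTargetConstant(3, ...)  // within 0..31, accepted
//   "I"(64)  --> nothing pushed to Ops      // out of range, operand rejected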
47647
47648/// Check if \p RC is a general purpose register class.
47649/// I.e., GR* or one of their variant.
47650static bool isGRClass(const TargetRegisterClass &RC) {
47651 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
47652 RC.hasSuperClassEq(&X86::GR16RegClass) ||
47653 RC.hasSuperClassEq(&X86::GR32RegClass) ||
47654 RC.hasSuperClassEq(&X86::GR64RegClass) ||
47655 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
47656}
47657
47658/// Check if \p RC is a vector register class.
47659/// I.e., FR* / VR* or one of their variant.
47660static bool isFRClass(const TargetRegisterClass &RC) {
47661 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
47662 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
47663 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
47664 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
47665 RC.hasSuperClassEq(&X86::VR512RegClass);
47666}
47667
47668/// Check if \p RC is a mask register class.
47669/// I.e., VK* or one of their variant.
47670static bool isVKClass(const TargetRegisterClass &RC) {
47671 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
47672 RC.hasSuperClassEq(&X86::VK2RegClass) ||
47673 RC.hasSuperClassEq(&X86::VK4RegClass) ||
47674 RC.hasSuperClassEq(&X86::VK8RegClass) ||
47675 RC.hasSuperClassEq(&X86::VK16RegClass) ||
47676 RC.hasSuperClassEq(&X86::VK32RegClass) ||
47677 RC.hasSuperClassEq(&X86::VK64RegClass);
47678}
47679
47680std::pair<unsigned, const TargetRegisterClass *>
47681X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
47682 StringRef Constraint,
47683 MVT VT) const {
47684 // First, see if this is a constraint that directly corresponds to an LLVM
47685 // register class.
47686 if (Constraint.size() == 1) {
47687 // GCC Constraint Letters
47688 switch (Constraint[0]) {
47689 default: break;
47690 // 'A' means [ER]AX + [ER]DX.
47691 case 'A':
47692 if (Subtarget.is64Bit())
47693 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
47694      assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
47695             "Expecting 64, 32 or 16 bit subtarget");
47696 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
47697
47698 // TODO: Slight differences here in allocation order and leaving
47699 // RIP in the class. Do they matter any more here than they do
47700 // in the normal allocation?
47701 case 'k':
47702 if (Subtarget.hasAVX512()) {
47703 if (VT == MVT::i1)
47704 return std::make_pair(0U, &X86::VK1RegClass);
47705 if (VT == MVT::i8)
47706 return std::make_pair(0U, &X86::VK8RegClass);
47707 if (VT == MVT::i16)
47708 return std::make_pair(0U, &X86::VK16RegClass);
47709 }
47710 if (Subtarget.hasBWI()) {
47711 if (VT == MVT::i32)
47712 return std::make_pair(0U, &X86::VK32RegClass);
47713 if (VT == MVT::i64)
47714 return std::make_pair(0U, &X86::VK64RegClass);
47715 }
47716 break;
47717 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
47718 if (Subtarget.is64Bit()) {
47719 if (VT == MVT::i32 || VT == MVT::f32)
47720 return std::make_pair(0U, &X86::GR32RegClass);
47721 if (VT == MVT::i16)
47722 return std::make_pair(0U, &X86::GR16RegClass);
47723 if (VT == MVT::i8 || VT == MVT::i1)
47724 return std::make_pair(0U, &X86::GR8RegClass);
47725 if (VT == MVT::i64 || VT == MVT::f64)
47726 return std::make_pair(0U, &X86::GR64RegClass);
47727 break;
47728 }
47729      LLVM_FALLTHROUGH;
47730 // 32-bit fallthrough
47731 case 'Q': // Q_REGS
47732 if (VT == MVT::i32 || VT == MVT::f32)
47733 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
47734 if (VT == MVT::i16)
47735 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
47736 if (VT == MVT::i8 || VT == MVT::i1)
47737 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
47738 if (VT == MVT::i64)
47739 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
47740 break;
47741 case 'r': // GENERAL_REGS
47742 case 'l': // INDEX_REGS
47743 if (VT == MVT::i8 || VT == MVT::i1)
47744 return std::make_pair(0U, &X86::GR8RegClass);
47745 if (VT == MVT::i16)
47746 return std::make_pair(0U, &X86::GR16RegClass);
47747 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
47748 return std::make_pair(0U, &X86::GR32RegClass);
47749 return std::make_pair(0U, &X86::GR64RegClass);
47750 case 'R': // LEGACY_REGS
47751 if (VT == MVT::i8 || VT == MVT::i1)
47752 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
47753 if (VT == MVT::i16)
47754 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
47755 if (VT == MVT::i32 || !Subtarget.is64Bit())
47756 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
47757 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
47758 case 'f': // FP Stack registers.
47759 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
47760 // value to the correct fpstack register class.
47761 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
47762 return std::make_pair(0U, &X86::RFP32RegClass);
47763 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
47764 return std::make_pair(0U, &X86::RFP64RegClass);
47765 return std::make_pair(0U, &X86::RFP80RegClass);
47766 case 'y': // MMX_REGS if MMX allowed.
47767 if (!Subtarget.hasMMX()) break;
47768 return std::make_pair(0U, &X86::VR64RegClass);
47769 case 'Y': // SSE_REGS if SSE2 allowed
47770 if (!Subtarget.hasSSE2()) break;
47771      LLVM_FALLTHROUGH;
47772 case 'v':
47773 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
47774 if (!Subtarget.hasSSE1()) break;
47775 bool VConstraint = (Constraint[0] == 'v');
47776
47777 switch (VT.SimpleTy) {
47778 default: break;
47779 // Scalar SSE types.
47780 case MVT::f32:
47781 case MVT::i32:
47782 if (VConstraint && Subtarget.hasVLX())
47783 return std::make_pair(0U, &X86::FR32XRegClass);
47784 return std::make_pair(0U, &X86::FR32RegClass);
47785 case MVT::f64:
47786 case MVT::i64:
47787 if (VConstraint && Subtarget.hasVLX())
47788 return std::make_pair(0U, &X86::FR64XRegClass);
47789 return std::make_pair(0U, &X86::FR64RegClass);
47790 // TODO: Handle i128 in FR128RegClass after it is tested well.
47791 // Vector types and fp128.
47792 case MVT::f128:
47793 case MVT::v16i8:
47794 case MVT::v8i16:
47795 case MVT::v4i32:
47796 case MVT::v2i64:
47797 case MVT::v4f32:
47798 case MVT::v2f64:
47799 if (VConstraint && Subtarget.hasVLX())
47800 return std::make_pair(0U, &X86::VR128XRegClass);
47801 return std::make_pair(0U, &X86::VR128RegClass);
47802 // AVX types.
47803 case MVT::v32i8:
47804 case MVT::v16i16:
47805 case MVT::v8i32:
47806 case MVT::v4i64:
47807 case MVT::v8f32:
47808 case MVT::v4f64:
47809 if (VConstraint && Subtarget.hasVLX())
47810 return std::make_pair(0U, &X86::VR256XRegClass);
47811 if (Subtarget.hasAVX())
47812 return std::make_pair(0U, &X86::VR256RegClass);
47813 break;
47814 case MVT::v8f64:
47815 case MVT::v16f32:
47816 case MVT::v16i32:
47817 case MVT::v8i64:
47818 if (!Subtarget.hasAVX512()) break;
47819 if (VConstraint)
47820 return std::make_pair(0U, &X86::VR512RegClass);
47821 return std::make_pair(0U, &X86::VR512_0_15RegClass);
47822 }
47823 break;
47824 }
47825 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
47826 switch (Constraint[1]) {
47827 default:
47828 break;
47829 case 'i':
47830 case 't':
47831 case '2':
47832 return getRegForInlineAsmConstraint(TRI, "Y", VT);
47833 case 'm':
47834 if (!Subtarget.hasMMX()) break;
47835 return std::make_pair(0U, &X86::VR64RegClass);
47836 case 'z':
47837 case '0':
47838 if (!Subtarget.hasSSE1()) break;
47839 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
47840 case 'k':
47841      // This register class doesn't allocate k0 for masked vector operations.
47842 if (Subtarget.hasAVX512()) {
47843 if (VT == MVT::i1)
47844 return std::make_pair(0U, &X86::VK1WMRegClass);
47845 if (VT == MVT::i8)
47846 return std::make_pair(0U, &X86::VK8WMRegClass);
47847 if (VT == MVT::i16)
47848 return std::make_pair(0U, &X86::VK16WMRegClass);
47849 }
47850 if (Subtarget.hasBWI()) {
47851 if (VT == MVT::i32)
47852 return std::make_pair(0U, &X86::VK32WMRegClass);
47853 if (VT == MVT::i64)
47854 return std::make_pair(0U, &X86::VK64WMRegClass);
47855 }
47856 break;
47857 }
47858 }
47859
47860 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
47861 return std::make_pair(0U, &X86::GR32RegClass);
47862
47863 // Use the default implementation in TargetLowering to convert the register
47864 // constraint into a member of a register class.
47865 std::pair<unsigned, const TargetRegisterClass*> Res;
47866 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
47867
47868 // Not found as a standard register?
47869 if (!Res.second) {
47870    // Map st(0) through st(7) to the FP0 through FP7 registers.
47871 if (Constraint.size() == 7 && Constraint[0] == '{' &&
47872 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
47873 Constraint[3] == '(' &&
47874 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
47875 Constraint[5] == ')' && Constraint[6] == '}') {
47876 // st(7) is not allocatable and thus not a member of RFP80. Return
47877 // singleton class in cases where we have a reference to it.
47878 if (Constraint[4] == '7')
47879 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
47880 return std::make_pair(X86::FP0 + Constraint[4] - '0',
47881 &X86::RFP80RegClass);
47882 }
47883
47884 // GCC allows "st(0)" to be called just plain "st".
47885 if (StringRef("{st}").equals_lower(Constraint))
47886 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
47887
47888 // flags -> EFLAGS
47889 if (StringRef("{flags}").equals_lower(Constraint))
47890 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
47891
47892 // dirflag -> DF
47893 if (StringRef("{dirflag}").equals_lower(Constraint))
47894 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
47895
47896 // fpsr -> FPSW
47897 if (StringRef("{fpsr}").equals_lower(Constraint))
47898 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
47899
47900 return Res;
47901 }
47902
47903 // Make sure it isn't a register that requires 64-bit mode.
47904 if (!Subtarget.is64Bit() &&
47905 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
47906 TRI->getEncodingValue(Res.first) >= 8) {
47907 // Register requires REX prefix, but we're in 32-bit mode.
47908 return std::make_pair(0, nullptr);
47909 }
47910
47911 // Make sure it isn't a register that requires AVX512.
47912 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
47913 TRI->getEncodingValue(Res.first) & 0x10) {
47914 // Register requires EVEX prefix.
47915 return std::make_pair(0, nullptr);
47916 }
47917
47918 // Otherwise, check to see if this is a register class of the wrong value
47919 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
47920 // turn into {ax},{dx}.
47921 // MVT::Other is used to specify clobber names.
47922 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
47923 return Res; // Correct type already, nothing to do.
47924
47925  // Get a matching integer of the correct size. E.g., "ax" with MVT::i32 should
47926  // return "eax". This should even work for things like getting 64-bit integer
47927 // registers when given an f64 type.
47928 const TargetRegisterClass *Class = Res.second;
47929 // The generic code will match the first register class that contains the
47930 // given register. Thus, based on the ordering of the tablegened file,
47931 // the "plain" GR classes might not come first.
47932 // Therefore, use a helper method.
47933 if (isGRClass(*Class)) {
47934 unsigned Size = VT.getSizeInBits();
47935 if (Size == 1) Size = 8;
47936 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
47937 if (DestReg > 0) {
47938 bool is64Bit = Subtarget.is64Bit();
47939 const TargetRegisterClass *RC =
47940 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
47941 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
47942 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
47943 : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
47944 : nullptr;
47945 if (Size == 64 && !is64Bit) {
47946 // Model GCC's behavior here and select a fixed pair of 32-bit
47947 // registers.
47948 switch (DestReg) {
47949 case X86::RAX:
47950 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
47951 case X86::RDX:
47952 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
47953 case X86::RCX:
47954 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
47955 case X86::RBX:
47956 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
47957 case X86::RSI:
47958 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
47959 case X86::RDI:
47960 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
47961 case X86::RBP:
47962 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
47963 default:
47964 return std::make_pair(0, nullptr);
47965 }
47966 }
47967 if (RC && RC->contains(DestReg))
47968 return std::make_pair(DestReg, RC);
47969 return Res;
47970 }
47971 // No register found/type mismatch.
47972 return std::make_pair(0, nullptr);
47973 } else if (isFRClass(*Class)) {
47974 // Handle references to XMM physical registers that got mapped into the
47975 // wrong class. This can happen with constraints like {xmm0} where the
47976 // target independent register mapper will just pick the first match it can
47977 // find, ignoring the required type.
47978
47979 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
47980 if (VT == MVT::f32 || VT == MVT::i32)
47981 Res.second = &X86::FR32XRegClass;
47982 else if (VT == MVT::f64 || VT == MVT::i64)
47983 Res.second = &X86::FR64XRegClass;
47984 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
47985 Res.second = &X86::VR128XRegClass;
47986 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
47987 Res.second = &X86::VR256XRegClass;
47988 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
47989 Res.second = &X86::VR512RegClass;
47990 else {
47991      // Type mismatch and not a clobber: return an error.
47992 Res.first = 0;
47993 Res.second = nullptr;
47994 }
47995 } else if (isVKClass(*Class)) {
47996 if (VT == MVT::i1)
47997 Res.second = &X86::VK1RegClass;
47998 else if (VT == MVT::i8)
47999 Res.second = &X86::VK8RegClass;
48000 else if (VT == MVT::i16)
48001 Res.second = &X86::VK16RegClass;
48002 else if (VT == MVT::i32)
48003 Res.second = &X86::VK32RegClass;
48004 else if (VT == MVT::i64)
48005 Res.second = &X86::VK64RegClass;
48006 else {
48007      // Type mismatch and not a clobber: return an error.
48008 Res.first = 0;
48009 Res.second = nullptr;
48010 }
48011 }
48012
48013 return Res;
48014}
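At the source level this mapping is what resolves GCC-style inline asm constraints for x86. A hedged illustration of constraints that exercise the paths above (register choices are ultimately up to the allocator; the mask example additionally needs an AVX-512 target such as -mavx512f, and the XMM0 example needs SSE):

#include <stdint.h>

// "=k"/"k" with 16-bit operands resolves to VK16RegClass (AVX-512 mask regs).
static inline uint16_t mask_and(uint16_t a, uint16_t b) {
  uint16_t r;
  __asm__("kandw %1, %2, %0" : "=k"(r) : "k"(a), "k"(b));
  return r;
}

// "Yz" pins the input to XMM0 (the 'z'/'0' case of the two-letter 'Y'
// constraints above); "=x" is any SSE register.
static inline float copy_via_xmm0(float x) {
  float r;
  __asm__("movaps %1, %0" : "=x"(r) : "Yz"(x));
  return r;
}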
48015
48016int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
48017 const AddrMode &AM, Type *Ty,
48018 unsigned AS) const {
48019 // Scaling factors are not free at all.
48020 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
48021 // will take 2 allocations in the out of order engine instead of 1
48022 // for plain addressing mode, i.e. inst (reg1).
48023 // E.g.,
48024 // vaddps (%rsi,%rdx), %ymm0, %ymm1
48025 // Requires two allocations (one for the load, one for the computation)
48026 // whereas:
48027 // vaddps (%rsi), %ymm0, %ymm1
48028 // Requires just 1 allocation, i.e., freeing allocations for other operations
48029 // and having less micro operations to execute.
48030 //
48031 // For some X86 architectures, this is even worse because for instance for
48032 // stores, the complex addressing mode forces the instruction to use the
48033 // "load" ports instead of the dedicated "store" port.
48034 // E.g., on Haswell:
48035 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
48036 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
48037 if (isLegalAddressingMode(DL, AM, Ty, AS))
48038 // Scale represents reg2 * scale, thus account for 1
48039 // as soon as we use a second register.
48040 return AM.Scale != 0;
48041 return -1;
48042}
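A small worked sketch of what that cost means in terms of TargetLoweringBase::AddrMode (the field values are hypothetical, and the result still depends on isLegalAddressingMode accepting each mode for the given type and address space):

// Sketch: TLI is an X86TargetLowering reference; DL/Ty/AS come from the caller.
static int scalingCostExample(const X86TargetLowering &TLI, const DataLayout &DL,
                              Type *Ty, unsigned AS) {
  TargetLoweringBase::AddrMode AM;
  AM.HasBaseReg = true;
  AM.Scale = 0;                                         // (%rsi)        -> cost 0
  int Plain = TLI.getScalingFactorCost(DL, AM, Ty, AS);

  AM.Scale = 4;                                         // (%rsi,%rdx,4) -> cost 1
  int Scaled = TLI.getScalingFactorCost(DL, AM, Ty, AS);

  // One extra out-of-order allocation as soon as an index register is used.
  return Scaled - Plain;
}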
48043
48044bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
48045 // Integer division on x86 is expensive. However, when aggressively optimizing
48046 // for code size, we prefer to use a div instruction, as it is usually smaller
48047 // than the alternative sequence.
48048 // The exception to this is vector division. Since x86 doesn't have vector
48049 // integer division, leaving the division as-is is a loss even in terms of
48050 // size, because it will have to be scalarized, while the alternative code
48051 // sequence can be performed in vector form.
48052 bool OptSize =
48053 Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
48054 return OptSize && !VT.isVector();
48055}
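In other words, only a scalar division inside a minsize function keeps the div instruction; vector divisions are always worth expanding. A hedged sketch of querying the hook (the wrapper name is made up):

// Sketch: Attrs would normally be the attribute list of the function being
// compiled; minsize on that function is what makes the scalar case cheap.
static bool keepsScalarDivOnly(const X86TargetLowering &TLI, AttributeList Attrs) {
  bool ScalarCheap = TLI.isIntDivCheap(MVT::i32, Attrs);   // true only with minsize
  bool VectorCheap = TLI.isIntDivCheap(MVT::v4i32, Attrs); // always false (isVector())
  return ScalarCheap && !VectorCheap;
}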
48056
48057void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
48058 if (!Subtarget.is64Bit())
48059 return;
48060
48061 // Update IsSplitCSR in X86MachineFunctionInfo.
48062 X86MachineFunctionInfo *AFI =
48063 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
48064 AFI->setIsSplitCSR(true);
48065}
48066
48067void X86TargetLowering::insertCopiesSplitCSR(
48068 MachineBasicBlock *Entry,
48069 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
48070 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
48071 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
48072 if (!IStart)
48073 return;
48074
48075 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
48076 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
48077 MachineBasicBlock::iterator MBBI = Entry->begin();
48078 for (const MCPhysReg *I = IStart; *I; ++I) {
48079 const TargetRegisterClass *RC = nullptr;
48080 if (X86::GR64RegClass.contains(*I))
48081 RC = &X86::GR64RegClass;
48082 else
48083      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
48084
48085 Register NewVR = MRI->createVirtualRegister(RC);
48086 // Create copy from CSR to a virtual register.
48087 // FIXME: this currently does not emit CFI pseudo-instructions, it works
48088 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
48089 // nounwind. If we want to generalize this later, we may need to emit
48090 // CFI pseudo-instructions.
48091    assert(
48092        Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
48093        "Function should be nounwind in insertCopiesSplitCSR!");
48094 Entry->addLiveIn(*I);
48095 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
48096 .addReg(*I);
48097
48098 // Insert the copy-back instructions right before the terminator.
48099 for (auto *Exit : Exits)
48100 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
48101 TII->get(TargetOpcode::COPY), *I)
48102 .addReg(NewVR);
48103 }
48104}
48105
48106bool X86TargetLowering::supportSwiftError() const {
48107 return Subtarget.is64Bit();
48108}
48109
48110/// Returns true if stack probing through a function call is requested.
48111bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
48112 return !getStackProbeSymbolName(MF).empty();
48113}
48114
48115/// Returns true if stack probing through inline assembly is requested.
48116bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
48117
48118  // No inline stack probe for Windows; it has its own mechanism.
48119 if (Subtarget.isOSWindows() ||
48120 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
48121 return false;
48122
48123 // If the function specifically requests inline stack probes, emit them.
48124 if (MF.getFunction().hasFnAttribute("probe-stack"))
48125 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
48126 "inline-asm";
48127
48128 return false;
48129}
48130
48131/// Returns the name of the symbol used to emit stack probes or the empty
48132/// string if not applicable.
48133StringRef
48134X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
48135  // Inline stack probes disable the stack probe call.
48136 if (hasInlineStackProbe(MF))
48137 return "";
48138
48139 // If the function specifically requests stack probes, emit them.
48140 if (MF.getFunction().hasFnAttribute("probe-stack"))
48141 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
48142
48143 // Generally, if we aren't on Windows, the platform ABI does not include
48144 // support for stack probes, so don't emit them.
48145 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
48146 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
48147 return "";
48148
48149 // We need a stack probe to conform to the Windows ABI. Choose the right
48150 // symbol.
48151 if (Subtarget.is64Bit())
48152 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
48153 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
48154}
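Both probe hooks are driven by string attributes on the IR function. A minimal sketch of how a front end or pass might request inline probing (attribute spellings as read above; the helper name is made up):

#include "llvm/IR/Function.h"

// Sketch: ask for inline stack probes with an 8 KiB probe interval.
static void requestInlineProbes(llvm::Function &F) {
  F.addFnAttr("probe-stack", "inline-asm");  // hasInlineStackProbe() -> true
  F.addFnAttr("stack-probe-size", "8192");   // read by getStackProbeSize()
}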
48155
48156unsigned
48157X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
48158  // The default stack probe size is 4096 if the function has no
48159  // "stack-probe-size" attribute.
48160 unsigned StackProbeSize = 4096;
48161 const Function &Fn = MF.getFunction();
48162 if (Fn.hasFnAttribute("stack-probe-size"))
48163 Fn.getFnAttribute("stack-probe-size")
48164 .getValueAsString()
48165 .getAsInteger(0, StackProbeSize);
48166 return StackProbeSize;
48167}
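The attribute value is parsed with StringRef::getAsInteger, which only overwrites the output on success, so a malformed attribute silently keeps the 4096 default. A small sketch of that behaviour:

#include "llvm/ADT/StringRef.h"

static unsigned parseProbeSize(llvm::StringRef Val) {
  unsigned StackProbeSize = 4096;   // default when the attribute is absent or bad
  // Radix 0 auto-detects the base; on failure the call returns true and
  // leaves StackProbeSize untouched.
  Val.getAsInteger(0, StackProbeSize);
  return StackProbeSize;            // "8192" -> 8192, "bogus" -> 4096
}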

/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/include/llvm/Support/MachineValueType.h

1//===- Support/MachineValueType.h - Machine-Level types ---------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the set of machine-level target independent types which
10// legal values in the code generator use.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_SUPPORT_MACHINEVALUETYPE_H
15#define LLVM_SUPPORT_MACHINEVALUETYPE_H
16
17#include "llvm/ADT/iterator_range.h"
18#include "llvm/Support/ErrorHandling.h"
19#include "llvm/Support/MathExtras.h"
20#include "llvm/Support/TypeSize.h"
21#include <cassert>
22
23namespace llvm {
24
25 class Type;
26
27 /// Machine Value Type. Every type that is supported natively by some
28 /// processor targeted by LLVM occurs here. This means that any legal value
29 /// type can be represented by an MVT.
30 class MVT {
31 public:
32 enum SimpleValueType : uint8_t {
33 // Simple value types that aren't explicitly part of this enumeration
34 // are considered extended value types.
35 INVALID_SIMPLE_VALUE_TYPE = 0,
36
37 // If you change this numbering, you must change the values in
38 // ValueTypes.td as well!
39 Other = 1, // This is a non-standard value
40 i1 = 2, // This is a 1 bit integer value
41 i8 = 3, // This is an 8 bit integer value
42 i16 = 4, // This is a 16 bit integer value
43 i32 = 5, // This is a 32 bit integer value
44 i64 = 6, // This is a 64 bit integer value
45 i128 = 7, // This is a 128 bit integer value
46
47 FIRST_INTEGER_VALUETYPE = i1,
48 LAST_INTEGER_VALUETYPE = i128,
49
50 f16 = 8, // This is a 16 bit floating point value
51 f32 = 9, // This is a 32 bit floating point value
52 f64 = 10, // This is a 64 bit floating point value
53      f80            =  11,   // This is an 80 bit floating point value
54 f128 = 12, // This is a 128 bit floating point value
55 ppcf128 = 13, // This is a PPC 128-bit floating point value
56
57 FIRST_FP_VALUETYPE = f16,
58 LAST_FP_VALUETYPE = ppcf128,
59
60 v1i1 = 14, // 1 x i1
61 v2i1 = 15, // 2 x i1
62 v4i1 = 16, // 4 x i1
63 v8i1 = 17, // 8 x i1
64 v16i1 = 18, // 16 x i1
65 v32i1 = 19, // 32 x i1
66 v64i1 = 20, // 64 x i1
67 v128i1 = 21, // 128 x i1
68 v256i1 = 22, // 256 x i1
69 v512i1 = 23, // 512 x i1
70 v1024i1 = 24, // 1024 x i1
71
72 v1i8 = 25, // 1 x i8
73 v2i8 = 26, // 2 x i8
74 v4i8 = 27, // 4 x i8
75 v8i8 = 28, // 8 x i8
76 v16i8 = 29, // 16 x i8
77 v32i8 = 30, // 32 x i8
78 v64i8 = 31, // 64 x i8
79 v128i8 = 32, //128 x i8
80 v256i8 = 33, //256 x i8
81
82 v1i16 = 34, // 1 x i16
83 v2i16 = 35, // 2 x i16
84 v3i16 = 36, // 3 x i16
85 v4i16 = 37, // 4 x i16
86 v8i16 = 38, // 8 x i16
87 v16i16 = 39, // 16 x i16
88 v32i16 = 40, // 32 x i16
89 v64i16 = 41, // 64 x i16
90 v128i16 = 42, //128 x i16
91
92 v1i32 = 43, // 1 x i32
93 v2i32 = 44, // 2 x i32
94 v3i32 = 45, // 3 x i32
95 v4i32 = 46, // 4 x i32
96 v5i32 = 47, // 5 x i32
97 v8i32 = 48, // 8 x i32
98 v16i32 = 49, // 16 x i32
99 v32i32 = 50, // 32 x i32
100 v64i32 = 51, // 64 x i32
101 v128i32 = 52, // 128 x i32
102 v256i32 = 53, // 256 x i32
103 v512i32 = 54, // 512 x i32
104 v1024i32 = 55, // 1024 x i32
105 v2048i32 = 56, // 2048 x i32
106
107 v1i64 = 57, // 1 x i64
108 v2i64 = 58, // 2 x i64
109 v4i64 = 59, // 4 x i64
110 v8i64 = 60, // 8 x i64
111 v16i64 = 61, // 16 x i64
112 v32i64 = 62, // 32 x i64
113
114 v1i128 = 63, // 1 x i128
115
116 FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE = v1i1,
117 LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE = v1i128,
118
119 v2f16 = 64, // 2 x f16
120 v3f16 = 65, // 3 x f16
121 v4f16 = 66, // 4 x f16
122 v8f16 = 67, // 8 x f16
123 v16f16 = 68, // 16 x f16
124 v32f16 = 69, // 32 x f16
125 v1f32 = 70, // 1 x f32
126 v2f32 = 71, // 2 x f32
127 v3f32 = 72, // 3 x f32
128 v4f32 = 73, // 4 x f32
129 v5f32 = 74, // 5 x f32
130 v8f32 = 75, // 8 x f32
131 v16f32 = 76, // 16 x f32
132 v32f32 = 77, // 32 x f32
133 v64f32 = 78, // 64 x f32
134 v128f32 = 79, // 128 x f32
135 v256f32 = 80, // 256 x f32
136 v512f32 = 81, // 512 x f32
137 v1024f32 = 82, // 1024 x f32
138 v2048f32 = 83, // 2048 x f32
139 v1f64 = 84, // 1 x f64
140 v2f64 = 85, // 2 x f64
141 v4f64 = 86, // 4 x f64
142 v8f64 = 87, // 8 x f64
143
144 FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE = v2f16,
145 LAST_FP_FIXEDLEN_VECTOR_VALUETYPE = v8f64,
146
147 FIRST_FIXEDLEN_VECTOR_VALUETYPE = v1i1,
148 LAST_FIXEDLEN_VECTOR_VALUETYPE = v8f64,
149
150 nxv1i1 = 88, // n x 1 x i1
151 nxv2i1 = 89, // n x 2 x i1
152 nxv4i1 = 90, // n x 4 x i1
153 nxv8i1 = 91, // n x 8 x i1
154 nxv16i1 = 92, // n x 16 x i1
155 nxv32i1 = 93, // n x 32 x i1
156
157 nxv1i8 = 94, // n x 1 x i8
158 nxv2i8 = 95, // n x 2 x i8
159 nxv4i8 = 96, // n x 4 x i8
160 nxv8i8 = 97, // n x 8 x i8
161 nxv16i8 = 98, // n x 16 x i8
162 nxv32i8 = 99, // n x 32 x i8
163
164 nxv1i16 = 100, // n x 1 x i16
165 nxv2i16 = 101, // n x 2 x i16
166 nxv4i16 = 102, // n x 4 x i16
167 nxv8i16 = 103, // n x 8 x i16
168 nxv16i16 = 104, // n x 16 x i16
169 nxv32i16 = 105, // n x 32 x i16
170
171 nxv1i32 = 106, // n x 1 x i32
172 nxv2i32 = 107, // n x 2 x i32
173 nxv4i32 = 108, // n x 4 x i32
174 nxv8i32 = 109, // n x 8 x i32
175 nxv16i32 = 110, // n x 16 x i32
176 nxv32i32 = 111, // n x 32 x i32
177
178 nxv1i64 = 112, // n x 1 x i64
179 nxv2i64 = 113, // n x 2 x i64
180 nxv4i64 = 114, // n x 4 x i64
181 nxv8i64 = 115, // n x 8 x i64
182 nxv16i64 = 116, // n x 16 x i64
183 nxv32i64 = 117, // n x 32 x i64
184
185 FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE = nxv1i1,
186 LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE = nxv32i64,
187
188 nxv2f16 = 118, // n x 2 x f16
189 nxv4f16 = 119, // n x 4 x f16
190 nxv8f16 = 120, // n x 8 x f16
191 nxv1f32 = 121, // n x 1 x f32
192 nxv2f32 = 122, // n x 2 x f32
193 nxv4f32 = 123, // n x 4 x f32
194 nxv8f32 = 124, // n x 8 x f32
195 nxv16f32 = 125, // n x 16 x f32
196 nxv1f64 = 126, // n x 1 x f64
197 nxv2f64 = 127, // n x 2 x f64
198 nxv4f64 = 128, // n x 4 x f64
199 nxv8f64 = 129, // n x 8 x f64
200
201 FIRST_FP_SCALABLE_VECTOR_VALUETYPE = nxv2f16,
202 LAST_FP_SCALABLE_VECTOR_VALUETYPE = nxv8f64,
203
204 FIRST_SCALABLE_VECTOR_VALUETYPE = nxv1i1,
205 LAST_SCALABLE_VECTOR_VALUETYPE = nxv8f64,
206
207 FIRST_VECTOR_VALUETYPE = v1i1,
208 LAST_VECTOR_VALUETYPE = nxv8f64,
209
210 x86mmx = 130, // This is an X86 MMX value
211
212 Glue = 131, // This glues nodes together during pre-RA sched
213
214 isVoid = 132, // This has no value
215
216 Untyped = 133, // This value takes a register, but has
217 // unspecified type. The register class
218 // will be determined by the opcode.
219
220 exnref = 134, // WebAssembly's exnref type
221
222 FIRST_VALUETYPE = 1, // This is always the beginning of the list.
223 LAST_VALUETYPE = 135, // This always remains at the end of the list.
224
225 // This is the current maximum for LAST_VALUETYPE.
226 // MVT::MAX_ALLOWED_VALUETYPE is used for asserts and to size bit vectors
227 // This value must be a multiple of 32.
228 MAX_ALLOWED_VALUETYPE = 160,
229
230 // A value of type llvm::TokenTy
231 token = 248,
232
233 // This is MDNode or MDString.
234 Metadata = 249,
235
236 // An int value the size of the pointer of the current
237 // target to any address space. This must only be used internal to
238 // tblgen. Other than for overloading, we treat iPTRAny the same as iPTR.
239 iPTRAny = 250,
240
241 // A vector with any length and element size. This is used
242 // for intrinsics that have overloadings based on vector types.
243 // This is only for tblgen's consumption!
244 vAny = 251,
245
246 // Any floating-point or vector floating-point value. This is used
247 // for intrinsics that have overloadings based on floating-point types.
248 // This is only for tblgen's consumption!
249 fAny = 252,
250
251 // An integer or vector integer value of any bit width. This is
252 // used for intrinsics that have overloadings based on integer bit widths.
253 // This is only for tblgen's consumption!
254 iAny = 253,
255
256 // An int value the size of the pointer of the current
257 // target. This should only be used internal to tblgen!
258 iPTR = 254,
259
260 // Any type. This is used for intrinsics that have overloadings.
261 // This is only for tblgen's consumption!
262 Any = 255
263 };
264
265 SimpleValueType SimpleTy = INVALID_SIMPLE_VALUE_TYPE;
266
267 constexpr MVT() = default;
268 constexpr MVT(SimpleValueType SVT) : SimpleTy(SVT) {}
269
270 bool operator>(const MVT& S) const { return SimpleTy > S.SimpleTy; }
271 bool operator<(const MVT& S) const { return SimpleTy < S.SimpleTy; }
272 bool operator==(const MVT& S) const { return SimpleTy == S.SimpleTy; }
273 bool operator!=(const MVT& S) const { return SimpleTy != S.SimpleTy; }
274 bool operator>=(const MVT& S) const { return SimpleTy >= S.SimpleTy; }
275 bool operator<=(const MVT& S) const { return SimpleTy <= S.SimpleTy; }
276
277 /// Return true if this is a valid simple valuetype.
278 bool isValid() const {
279 return (SimpleTy >= MVT::FIRST_VALUETYPE &&
280 SimpleTy < MVT::LAST_VALUETYPE);
281 }
282
283 /// Return true if this is a FP or a vector FP type.
284 bool isFloatingPoint() const {
285 return ((SimpleTy >= MVT::FIRST_FP_VALUETYPE &&
286 SimpleTy <= MVT::LAST_FP_VALUETYPE) ||
287 (SimpleTy >= MVT::FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE &&
288 SimpleTy <= MVT::LAST_FP_FIXEDLEN_VECTOR_VALUETYPE) ||
289 (SimpleTy >= MVT::FIRST_FP_SCALABLE_VECTOR_VALUETYPE &&
290 SimpleTy <= MVT::LAST_FP_SCALABLE_VECTOR_VALUETYPE));
291 }
292
293 /// Return true if this is an integer or a vector integer type.
294 bool isInteger() const {
295 return ((SimpleTy >= MVT::FIRST_INTEGER_VALUETYPE &&
296 SimpleTy <= MVT::LAST_INTEGER_VALUETYPE) ||
297 (SimpleTy >= MVT::FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE &&
298 SimpleTy <= MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE) ||
299 (SimpleTy >= MVT::FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE &&
300 SimpleTy <= MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE));
301 }
302
303 /// Return true if this is an integer, not including vectors.
304 bool isScalarInteger() const {
305 return (SimpleTy >= MVT::FIRST_INTEGER_VALUETYPE &&
306 SimpleTy <= MVT::LAST_INTEGER_VALUETYPE);
307 }
308
309 /// Return true if this is a vector value type.
310 bool isVector() const {
311      return (SimpleTy >= MVT::FIRST_VECTOR_VALUETYPE &&
  7     Assuming field 'SimpleTy' is >= FIRST_VECTOR_VALUETYPE
  20.1  Field 'SimpleTy' is >= FIRST_VECTOR_VALUETYPE
312              SimpleTy <= MVT::LAST_VECTOR_VALUETYPE);
  8     Assuming field 'SimpleTy' is <= LAST_VECTOR_VALUETYPE
  20.2  Field 'SimpleTy' is <= LAST_VECTOR_VALUETYPE
  9     Returning the value 1, which participates in a condition later
  21    Returning the value 1, which participates in a condition later
313 }
314
315 /// Return true if this is a vector value type where the
316 /// runtime length is machine dependent
317 bool isScalableVector() const {
318 return (SimpleTy >= MVT::FIRST_SCALABLE_VECTOR_VALUETYPE &&
319 SimpleTy <= MVT::LAST_SCALABLE_VECTOR_VALUETYPE);
320 }
321
322 bool isFixedLengthVector() const {
323 return (SimpleTy >= MVT::FIRST_FIXEDLEN_VECTOR_VALUETYPE &&
324 SimpleTy <= MVT::LAST_FIXEDLEN_VECTOR_VALUETYPE);
325 }
326
327 /// Return true if this is a 16-bit vector type.
328 bool is16BitVector() const {
329 return (SimpleTy == MVT::v2i8 || SimpleTy == MVT::v1i16 ||
330 SimpleTy == MVT::v16i1);
331 }
332
333 /// Return true if this is a 32-bit vector type.
334 bool is32BitVector() const {
335 return (SimpleTy == MVT::v32i1 || SimpleTy == MVT::v4i8 ||
336 SimpleTy == MVT::v2i16 || SimpleTy == MVT::v1i32 ||
337 SimpleTy == MVT::v2f16 || SimpleTy == MVT::v1f32);
338 }
339
340 /// Return true if this is a 64-bit vector type.
341 bool is64BitVector() const {
342 return (SimpleTy == MVT::v64i1 || SimpleTy == MVT::v8i8 ||
343 SimpleTy == MVT::v4i16 || SimpleTy == MVT::v2i32 ||
344 SimpleTy == MVT::v1i64 || SimpleTy == MVT::v4f16 ||
345 SimpleTy == MVT::v2f32 || SimpleTy == MVT::v1f64);
346 }
347
348 /// Return true if this is a 128-bit vector type.
349 bool is128BitVector() const {
350 return (SimpleTy == MVT::v128i1 || SimpleTy == MVT::v16i8 ||
351 SimpleTy == MVT::v8i16 || SimpleTy == MVT::v4i32 ||
352 SimpleTy == MVT::v2i64 || SimpleTy == MVT::v1i128 ||
353 SimpleTy == MVT::v8f16 || SimpleTy == MVT::v4f32 ||
354 SimpleTy == MVT::v2f64);
355 }
356
357 /// Return true if this is a 256-bit vector type.
358 bool is256BitVector() const {
359 return (SimpleTy == MVT::v16f16 || SimpleTy == MVT::v8f32 ||
360 SimpleTy == MVT::v4f64 || SimpleTy == MVT::v32i8 ||
361 SimpleTy == MVT::v16i16 || SimpleTy == MVT::v8i32 ||
362 SimpleTy == MVT::v4i64 || SimpleTy == MVT::v256i1);
363 }
364
365 /// Return true if this is a 512-bit vector type.
366 bool is512BitVector() const {
367 return (SimpleTy == MVT::v32f16 || SimpleTy == MVT::v16f32 ||
368 SimpleTy == MVT::v8f64 || SimpleTy == MVT::v512i1 ||
369 SimpleTy == MVT::v64i8 || SimpleTy == MVT::v32i16 ||
370 SimpleTy == MVT::v16i32 || SimpleTy == MVT::v8i64);
371 }
372
373 /// Return true if this is a 1024-bit vector type.
374 bool is1024BitVector() const {
375 return (SimpleTy == MVT::v1024i1 || SimpleTy == MVT::v128i8 ||
376 SimpleTy == MVT::v64i16 || SimpleTy == MVT::v32i32 ||
377 SimpleTy == MVT::v16i64);
378 }
379
380 /// Return true if this is a 2048-bit vector type.
381 bool is2048BitVector() const {
382 return (SimpleTy == MVT::v256i8 || SimpleTy == MVT::v128i16 ||
383 SimpleTy == MVT::v64i32 || SimpleTy == MVT::v32i64);
384 }
385
386 /// Return true if this is an overloaded type for TableGen.
387 bool isOverloaded() const {
388 return (SimpleTy==MVT::Any ||
389 SimpleTy==MVT::iAny || SimpleTy==MVT::fAny ||
390 SimpleTy==MVT::vAny || SimpleTy==MVT::iPTRAny);
391 }
392
393 /// Return a VT for a vector type with the same element type but
394 /// half the number of elements.
395 MVT getHalfNumVectorElementsVT() const {
396 MVT EltVT = getVectorElementType();
397 auto EltCnt = getVectorElementCount();
398      assert(!(EltCnt.Min & 1) && "Splitting vector, but not in half!");
399 return getVectorVT(EltVT, EltCnt / 2);
400 }
401
402 /// Returns true if the given vector is a power of 2.
403 bool isPow2VectorType() const {
404 unsigned NElts = getVectorNumElements();
405 return !(NElts & (NElts - 1));
406 }
407
408 /// Widens the length of the given vector MVT up to the nearest power of 2
409 /// and returns that type.
410 MVT getPow2VectorType() const {
411 if (isPow2VectorType())
412 return *this;
413
414 unsigned NElts = getVectorNumElements();
415 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
416 return MVT::getVectorVT(getVectorElementType(), Pow2NElts);
417 }
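A short worked example of the shape helpers above (half-splitting and power-of-two widening), assuming this header from the same tree:

#include <cassert>
#include "llvm/Support/MachineValueType.h"

static void vectorShapeExamples() {
  // v8f32 has 8 elements; half of it is v4f32.
  assert(llvm::MVT(llvm::MVT::v8f32).getHalfNumVectorElementsVT() ==
         llvm::MVT::v4f32);

  // v3i32: 3 & (3 - 1) == 2, so it is not a power of two;
  // Log2_32_Ceil(3) == 2 and 1 << 2 == 4, so it widens to v4i32.
  llvm::MVT VT = llvm::MVT::v3i32;
  assert(!VT.isPow2VectorType());
  assert(VT.getPow2VectorType() == llvm::MVT::v4i32);
}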
418
419 /// If this is a vector, return the element type, otherwise return this.
420 MVT getScalarType() const {
421 return isVector() ? getVectorElementType() : *this;
422 }
423
424 MVT getVectorElementType() const {
425 switch (SimpleTy) {
426 default:
427        llvm_unreachable("Not a vector MVT!");
428 case v1i1:
429 case v2i1:
430 case v4i1:
431 case v8i1:
432 case v16i1:
433 case v32i1:
434 case v64i1:
435 case v128i1:
436 case v256i1:
437 case v512i1:
438 case v1024i1:
439 case nxv1i1:
440 case nxv2i1:
441 case nxv4i1:
442 case nxv8i1:
443 case nxv16i1:
444 case nxv32i1: return i1;
445 case v1i8:
446 case v2i8:
447 case v4i8:
448 case v8i8:
449 case v16i8:
450 case v32i8:
451 case v64i8:
452 case v128i8:
453 case v256i8:
454 case nxv1i8:
455 case nxv2i8:
456 case nxv4i8:
457 case nxv8i8:
458 case nxv16i8:
459 case nxv32i8: return i8;
460 case v1i16:
461 case v2i16:
462 case v3i16:
463 case v4i16:
464 case v8i16:
465 case v16i16:
466 case v32i16:
467 case v64i16:
468 case v128i16:
469 case nxv1i16:
470 case nxv2i16:
471 case nxv4i16:
472 case nxv8i16:
473 case nxv16i16:
474 case nxv32i16: return i16;
475 case v1i32:
476 case v2i32:
477 case v3i32:
478 case v4i32:
479 case v5i32:
480 case v8i32:
481 case v16i32:
482 case v32i32:
483 case v64i32:
484 case v128i32:
485 case v256i32:
486 case v512i32:
487 case v1024i32:
488 case v2048i32:
489 case nxv1i32:
490 case nxv2i32:
491 case nxv4i32:
492 case nxv8i32:
493 case nxv16i32:
494 case nxv32i32: return i32;
495 case v1i64:
496 case v2i64:
497 case v4i64:
498 case v8i64:
499 case v16i64:
500 case v32i64:
501 case nxv1i64:
502 case nxv2i64:
503 case nxv4i64:
504 case nxv8i64:
505 case nxv16i64:
506 case nxv32i64: return i64;
507 case v1i128: return i128;
508 case v2f16:
509 case v3f16:
510 case v4f16:
511 case v8f16:
512 case v16f16:
513 case v32f16:
514 case nxv2f16:
515 case nxv4f16:
516 case nxv8f16: return f16;
517 case v1f32:
518 case v2f32:
519 case v3f32:
520 case v4f32:
521 case v5f32:
522 case v8f32:
523 case v16f32:
524 case v32f32:
525 case v64f32:
526 case v128f32:
527 case v256f32:
528 case v512f32:
529 case v1024f32:
530 case v2048f32:
531 case nxv1f32:
532 case nxv2f32:
533 case nxv4f32:
534 case nxv8f32:
535 case nxv16f32: return f32;
536 case v1f64:
537 case v2f64:
538 case v4f64:
539 case v8f64:
540 case nxv1f64:
541 case nxv2f64:
542 case nxv4f64:
543 case nxv8f64: return f64;
544 }
545 }
546
547 unsigned getVectorNumElements() const {
548 switch (SimpleTy) {
549 default:
550        llvm_unreachable("Not a vector MVT!");
551 case v2048i32:
552 case v2048f32: return 2048;
553 case v1024i1:
554 case v1024i32:
555 case v1024f32: return 1024;
556 case v512i1:
557 case v512i32:
558 case v512f32: return 512;
559 case v256i1:
560 case v256i8:
561 case v256i32:
562 case v256f32: return 256;
563 case v128i1:
564 case v128i8:
565 case v128i16:
566 case v128i32:
567 case v128f32: return 128;
568 case v64i1:
569 case v64i8:
570 case v64i16:
571 case v64i32:
572 case v64f32: return 64;
573 case v32i1:
574 case v32i8:
575 case v32i16:
576 case v32i32:
577 case v32i64:
578 case v32f16:
579 case v32f32:
580 case nxv32i1:
581 case nxv32i8:
582 case nxv32i16:
583 case nxv32i32:
584 case nxv32i64: return 32;
585 case v16i1:
586 case v16i8:
587 case v16i16:
588 case v16i32:
589 case v16i64:
590 case v16f16:
591 case v16f32:
592 case nxv16i1:
593 case nxv16i8:
594 case nxv16i16:
595 case nxv16i32:
596 case nxv16i64:
597 case nxv16f32: return 16;
598 case v8i1:
599 case v8i8:
600 case v8i16:
601 case v8i32:
602 case v8i64:
603 case v8f16:
604 case v8f32:
605 case v8f64:
606 case nxv8i1:
607 case nxv8i8:
608 case nxv8i16:
609 case nxv8i32:
610 case nxv8i64:
611 case nxv8f16:
612 case nxv8f32:
613 case nxv8f64: return 8;
614 case v5i32:
615 case v5f32: return 5;
616 case v4i1:
617 case v4i8:
618 case v4i16:
619 case v4i32:
620 case v4i64:
621 case v4f16:
622 case v4f32:
623 case v4f64:
624 case nxv4i1:
625 case nxv4i8:
626 case nxv4i16:
627 case nxv4i32:
628 case nxv4i64:
629 case nxv4f16:
630 case nxv4f32:
631 case nxv4f64: return 4;
632 case v3i16:
633 case v3i32:
634 case v3f16:
635 case v3f32: return 3;
636 case v2i1:
637 case v2i8:
638 case v2i16:
639 case v2i32:
640 case v2i64:
641 case v2f16:
642 case v2f32:
643 case v2f64:
644 case nxv2i1:
645 case nxv2i8:
646 case nxv2i16:
647 case nxv2i32:
648 case nxv2i64:
649 case nxv2f16:
650 case nxv2f32:
651 case nxv2f64: return 2;
652 case v1i1:
653 case v1i8:
654 case v1i16:
655 case v1i32:
656 case v1i64:
657 case v1i128:
658 case v1f32:
659 case v1f64:
660 case nxv1i1:
661 case nxv1i8:
662 case nxv1i16:
663 case nxv1i32:
664 case nxv1i64:
665 case nxv1f32:
666 case nxv1f64: return 1;
667 }
668 }
669
670 ElementCount getVectorElementCount() const {
671 return { getVectorNumElements(), isScalableVector() };
672 }
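ElementCount pairs the minimum element count with the scalable flag; a tiny sketch of the distinction (expected field values shown in comments):

// Sketch: fixed-width vs. scalable element counts.
static void elementCountExample() {
  llvm::ElementCount FixedEC =
      llvm::MVT(llvm::MVT::v4i32).getVectorElementCount();    // Min == 4, Scalable == false
  llvm::ElementCount ScalableEC =
      llvm::MVT(llvm::MVT::nxv4i32).getVectorElementCount();  // Min == 4, Scalable == true
  (void)FixedEC; (void)ScalableEC;
}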
673
674 /// Returns the size of the specified MVT in bits.
675 ///
676 /// If the value type is a scalable vector type, the scalable property will
677 /// be set and the runtime size will be a positive integer multiple of the
678 /// base size.
679 TypeSize getSizeInBits() const {
680 switch (SimpleTy) {
681 default:
682      llvm_unreachable("getSizeInBits called on extended MVT.");
683    case Other:
684      llvm_unreachable("Value type is non-standard value, Other.");
685    case iPTR:
686      llvm_unreachable("Value type size is target-dependent. Ask TLI.");
687    case iPTRAny:
688    case iAny:
689    case fAny:
690    case vAny:
691    case Any:
692      llvm_unreachable("Value type is overloaded.");
693    case token:
694      llvm_unreachable("Token type is a sentinel that cannot be used "
695                       "in codegen and has no size");
696    case Metadata:
697      llvm_unreachable("Value type is metadata.");
698 case i1:
699 case v1i1: return TypeSize::Fixed(1);
700 case nxv1i1: return TypeSize::Scalable(1);
701 case v2i1: return TypeSize::Fixed(2);
702 case nxv2i1: return TypeSize::Scalable(2);
703 case v4i1: return TypeSize::Fixed(4);
704 case nxv4i1: return TypeSize::Scalable(4);
705 case i8 :
706 case v1i8:
707 case v8i1: return TypeSize::Fixed(8);
708 case nxv1i8:
709 case nxv8i1: return TypeSize::Scalable(8);
710 case i16 :
711 case f16:
712 case v16i1:
713 case v2i8:
714 case v1i16: return TypeSize::Fixed(16);
715 case nxv16i1:
716 case nxv2i8:
717 case nxv1i16: return TypeSize::Scalable(16);
718 case f32 :
719 case i32 :
720 case v32i1:
721 case v4i8:
722 case v2i16:
723 case v2f16:
724 case v1f32:
725 case v1i32: return TypeSize::Fixed(32);
726 case nxv32i1:
727 case nxv4i8:
728 case nxv2i16:
729 case nxv1i32:
730 case nxv2f16:
731 case nxv1f32: return TypeSize::Scalable(32);
732 case v3i16:
733 case v3f16: return TypeSize::Fixed(48);
734 case x86mmx:
735 case f64 :
736 case i64 :
737 case v64i1:
738 case v8i8:
739 case v4i16:
740 case v2i32:
741 case v1i64:
742 case v4f16:
743 case v2f32:
744 case v1f64: return TypeSize::Fixed(64);
745 case nxv8i8:
746 case nxv4i16:
747 case nxv2i32:
748 case nxv1i64:
749 case nxv4f16:
750 case nxv2f32:
751 case nxv1f64: return TypeSize::Scalable(64);
752 case f80 : return TypeSize::Fixed(80);
753 case v3i32:
754 case v3f32: return TypeSize::Fixed(96);
755 case f128:
756 case ppcf128:
757 case i128:
758 case v128i1:
759 case v16i8:
760 case v8i16:
761 case v4i32:
762 case v2i64:
763 case v1i128:
764 case v8f16:
765 case v4f32:
766 case v2f64: return TypeSize::Fixed(128);
767 case nxv16i8:
768 case nxv8i16:
769 case nxv4i32:
770 case nxv2i64:
771 case nxv8f16:
772 case nxv4f32:
773 case nxv2f64: return TypeSize::Scalable(128);
774 case v5i32:
775 case v5f32: return TypeSize::Fixed(160);
776 case v256i1:
777 case v32i8:
778 case v16i16:
779 case v8i32:
780 case v4i64:
781 case v16f16:
782 case v8f32:
783 case v4f64: return TypeSize::Fixed(256);
784 case nxv32i8:
785 case nxv16i16:
786 case nxv8i32:
787 case nxv4i64:
788 case nxv8f32:
789 case nxv4f64: return TypeSize::Scalable(256);
790 case v512i1:
791 case v64i8:
792 case v32i16:
793 case v16i32:
794 case v8i64:
795 case v32f16:
796 case v16f32:
797 case v8f64: return TypeSize::Fixed(512);
798 case nxv32i16:
799 case nxv16i32:
800 case nxv8i64:
801 case nxv16f32:
802 case nxv8f64: return TypeSize::Scalable(512);
803 case v1024i1:
804 case v128i8:
805 case v64i16:
806 case v32i32:
807 case v16i64:
808 case v32f32: return TypeSize::Fixed(1024);
809 case nxv32i32:
810 case nxv16i64: return TypeSize::Scalable(1024);
811 case v256i8:
812 case v128i16:
813 case v64i32:
814 case v32i64:
815 case v64f32: return TypeSize::Fixed(2048);
816 case nxv32i64: return TypeSize::Scalable(2048);
817 case v128i32:
818 case v128f32: return TypeSize::Fixed(4096);
819 case v256i32:
820 case v256f32: return TypeSize::Fixed(8192);
821 case v512i32:
822 case v512f32: return TypeSize::Fixed(16384);
823 case v1024i32:
824 case v1024f32: return TypeSize::Fixed(32768);
825 case v2048i32:
826 case v2048f32: return TypeSize::Fixed(65536);
827 case exnref: return TypeSize::Fixed(0); // opaque type
828 }
829 }
830
831 TypeSize getScalarSizeInBits() const {
832 return getScalarType().getSizeInBits();
833 }
834
835 /// Return the number of bytes overwritten by a store of the specified value
836 /// type.
837 ///
838 /// If the value type is a scalable vector type, the scalable property will
839 /// be set and the runtime size will be a positive integer multiple of the
840 /// base size.
841 TypeSize getStoreSize() const {
842 TypeSize BaseSize = getSizeInBits();
843 return {(BaseSize.getKnownMinSize() + 7) / 8, BaseSize.isScalable()};
844 }
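The store size rounds the bit size up to whole bytes, (bits + 7) / 8, keeping the scalable flag. A few worked values (sketch):

#include <cassert>
#include "llvm/Support/MachineValueType.h"

static void storeSizeExamples() {
  assert(llvm::MVT(llvm::MVT::i1).getStoreSize().getKnownMinSize() == 1);     //  1 bit  ->  1 byte
  assert(llvm::MVT(llvm::MVT::f80).getStoreSize().getKnownMinSize() == 10);   // 80 bits -> 10 bytes
  assert(llvm::MVT(llvm::MVT::v3i32).getStoreSize().getKnownMinSize() == 12); // 96 bits -> 12 bytes
}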
845
846 /// Return the number of bits overwritten by a store of the specified value
847 /// type.
848 ///
849 /// If the value type is a scalable vector type, the scalable property will
850 /// be set and the runtime size will be a positive integer multiple of the
851 /// base size.
852 TypeSize getStoreSizeInBits() const {
853 return getStoreSize() * 8;
854 }
855
856 /// Returns true if the number of bits for the type is a multiple of an
857 /// 8-bit byte.
858 bool isByteSized() const {
859 return getSizeInBits().isByteSized();
860 }
861
862 /// Return true if this has more bits than VT.
863 bool bitsGT(MVT VT) const {
864 return getSizeInBits() > VT.getSizeInBits();
865 }
866
867 /// Return true if this has no less bits than VT.
868 bool bitsGE(MVT VT) const {
869 return getSizeInBits() >= VT.getSizeInBits();
870 }
871
872 /// Return true if this has less bits than VT.
873 bool bitsLT(MVT VT) const {
874 return getSizeInBits() < VT.getSizeInBits();
875 }
876
877 /// Return true if this has no more bits than VT.
878 bool bitsLE(MVT VT) const {
879 return getSizeInBits() <= VT.getSizeInBits();
880 }
881
882 static MVT getFloatingPointVT(unsigned BitWidth) {
883 switch (BitWidth) {
884 default:
885        llvm_unreachable("Bad bit width!");
886 case 16:
887 return MVT::f16;
888 case 32:
889 return MVT::f32;
890 case 64:
891 return MVT::f64;
892 case 80:
893 return MVT::f80;
894 case 128:
895 return MVT::f128;
896 }
897 }
898
899 static MVT getIntegerVT(unsigned BitWidth) {
900 switch (BitWidth) {
901 default:
902 return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE);
903 case 1:
904 return MVT::i1;
905 case 8:
906 return MVT::i8;
907 case 16:
908 return MVT::i16;
909 case 32:
910 return MVT::i32;
911 case 64:
912 return MVT::i64;
913 case 128:
914 return MVT::i128;
915 }
916 }
917
918 static MVT getVectorVT(MVT VT, unsigned NumElements) {
919 switch (VT.SimpleTy) {
920 default:
921 break;
922 case MVT::i1:
923 if (NumElements == 1) return MVT::v1i1;
924 if (NumElements == 2) return MVT::v2i1;
925 if (NumElements == 4) return MVT::v4i1;
926 if (NumElements == 8) return MVT::v8i1;
927 if (NumElements == 16) return MVT::v16i1;
928 if (NumElements == 32) return MVT::v32i1;
929 if (NumElements == 64) return MVT::v64i1;
930 if (NumElements == 128) return MVT::v128i1;
931 if (NumElements == 256) return MVT::v256i1;
932 if (NumElements == 512) return MVT::v512i1;
933 if (NumElements == 1024) return MVT::v1024i1;
934 break;
935 case MVT::i8:
936 if (NumElements == 1) return MVT::v1i8;
937 if (NumElements == 2) return MVT::v2i8;
938 if (NumElements == 4) return MVT::v4i8;
939 if (NumElements == 8) return MVT::v8i8;
940 if (NumElements == 16) return MVT::v16i8;
941 if (NumElements == 32) return MVT::v32i8;
942 if (NumElements == 64) return MVT::v64i8;
943 if (NumElements == 128) return MVT::v128i8;
944 if (NumElements == 256) return MVT::v256i8;
945 break;
946 case MVT::i16:
947 if (NumElements == 1) return MVT::v1i16;
948 if (NumElements == 2) return MVT::v2i16;
949 if (NumElements == 3) return MVT::v3i16;
950 if (NumElements == 4) return MVT::v4i16;
951 if (NumElements == 8) return MVT::v8i16;
952 if (NumElements == 16) return MVT::v16i16;
953 if (NumElements == 32) return MVT::v32i16;
954 if (NumElements == 64) return MVT::v64i16;
955 if (NumElements == 128) return MVT::v128i16;
956 break;
957 case MVT::i32:
958 if (NumElements == 1) return MVT::v1i32;
959 if (NumElements == 2) return MVT::v2i32;
960 if (NumElements == 3) return MVT::v3i32;
961 if (NumElements == 4) return MVT::v4i32;
962 if (NumElements == 5) return MVT::v5i32;
963 if (NumElements == 8) return MVT::v8i32;
964 if (NumElements == 16) return MVT::v16i32;
965 if (NumElements == 32) return MVT::v32i32;
966 if (NumElements == 64) return MVT::v64i32;
967 if (NumElements == 128) return MVT::v128i32;
968 if (NumElements == 256) return MVT::v256i32;
969 if (NumElements == 512) return MVT::v512i32;
970 if (NumElements == 1024) return MVT::v1024i32;
971 if (NumElements == 2048) return MVT::v2048i32;
972 break;
973 case MVT::i64:
974 if (NumElements == 1) return MVT::v1i64;
975 if (NumElements == 2) return MVT::v2i64;
976 if (NumElements == 4) return MVT::v4i64;
977 if (NumElements == 8) return MVT::v8i64;
978 if (NumElements == 16) return MVT::v16i64;
979 if (NumElements == 32) return MVT::v32i64;
980 break;
981 case MVT::i128:
982 if (NumElements == 1) return MVT::v1i128;
983 break;
984 case MVT::f16:
985 if (NumElements == 2) return MVT::v2f16;
986 if (NumElements == 3) return MVT::v3f16;
987 if (NumElements == 4) return MVT::v4f16;
988 if (NumElements == 8) return MVT::v8f16;
989 if (NumElements == 16) return MVT::v16f16;
990 if (NumElements == 32) return MVT::v32f16;
991 break;
992 case MVT::f32:
993 if (NumElements == 1) return MVT::v1f32;
994 if (NumElements == 2) return MVT::v2f32;
995 if (NumElements == 3) return MVT::v3f32;
996 if (NumElements == 4) return MVT::v4f32;
997 if (NumElements == 5) return MVT::v5f32;
998 if (NumElements == 8) return MVT::v8f32;
999 if (NumElements == 16) return MVT::v16f32;
1000 if (NumElements == 32) return MVT::v32f32;
1001 if (NumElements == 64) return MVT::v64f32;
1002 if (NumElements == 128) return MVT::v128f32;
1003 if (NumElements == 256) return MVT::v256f32;
1004 if (NumElements == 512) return MVT::v512f32;
1005 if (NumElements == 1024) return MVT::v1024f32;
1006 if (NumElements == 2048) return MVT::v2048f32;
1007 break;
1008 case MVT::f64:
1009 if (NumElements == 1) return MVT::v1f64;
1010 if (NumElements == 2) return MVT::v2f64;
1011 if (NumElements == 4) return MVT::v4f64;
1012 if (NumElements == 8) return MVT::v8f64;
1013 break;
1014 }
1015 return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE);
1016 }
1017
1018 static MVT getScalableVectorVT(MVT VT, unsigned NumElements) {
1019 switch(VT.SimpleTy) {
1020 default:
1021 break;
1022 case MVT::i1:
1023 if (NumElements == 1) return MVT::nxv1i1;
1024 if (NumElements == 2) return MVT::nxv2i1;
1025 if (NumElements == 4) return MVT::nxv4i1;
1026 if (NumElements == 8) return MVT::nxv8i1;
1027 if (NumElements == 16) return MVT::nxv16i1;
1028 if (NumElements == 32) return MVT::nxv32i1;
1029 break;
1030 case MVT::i8:
1031 if (NumElements == 1) return MVT::nxv1i8;
1032 if (NumElements == 2) return MVT::nxv2i8;
1033 if (NumElements == 4) return MVT::nxv4i8;
1034 if (NumElements == 8) return MVT::nxv8i8;
1035 if (NumElements == 16) return MVT::nxv16i8;
1036 if (NumElements == 32) return MVT::nxv32i8;
1037 break;
1038 case MVT::i16:
1039 if (NumElements == 1) return MVT::nxv1i16;
1040 if (NumElements == 2) return MVT::nxv2i16;
1041 if (NumElements == 4) return MVT::nxv4i16;
1042 if (NumElements == 8) return MVT::nxv8i16;
1043 if (NumElements == 16) return MVT::nxv16i16;
1044 if (NumElements == 32) return MVT::nxv32i16;
1045 break;
1046 case MVT::i32:
1047 if (NumElements == 1) return MVT::nxv1i32;
1048 if (NumElements == 2) return MVT::nxv2i32;
1049 if (NumElements == 4) return MVT::nxv4i32;
1050 if (NumElements == 8) return MVT::nxv8i32;
1051 if (NumElements == 16) return MVT::nxv16i32;
1052 if (NumElements == 32) return MVT::nxv32i32;
1053 break;
1054 case MVT::i64:
1055 if (NumElements == 1) return MVT::nxv1i64;
1056 if (NumElements == 2) return MVT::nxv2i64;
1057 if (NumElements == 4) return MVT::nxv4i64;
1058 if (NumElements == 8) return MVT::nxv8i64;
1059 if (NumElements == 16) return MVT::nxv16i64;
1060 if (NumElements == 32) return MVT::nxv32i64;
1061 break;
1062 case MVT::f16:
1063 if (NumElements == 2) return MVT::nxv2f16;
1064 if (NumElements == 4) return MVT::nxv4f16;
1065 if (NumElements == 8) return MVT::nxv8f16;
1066 break;
1067 case MVT::f32:
1068 if (NumElements == 1) return MVT::nxv1f32;
1069 if (NumElements == 2) return MVT::nxv2f32;
1070 if (NumElements == 4) return MVT::nxv4f32;
1071 if (NumElements == 8) return MVT::nxv8f32;
1072 if (NumElements == 16) return MVT::nxv16f32;
1073 break;
1074 case MVT::f64:
1075 if (NumElements == 1) return MVT::nxv1f64;
1076 if (NumElements == 2) return MVT::nxv2f64;
1077 if (NumElements == 4) return MVT::nxv4f64;
1078 if (NumElements == 8) return MVT::nxv8f64;
1079 break;
1080 }
1081 return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE);
1082 }
1083
1084 static MVT getVectorVT(MVT VT, unsigned NumElements, bool IsScalable) {
1085 if (IsScalable)
1086 return getScalableVectorVT(VT, NumElements);
1087 return getVectorVT(VT, NumElements);
1088 }
1089
1090 static MVT getVectorVT(MVT VT, ElementCount EC) {
1091 if (EC.Scalable)
1092 return getScalableVectorVT(VT, EC.Min);
1093 return getVectorVT(VT, EC.Min);
1094 }
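A quick sketch of the three factory overloads above:

// Sketch: building vector MVTs from an element type and a count.
static void makeVectorVTs() {
  llvm::MVT V1 = llvm::MVT::getVectorVT(llvm::MVT::f32, 4);            // v4f32
  llvm::MVT V2 = llvm::MVT::getVectorVT(llvm::MVT::i32, 8,
                                        /*IsScalable=*/true);          // nxv8i32
  llvm::MVT V3 = llvm::MVT::getVectorVT(llvm::MVT::i64,
                                        llvm::ElementCount(2, false)); // v2i64
  (void)V1; (void)V2; (void)V3;
}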
1095
1096 /// Return the value type corresponding to the specified type. This returns
1097 /// all pointers as iPTR. If HandleUnknown is true, unknown types are
1098 /// returned as Other, otherwise they are invalid.
1099 static MVT getVT(Type *Ty, bool HandleUnknown = false);
1100
1101 private:
1102 /// A simple iterator over the MVT::SimpleValueType enum.
1103 struct mvt_iterator {
1104 SimpleValueType VT;
1105
1106 mvt_iterator(SimpleValueType VT) : VT(VT) {}
1107
1108 MVT operator*() const { return VT; }
1109 bool operator!=(const mvt_iterator &LHS) const { return VT != LHS.VT; }
1110
1111 mvt_iterator& operator++() {
1112 VT = (MVT::SimpleValueType)((int)VT + 1);
1113        assert((int)VT <= MVT::MAX_ALLOWED_VALUETYPE &&
1114               "MVT iterator overflowed.");
1115 return *this;
1116 }
1117 };
1118
1119 /// A range of the MVT::SimpleValueType enum.
1120 using mvt_range = iterator_range<mvt_iterator>;
1121
1122 public:
1123 /// SimpleValueType Iteration
1124 /// @{
1125 static mvt_range all_valuetypes() {
1126 return mvt_range(MVT::FIRST_VALUETYPE, MVT::LAST_VALUETYPE);
1127 }
1128
1129 static mvt_range integer_valuetypes() {
1130 return mvt_range(MVT::FIRST_INTEGER_VALUETYPE,
1131 (MVT::SimpleValueType)(MVT::LAST_INTEGER_VALUETYPE + 1));
1132 }
1133
1134 static mvt_range fp_valuetypes() {
1135 return mvt_range(MVT::FIRST_FP_VALUETYPE,
1136 (MVT::SimpleValueType)(MVT::LAST_FP_VALUETYPE + 1));
1137 }
1138
1139 static mvt_range vector_valuetypes() {
1140 return mvt_range(MVT::FIRST_VECTOR_VALUETYPE,
1141 (MVT::SimpleValueType)(MVT::LAST_VECTOR_VALUETYPE + 1));
1142 }
1143
1144 static mvt_range fixedlen_vector_valuetypes() {
1145 return mvt_range(
1146 MVT::FIRST_FIXEDLEN_VECTOR_VALUETYPE,
1147 (MVT::SimpleValueType)(MVT::LAST_FIXEDLEN_VECTOR_VALUETYPE + 1));
1148 }
1149
1150 static mvt_range scalable_vector_valuetypes() {
1151 return mvt_range(
1152 MVT::FIRST_SCALABLE_VECTOR_VALUETYPE,
1153 (MVT::SimpleValueType)(MVT::LAST_SCALABLE_VECTOR_VALUETYPE + 1));
1154 }
1155
1156 static mvt_range integer_fixedlen_vector_valuetypes() {
1157 return mvt_range(
1158 MVT::FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE,
1159 (MVT::SimpleValueType)(MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE + 1));
1160 }
1161
1162 static mvt_range fp_fixedlen_vector_valuetypes() {
1163 return mvt_range(
1164 MVT::FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE,
1165 (MVT::SimpleValueType)(MVT::LAST_FP_FIXEDLEN_VECTOR_VALUETYPE + 1));
1166 }
1167
1168 static mvt_range integer_scalable_vector_valuetypes() {
1169 return mvt_range(
1170 MVT::FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE,
1171 (MVT::SimpleValueType)(MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE + 1));
1172 }
1173
1174 static mvt_range fp_scalable_vector_valuetypes() {
1175 return mvt_range(
1176 MVT::FIRST_FP_SCALABLE_VECTOR_VALUETYPE,
1177 (MVT::SimpleValueType)(MVT::LAST_FP_SCALABLE_VECTOR_VALUETYPE + 1));
1178 }
1179 /// @}
1180 };
1181
1182} // end namespace llvm
1183
1184#endif // LLVM_CODEGEN_MACHINEVALUETYPE_H
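
A minimal sketch of how the getVectorVT overloads above dispatch, assuming only the MVT interface shown in this header: the IsScalable flag routes the request either to getScalableVectorVT or to the fixed-width table.

#include "llvm/Support/MachineValueType.h"
using namespace llvm;

// Sketch only: (i32, 4, IsScalable=false) resolves to MVT::v4i32 through the
// fixed-width table, while (i32, 4, IsScalable=true) goes through
// getScalableVectorVT and yields MVT::nxv4i32.
static MVT pickI32x4(bool IsScalable) {
  return MVT::getVectorVT(MVT::i32, 4, IsScalable);
}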

/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/include/llvm/CodeGen/ValueTypes.h

1//===- CodeGen/ValueTypes.h - Low-Level Target independ. types --*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the set of low-level target independent types which various
10// values in the code generator are. This allows the target specific behavior
11// of instructions to be described to target independent passes.
12//
13//===----------------------------------------------------------------------===//
14
15#ifndef LLVM_CODEGEN_VALUETYPES_H
16#define LLVM_CODEGEN_VALUETYPES_H
17
18#include "llvm/Support/Compiler.h"
19#include "llvm/Support/MachineValueType.h"
20#include "llvm/Support/MathExtras.h"
21#include "llvm/Support/TypeSize.h"
22#include <cassert>
23#include <cstdint>
24#include <string>
25
26namespace llvm {
27
28 class LLVMContext;
29 class Type;
30
31 /// Extended Value Type. Capable of holding value types which are not native
32 /// for any processor (such as the i12345 type), as well as the types an MVT
33 /// can represent.
34 struct EVT {
35 private:
36 MVT V = MVT::INVALID_SIMPLE_VALUE_TYPE;
37 Type *LLVMTy = nullptr;
38
39 public:
40 constexpr EVT() = default;
41 constexpr EVT(MVT::SimpleValueType SVT) : V(SVT) {}
42 constexpr EVT(MVT S) : V(S) {}
43
44 bool operator==(EVT VT) const {
45 return !(*this != VT);
46 }
47 bool operator!=(EVT VT) const {
48 if (V.SimpleTy != VT.V.SimpleTy)
49 return true;
50 if (V.SimpleTy == MVT::INVALID_SIMPLE_VALUE_TYPE)
51 return LLVMTy != VT.LLVMTy;
52 return false;
53 }
54
55 /// Returns the EVT that represents a floating-point type with the given
56 /// number of bits. There are two floating-point types with 128 bits - this
57 /// returns f128 rather than ppcf128.
58 static EVT getFloatingPointVT(unsigned BitWidth) {
59 return MVT::getFloatingPointVT(BitWidth);
60 }
61
62 /// Returns the EVT that represents an integer with the given number of
63 /// bits.
64 static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth) {
65 MVT M = MVT::getIntegerVT(BitWidth);
66 if (M.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE)
67 return M;
68 return getExtendedIntegerVT(Context, BitWidth);
69 }
70
71 /// Returns the EVT that represents a vector NumElements in length, where
72 /// each element is of type VT.
73 static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements,
74 bool IsScalable = false) {
75 MVT M = MVT::getVectorVT(VT.V, NumElements, IsScalable);
76 if (M.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE)
77 return M;
78
79 assert(!IsScalable && "We don't support extended scalable types yet");
80 return getExtendedVectorVT(Context, VT, NumElements);
81 }
82
83 /// Returns the EVT that represents a vector EC.Min elements in length,
84 /// where each element is of type VT.
85 static EVT getVectorVT(LLVMContext &Context, EVT VT, ElementCount EC) {
86 MVT M = MVT::getVectorVT(VT.V, EC);
87 if (M.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE)
88 return M;
89 assert(!EC.Scalable && "We don't support extended scalable types yet");
90 return getExtendedVectorVT(Context, VT, EC.Min);
91 }
92
93 /// Return a vector with the same number of elements as this vector, but
94 /// with the element type converted to an integer type with the same
95 /// bitwidth.
96 EVT changeVectorElementTypeToInteger() const {
97 if (!isSimple()) {
98 assert(!isScalableVector() &&
99 "We don't support extended scalable types yet");
100 return changeExtendedVectorElementTypeToInteger();
101 }
102 MVT EltTy = getSimpleVT().getVectorElementType();
103 unsigned BitWidth = EltTy.getSizeInBits();
104 MVT IntTy = MVT::getIntegerVT(BitWidth);
105 MVT VecTy = MVT::getVectorVT(IntTy, getVectorNumElements(),
106 isScalableVector());
107 assert(VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE &&
108 "Simple vector VT not representable by simple integer vector VT!");
109 return VecTy;
110 }
111
112 /// Return the type converted to an equivalently sized integer or vector
113 /// with integer element type. Similar to changeVectorElementTypeToInteger,
114 /// but also handles scalars.
115 EVT changeTypeToInteger() {
116 if (isVector())
117 return changeVectorElementTypeToInteger();
118
119 if (isSimple())
120 return MVT::getIntegerVT(getSizeInBits());
121
122 return changeExtendedTypeToInteger();
123 }
124
125 /// Test if the given EVT is simple (as opposed to being extended).
126 bool isSimple() const {
127 return V.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE;
15.1
Field 'SimpleTy' is not equal to INVALID_SIMPLE_VALUE_TYPE
16
Returning the value 1, which participates in a condition later
128 }
129
130 /// Test if the given EVT is extended (as opposed to being simple).
131 bool isExtended() const {
132 return !isSimple();
133 }
134
135 /// Return true if this is a FP or a vector FP type.
136 bool isFloatingPoint() const {
137 return isSimple() ? V.isFloatingPoint() : isExtendedFloatingPoint();
138 }
139
140 /// Return true if this is an integer or a vector integer type.
141 bool isInteger() const {
142 return isSimple() ? V.isInteger() : isExtendedInteger();
143 }
144
145 /// Return true if this is an integer, but not a vector.
146 bool isScalarInteger() const {
147 return isSimple() ? V.isScalarInteger() : isExtendedScalarInteger();
148 }
149
150 /// Return true if this is a vector value type.
151 bool isVector() const {
152 return isSimple() ? V.isVector() : isExtendedVector();
19
'?' condition is true
20
Calling 'MVT::isVector'
22
Returning from 'MVT::isVector'
23
Returning the value 1, which participates in a condition later
153 }
154
155 /// Return true if this is a vector type where the runtime
156 /// length is machine dependent
157 bool isScalableVector() const {
158 // FIXME: We don't support extended scalable types yet, because the
159 // matching IR type doesn't exist. Once it has been added, this can
160 // be changed to call isExtendedScalableVector.
161 if (!isSimple())
162 return false;
163 return V.isScalableVector();
164 }
165
166 /// Return true if this is a 16-bit vector type.
167 bool is16BitVector() const {
168 return isSimple() ? V.is16BitVector() : isExtended16BitVector();
169 }
170
171 /// Return true if this is a 32-bit vector type.
172 bool is32BitVector() const {
173 return isSimple() ? V.is32BitVector() : isExtended32BitVector();
174 }
175
176 /// Return true if this is a 64-bit vector type.
177 bool is64BitVector() const {
178 return isSimple() ? V.is64BitVector() : isExtended64BitVector();
179 }
180
181 /// Return true if this is a 128-bit vector type.
182 bool is128BitVector() const {
183 return isSimple() ? V.is128BitVector() : isExtended128BitVector();
184 }
185
186 /// Return true if this is a 256-bit vector type.
187 bool is256BitVector() const {
188 return isSimple() ? V.is256BitVector() : isExtended256BitVector();
189 }
190
191 /// Return true if this is a 512-bit vector type.
192 bool is512BitVector() const {
193 return isSimple() ? V.is512BitVector() : isExtended512BitVector();
194 }
195
196 /// Return true if this is a 1024-bit vector type.
197 bool is1024BitVector() const {
198 return isSimple() ? V.is1024BitVector() : isExtended1024BitVector();
199 }
200
201 /// Return true if this is a 2048-bit vector type.
202 bool is2048BitVector() const {
203 return isSimple() ? V.is2048BitVector() : isExtended2048BitVector();
204 }
205
206 /// Return true if this is an overloaded type for TableGen.
207 bool isOverloaded() const {
208 return (V==MVT::iAny || V==MVT::fAny || V==MVT::vAny || V==MVT::iPTRAny);
209 }
210
211 /// Return true if the bit size is a multiple of 8.
212 bool isByteSized() const {
213 return getSizeInBits().isByteSized();
214 }
215
216 /// Return true if the size is a power-of-two number of bytes.
217 bool isRound() const {
218 if (isScalableVector())
219 return false;
220 unsigned BitSize = getSizeInBits();
221 return BitSize >= 8 && !(BitSize & (BitSize - 1));
222 }
223
224 /// Return true if this has the same number of bits as VT.
225 bool bitsEq(EVT VT) const {
226 if (EVT::operator==(VT)) return true;
227 return getSizeInBits() == VT.getSizeInBits();
228 }
229
230 /// Return true if this has more bits than VT.
231 bool bitsGT(EVT VT) const {
232 if (EVT::operator==(VT)) return false;
233 return getSizeInBits() > VT.getSizeInBits();
234 }
235
236 /// Return true if this has no less bits than VT.
237 bool bitsGE(EVT VT) const {
238 if (EVT::operator==(VT)) return true;
239 return getSizeInBits() >= VT.getSizeInBits();
240 }
241
242 /// Return true if this has less bits than VT.
243 bool bitsLT(EVT VT) const {
244 if (EVT::operator==(VT)) return false;
245 return getSizeInBits() < VT.getSizeInBits();
246 }
247
248 /// Return true if this has no more bits than VT.
249 bool bitsLE(EVT VT) const {
250 if (EVT::operator==(VT)) return true;
251 return getSizeInBits() <= VT.getSizeInBits();
252 }
253
254 /// Return the SimpleValueType held in the specified simple EVT.
255 MVT getSimpleVT() const {
256 assert(isSimple() && "Expected a SimpleValueType!");
257 return V;
258 }
259
260 /// If this is a vector type, return the element type, otherwise return
261 /// this.
262 EVT getScalarType() const {
263 return isVector() ? getVectorElementType() : *this;
264 }
265
266 /// Given a vector type, return the type of each element.
267 EVT getVectorElementType() const {
268 assert(isVector() && "Invalid vector type!");
269 if (isSimple())
270 return V.getVectorElementType();
271 return getExtendedVectorElementType();
272 }
273
274 /// Given a vector type, return the number of elements it contains.
275 unsigned getVectorNumElements() const {
276 assert(isVector() && "Invalid vector type!");
277 if (isSimple())
278 return V.getVectorNumElements();
279 return getExtendedVectorNumElements();
280 }
281
282 // Given a (possibly scalable) vector type, return the ElementCount
283 ElementCount getVectorElementCount() const {
284 assert((isVector()) && "Invalid vector type!");
285 if (isSimple())
286 return V.getVectorElementCount();
287
288 assert(!isScalableVector() &&
289 "We don't support extended scalable types yet");
290 return {getExtendedVectorNumElements(), false};
291 }
292
293 /// Return the size of the specified value type in bits.
294 ///
295 /// If the value type is a scalable vector type, the scalable property will
296 /// be set and the runtime size will be a positive integer multiple of the
297 /// base size.
298 TypeSize getSizeInBits() const {
299 if (isSimple())
300 return V.getSizeInBits();
301 return getExtendedSizeInBits();
302 }
303
304 TypeSize getScalarSizeInBits() const {
305 return getScalarType().getSizeInBits();
306 }
307
308 /// Return the number of bytes overwritten by a store of the specified value
309 /// type.
310 ///
311 /// If the value type is a scalable vector type, the scalable property will
312 /// be set and the runtime size will be a positive integer multiple of the
313 /// base size.
314 TypeSize getStoreSize() const {
315 TypeSize BaseSize = getSizeInBits();
316 return {(BaseSize.getKnownMinSize() + 7) / 8, BaseSize.isScalable()};
317 }
318
319 /// Return the number of bits overwritten by a store of the specified value
320 /// type.
321 ///
322 /// If the value type is a scalable vector type, the scalable property will
323 /// be set and the runtime size will be a positive integer multiple of the
324 /// base size.
325 TypeSize getStoreSizeInBits() const {
326 return getStoreSize() * 8;
327 }
328
329 /// Rounds the bit-width of the given integer EVT up to the nearest power of
330 /// two (and at least to eight), and returns the integer EVT with that
331 /// number of bits.
332 EVT getRoundIntegerType(LLVMContext &Context) const {
333 assert(isInteger() && !isVector() && "Invalid integer type!");
334 unsigned BitWidth = getSizeInBits();
335 if (BitWidth <= 8)
336 return EVT(MVT::i8);
337 return getIntegerVT(Context, 1 << Log2_32_Ceil(BitWidth));
338 }
339
340 /// Finds the smallest simple value type that is greater than or equal to
341 /// half the width of this EVT. If no simple value type can be found, an
342 /// extended integer value type of half the size (rounded up) is returned.
343 EVT getHalfSizedIntegerVT(LLVMContext &Context) const {
344 assert(isInteger() && !isVector() && "Invalid integer type!");
345 unsigned EVTSize = getSizeInBits();
346 for (unsigned IntVT = MVT::FIRST_INTEGER_VALUETYPE;
347 IntVT <= MVT::LAST_INTEGER_VALUETYPE; ++IntVT) {
348 EVT HalfVT = EVT((MVT::SimpleValueType)IntVT);
349 if (HalfVT.getSizeInBits() * 2 >= EVTSize)
350 return HalfVT;
351 }
352 return getIntegerVT(Context, (EVTSize + 1) / 2);
353 }
354
355 /// Return a VT for an integer vector type with the size of the
356 /// elements doubled. The type returned may be an extended type.
357 EVT widenIntegerVectorElementType(LLVMContext &Context) const {
358 EVT EltVT = getVectorElementType();
359 EltVT = EVT::getIntegerVT(Context, 2 * EltVT.getSizeInBits());
360 return EVT::getVectorVT(Context, EltVT, getVectorElementCount());
361 }
362
363 // Return a VT for a vector type with the same element type but
364 // half the number of elements. The type returned may be an
365 // extended type.
366 EVT getHalfNumVectorElementsVT(LLVMContext &Context) const {
367 EVT EltVT = getVectorElementType();
368 auto EltCnt = getVectorElementCount();
369 assert(!(EltCnt.Min & 1) && "Splitting vector, but not in half!");
370 return EVT::getVectorVT(Context, EltVT, EltCnt / 2);
371 }
372
373 /// Returns true if the given vector is a power of 2.
374 bool isPow2VectorType() const {
375 unsigned NElts = getVectorNumElements();
376 return !(NElts & (NElts - 1));
377 }
378
379 /// Widens the length of the given vector EVT up to the nearest power of 2
380 /// and returns that type.
381 EVT getPow2VectorType(LLVMContext &Context) const {
382 if (!isPow2VectorType()) {
383 unsigned NElts = getVectorNumElements();
384 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
385 return EVT::getVectorVT(Context, getVectorElementType(), Pow2NElts,
386 isScalableVector());
387 }
388 else {
389 return *this;
390 }
391 }
392
393 /// This function returns value type as a string, e.g. "i32".
394 std::string getEVTString() const;
395
396 /// This method returns an LLVM type corresponding to the specified EVT.
397 /// For integer types, this returns an unsigned type. Note that this will
398 /// abort for types that cannot be represented.
399 Type *getTypeForEVT(LLVMContext &Context) const;
400
401 /// Return the value type corresponding to the specified type.
402 /// This returns all pointers as iPTR. If HandleUnknown is true, unknown
403 /// types are returned as Other, otherwise they are invalid.
404 static EVT getEVT(Type *Ty, bool HandleUnknown = false);
405
406 intptr_t getRawBits() const {
407 if (isSimple())
408 return V.SimpleTy;
409 else
410 return (intptr_t)(LLVMTy);
411 }
412
413 /// A meaningless but well-behaved order, useful for constructing
414 /// containers.
415 struct compareRawBits {
416 bool operator()(EVT L, EVT R) const {
417 if (L.V.SimpleTy == R.V.SimpleTy)
418 return L.LLVMTy < R.LLVMTy;
419 else
420 return L.V.SimpleTy < R.V.SimpleTy;
421 }
422 };
423
424 private:
425 // Methods for handling the Extended-type case in functions above.
426 // These are all out-of-line to prevent users of this header file
427 // from having a dependency on Type.h.
428 EVT changeExtendedTypeToInteger() const;
429 EVT changeExtendedVectorElementTypeToInteger() const;
430 static EVT getExtendedIntegerVT(LLVMContext &C, unsigned BitWidth);
431 static EVT getExtendedVectorVT(LLVMContext &C, EVT VT,
432 unsigned NumElements);
433 bool isExtendedFloatingPoint() const LLVM_READONLY;
434 bool isExtendedInteger() const LLVM_READONLY;
435 bool isExtendedScalarInteger() const LLVM_READONLY;
436 bool isExtendedVector() const LLVM_READONLY;
437 bool isExtended16BitVector() const LLVM_READONLY;
438 bool isExtended32BitVector() const LLVM_READONLY;
439 bool isExtended64BitVector() const LLVM_READONLY;
440 bool isExtended128BitVector() const LLVM_READONLY;
441 bool isExtended256BitVector() const LLVM_READONLY;
442 bool isExtended512BitVector() const LLVM_READONLY;
443 bool isExtended1024BitVector() const LLVM_READONLY;
444 bool isExtended2048BitVector() const LLVM_READONLY;
445 EVT getExtendedVectorElementType() const;
446 unsigned getExtendedVectorNumElements() const LLVM_READONLY;
447 TypeSize getExtendedSizeInBits() const LLVM_READONLY;
448 };
449
450} // end namespace llvm
451
452#endif // LLVM_CODEGEN_VALUETYPES_H
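
A short sketch of the EVT behaviour described above, assuming the interface shown in this header: bit widths with a matching MVT stay simple, anything else falls back to the extended, LLVMContext-backed representation.

#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

// Sketch only: a 32-bit request lands on MVT::i32 (isSimple() == true);
// a 12345-bit request has no MVT, so EVT keeps it as an extended type
// backed by an IR integer type from the context (isExtended() == true).
static void evtSketch(LLVMContext &Ctx) {
  EVT Simple = EVT::getIntegerVT(Ctx, 32);
  EVT Odd = EVT::getIntegerVT(Ctx, 12345);
  (void)Simple.isSimple();   // true
  (void)Odd.isExtended();    // true
}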

/build/llvm-toolchain-snapshot-11~++20200224111112+6e561d1c94e/llvm/include/llvm/ADT/SmallVector.h

1//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the SmallVector class.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_ADT_SMALLVECTOR_H
14#define LLVM_ADT_SMALLVECTOR_H
15
16#include "llvm/ADT/iterator_range.h"
17#include "llvm/Support/AlignOf.h"
18#include "llvm/Support/Compiler.h"
19#include "llvm/Support/MathExtras.h"
20#include "llvm/Support/MemAlloc.h"
21#include "llvm/Support/type_traits.h"
22#include "llvm/Support/ErrorHandling.h"
23#include <algorithm>
24#include <cassert>
25#include <cstddef>
26#include <cstdlib>
27#include <cstring>
28#include <initializer_list>
29#include <iterator>
30#include <memory>
31#include <new>
32#include <type_traits>
33#include <utility>
34
35namespace llvm {
36
37/// This is all the non-templated stuff common to all SmallVectors.
38class SmallVectorBase {
39protected:
40 void *BeginX;
41 unsigned Size = 0, Capacity;
42
43 SmallVectorBase() = delete;
44 SmallVectorBase(void *FirstEl, size_t TotalCapacity)
45 : BeginX(FirstEl), Capacity(TotalCapacity) {}
46
47 /// This is an implementation of the grow() method which only works
48 /// on POD-like data types and is out of line to reduce code duplication.
49 void grow_pod(void *FirstEl, size_t MinCapacity, size_t TSize);
50
51public:
52 size_t size() const { return Size; }
46
Returning zero
53 size_t capacity() const { return Capacity; }
54
55 LLVM_NODISCARD bool empty() const { return !Size; }
56
57 /// Set the array size to \p N, which the current array must have enough
58 /// capacity for.
59 ///
60 /// This does not construct or destroy any elements in the vector.
61 ///
62 /// Clients can use this in conjunction with capacity() to write past the end
63 /// of the buffer when they know that more elements are available, and only
64 /// update the size later. This avoids the cost of value initializing elements
65 /// which will only be overwritten.
66 void set_size(size_t N) {
67 assert(N <= capacity());
68 Size = N;
69 }
70};
71
72/// Figure out the offset of the first element.
73template <class T, typename = void> struct SmallVectorAlignmentAndSize {
74 AlignedCharArrayUnion<SmallVectorBase> Base;
75 AlignedCharArrayUnion<T> FirstEl;
76};
77
78/// This is the part of SmallVectorTemplateBase which does not depend on whether
79/// the type T is a POD. The extra dummy template argument is used by ArrayRef
80/// to avoid unnecessarily requiring T to be complete.
81template <typename T, typename = void>
82class SmallVectorTemplateCommon : public SmallVectorBase {
83 /// Find the address of the first element. For this pointer math to be valid
84 /// with small-size of 0 for T with lots of alignment, it's important that
85 /// SmallVectorStorage is properly-aligned even for small-size of 0.
86 void *getFirstEl() const {
87 return const_cast<void *>(reinterpret_cast<const void *>(
88 reinterpret_cast<const char *>(this) +
89 offsetof(SmallVectorAlignmentAndSize<T>, FirstEl)));
90 }
91 // Space after 'FirstEl' is clobbered, do not add any instance vars after it.
92
93protected:
94 SmallVectorTemplateCommon(size_t Size)
95 : SmallVectorBase(getFirstEl(), Size) {}
96
97 void grow_pod(size_t MinCapacity, size_t TSize) {
98 SmallVectorBase::grow_pod(getFirstEl(), MinCapacity, TSize);
99 }
100
101 /// Return true if this is a smallvector which has not had dynamic
102 /// memory allocated for it.
103 bool isSmall() const { return BeginX == getFirstEl(); }
104
105 /// Put this vector in a state of being small.
106 void resetToSmall() {
107 BeginX = getFirstEl();
108 Size = Capacity = 0; // FIXME: Setting Capacity to 0 is suspect.
109 }
110
111public:
112 using size_type = size_t;
113 using difference_type = ptrdiff_t;
114 using value_type = T;
115 using iterator = T *;
116 using const_iterator = const T *;
117
118 using const_reverse_iterator = std::reverse_iterator<const_iterator>;
119 using reverse_iterator = std::reverse_iterator<iterator>;
120
121 using reference = T &;
122 using const_reference = const T &;
123 using pointer = T *;
124 using const_pointer = const T *;
125
126 // forward iterator creation methods.
127 iterator begin() { return (iterator)this->BeginX; }
128 const_iterator begin() const { return (const_iterator)this->BeginX; }
129 iterator end() { return begin() + size(); }
130 const_iterator end() const { return begin() + size(); }
131
132 // reverse iterator creation methods.
133 reverse_iterator rbegin() { return reverse_iterator(end()); }
134 const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); }
135 reverse_iterator rend() { return reverse_iterator(begin()); }
136 const_reverse_iterator rend() const { return const_reverse_iterator(begin());}
137
138 size_type size_in_bytes() const { return size() * sizeof(T); }
139 size_type max_size() const { return size_type(-1) / sizeof(T); }
140
141 size_t capacity_in_bytes() const { return capacity() * sizeof(T); }
142
143 /// Return a pointer to the vector's buffer, even if empty().
144 pointer data() { return pointer(begin()); }
145 /// Return a pointer to the vector's buffer, even if empty().
146 const_pointer data() const { return const_pointer(begin()); }
147
148 reference operator[](size_type idx) {
149 assert(idx < size());
150 return begin()[idx];
151 }
152 const_reference operator[](size_type idx) const {
153 assert(idx < size());
154 return begin()[idx];
155 }
156
157 reference front() {
158 assert(!empty());
159 return begin()[0];
160 }
161 const_reference front() const {
162 assert(!empty());
163 return begin()[0];
164 }
165
166 reference back() {
167 assert(!empty());
168 return end()[-1];
169 }
170 const_reference back() const {
171 assert(!empty());
172 return end()[-1];
173 }
174};
175
176/// SmallVectorTemplateBase<TriviallyCopyable = false> - This is where we put method
177/// implementations that are designed to work with non-POD-like T's.
178template <typename T, bool = is_trivially_copyable<T>::value>
179class SmallVectorTemplateBase : public SmallVectorTemplateCommon<T> {
180protected:
181 SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {}
182
183 static void destroy_range(T *S, T *E) {
184 while (S != E) {
185 --E;
186 E->~T();
187 }
188 }
189
190 /// Move the range [I, E) into the uninitialized memory starting with "Dest",
191 /// constructing elements as needed.
192 template<typename It1, typename It2>
193 static void uninitialized_move(It1 I, It1 E, It2 Dest) {
194 std::uninitialized_copy(std::make_move_iterator(I),
195 std::make_move_iterator(E), Dest);
196 }
197
198 /// Copy the range [I, E) onto the uninitialized memory starting with "Dest",
199 /// constructing elements as needed.
200 template<typename It1, typename It2>
201 static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
202 std::uninitialized_copy(I, E, Dest);
203 }
204
205 /// Grow the allocated memory (without initializing new elements), doubling
206 /// the size of the allocated memory. Guarantees space for at least one more
207 /// element, or MinSize more elements if specified.
208 void grow(size_t MinSize = 0);
209
210public:
211 void push_back(const T &Elt) {
212 if (LLVM_UNLIKELY(this->size() >= this->capacity()))
213 this->grow();
214 ::new ((void*) this->end()) T(Elt);
215 this->set_size(this->size() + 1);
216 }
217
218 void push_back(T &&Elt) {
219 if (LLVM_UNLIKELY(this->size() >= this->capacity()))
220 this->grow();
221 ::new ((void*) this->end()) T(::std::move(Elt));
222 this->set_size(this->size() + 1);
223 }
224
225 void pop_back() {
226 this->set_size(this->size() - 1);
227 this->end()->~T();
228 }
229};
230
231// Define this out-of-line to dissuade the C++ compiler from inlining it.
232template <typename T, bool TriviallyCopyable>
233void SmallVectorTemplateBase<T, TriviallyCopyable>::grow(size_t MinSize) {
234 if (MinSize > UINT32_MAX)
235 report_bad_alloc_error("SmallVector capacity overflow during allocation");
236
237 // Always grow, even from zero.
238 size_t NewCapacity = size_t(NextPowerOf2(this->capacity() + 2));
239 NewCapacity = std::min(std::max(NewCapacity, MinSize), size_t(UINT32_MAX));
240 T *NewElts = static_cast<T*>(llvm::safe_malloc(NewCapacity*sizeof(T)));
241
242 // Move the elements over.
243 this->uninitialized_move(this->begin(), this->end(), NewElts);
244
245 // Destroy the original elements.
246 destroy_range(this->begin(), this->end());
247
248 // If this wasn't grown from the inline copy, deallocate the old space.
249 if (!this->isSmall())
250 free(this->begin());
251
252 this->BeginX = NewElts;
253 this->Capacity = NewCapacity;
254}
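
A rough sketch of the growth policy in grow() above, assuming llvm::NextPowerOf2 returns the next power of two strictly greater than its argument:

#include "llvm/ADT/SmallVector.h"
#include <cassert>
using namespace llvm;

// Sketch only: with no inline storage, capacity goes
// 0 -> NextPowerOf2(0 + 2) = 4 -> NextPowerOf2(4 + 2) = 8 -> 16 ...,
// i.e. roughly doubling on each reallocation; MinSize only raises the
// result when a bulk reserve/append asks for more than that.
static void growthSketch() {
  SmallVector<int, 0> V;          // no inline storage, capacity starts at 0
  assert(V.capacity() == 0);
  for (int I = 0; I < 5; ++I)
    V.push_back(I);               // reallocations happen at sizes 0 and 4
  assert(V.capacity() >= V.size());
}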
255
256/// SmallVectorTemplateBase<TriviallyCopyable = true> - This is where we put
257/// method implementations that are designed to work with POD-like T's.
258template <typename T>
259class SmallVectorTemplateBase<T, true> : public SmallVectorTemplateCommon<T> {
260protected:
261 SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {}
262
263 // No need to do a destroy loop for POD's.
264 static void destroy_range(T *, T *) {}
265
266 /// Move the range [I, E) onto the uninitialized memory
267 /// starting with "Dest", constructing elements into it as needed.
268 template<typename It1, typename It2>
269 static void uninitialized_move(It1 I, It1 E, It2 Dest) {
270 // Just do a copy.
271 uninitialized_copy(I, E, Dest);
272 }
273
274 /// Copy the range [I, E) onto the uninitialized memory
275 /// starting with "Dest", constructing elements into it as needed.
276 template<typename It1, typename It2>
277 static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
278 // Arbitrary iterator types; just use the basic implementation.
279 std::uninitialized_copy(I, E, Dest);
280 }
281
282 /// Copy the range [I, E) onto the uninitialized memory
283 /// starting with "Dest", constructing elements into it as needed.
284 template <typename T1, typename T2>
285 static void uninitialized_copy(
286 T1 *I, T1 *E, T2 *Dest,
287 std::enable_if_t<std::is_same<typename std::remove_const<T1>::type,
288 T2>::value> * = nullptr) {
289 // Use memcpy for PODs iterated by pointers (which includes SmallVector
290 // iterators): std::uninitialized_copy optimizes to memmove, but we can
291 // use memcpy here. Note that I and E are iterators and thus might be
292 // invalid for memcpy if they are equal.
293 if (I != E)
294 memcpy(reinterpret_cast<void *>(Dest), I, (E - I) * sizeof(T));
295 }
296
297 /// Double the size of the allocated memory, guaranteeing space for at
298 /// least one more element or MinSize if specified.
299 void grow(size_t MinSize = 0) { this->grow_pod(MinSize, sizeof(T)); }
300
301public:
302 void push_back(const T &Elt) {
303 if (LLVM_UNLIKELY(this->size() >= this->capacity()))
304 this->grow();
305 memcpy(reinterpret_cast<void *>(this->end()), &Elt, sizeof(T));
306 this->set_size(this->size() + 1);
307 }
308
309 void pop_back() { this->set_size(this->size() - 1); }
310};
311
312/// This class consists of common code factored out of the SmallVector class to
313/// reduce code duplication based on the SmallVector 'N' template parameter.
314template <typename T>
315class SmallVectorImpl : public SmallVectorTemplateBase<T> {
316 using SuperClass = SmallVectorTemplateBase<T>;
317
318public:
319 using iterator = typename SuperClass::iterator;
320 using const_iterator = typename SuperClass::const_iterator;
321 using reference = typename SuperClass::reference;
322 using size_type = typename SuperClass::size_type;
323
324protected:
325 // Default ctor - Initialize to empty.
326 explicit SmallVectorImpl(unsigned N)
327 : SmallVectorTemplateBase<T>(N) {}
328
329public:
330 SmallVectorImpl(const SmallVectorImpl &) = delete;
331
332 ~SmallVectorImpl() {
333 // Subclass has already destructed this vector's elements.
334 // If this wasn't grown from the inline copy, deallocate the old space.
335 if (!this->isSmall())
336 free(this->begin());
337 }
338
339 void clear() {
340 this->destroy_range(this->begin(), this->end());
341 this->Size = 0;
342 }
343
344 void resize(size_type N) {
345 if (N < this->size()) {
346 this->destroy_range(this->begin()+N, this->end());
347 this->set_size(N);
348 } else if (N > this->size()) {
349 if (this->capacity() < N)
350 this->grow(N);
351 for (auto I = this->end(), E = this->begin() + N; I != E; ++I)
352 new (&*I) T();
353 this->set_size(N);
354 }
355 }
356
357 void resize(size_type N, const T &NV) {
358 if (N < this->size()) {
359 this->destroy_range(this->begin()+N, this->end());
360 this->set_size(N);
361 } else if (N > this->size()) {
362 if (this->capacity() < N)
363 this->grow(N);
364 std::uninitialized_fill(this->end(), this->begin()+N, NV);
365 this->set_size(N);
366 }
367 }
368
369 void reserve(size_type N) {
370 if (this->capacity() < N)
371 this->grow(N);
372 }
373
374 LLVM_NODISCARD T pop_back_val() {
375 T Result = ::std::move(this->back());
376 this->pop_back();
377 return Result;
378 }
379
380 void swap(SmallVectorImpl &RHS);
381
382 /// Add the specified range to the end of the SmallVector.
383 template <typename in_iter,
384 typename = std::enable_if_t<std::is_convertible<
385 typename std::iterator_traits<in_iter>::iterator_category,
386 std::input_iterator_tag>::value>>
387 void append(in_iter in_start, in_iter in_end) {
388 size_type NumInputs = std::distance(in_start, in_end);
389 if (NumInputs > this->capacity() - this->size())
390 this->grow(this->size()+NumInputs);
391
392 this->uninitialized_copy(in_start, in_end, this->end());
393 this->set_size(this->size() + NumInputs);
394 }
395
396 /// Append \p NumInputs copies of \p Elt to the end.
397 void append(size_type NumInputs, const T &Elt) {
398 if (NumInputs > this->capacity() - this->size())
399 this->grow(this->size()+NumInputs);
400
401 std::uninitialized_fill_n(this->end(), NumInputs, Elt);
402 this->set_size(this->size() + NumInputs);
403 }
404
405 void append(std::initializer_list<T> IL) {
406 append(IL.begin(), IL.end());
407 }
408
409 // FIXME: Consider assigning over existing elements, rather than clearing &
410 // re-initializing them - for all assign(...) variants.
411
412 void assign(size_type NumElts, const T &Elt) {
413 clear();
414 if (this->capacity() < NumElts)
415 this->grow(NumElts);
416 this->set_size(NumElts);
417 std::uninitialized_fill(this->begin(), this->end(), Elt);
418 }
419
420 template <typename in_iter,
421 typename = std::enable_if_t<std::is_convertible<
422 typename std::iterator_traits<in_iter>::iterator_category,
423 std::input_iterator_tag>::value>>
424 void assign(in_iter in_start, in_iter in_end) {
425 clear();
426 append(in_start, in_end);
427 }
428
429 void assign(std::initializer_list<T> IL) {
430 clear();
431 append(IL);
432 }
433
434 iterator erase(const_iterator CI) {
435 // Just cast away constness because this is a non-const member function.
436 iterator I = const_cast<iterator>(CI);
437
438 assert(I >= this->begin() && "Iterator to erase is out of bounds.");
439 assert(I < this->end() && "Erasing at past-the-end iterator.");
440
441 iterator N = I;
442 // Shift all elts down one.
443 std::move(I+1, this->end(), I);
444 // Drop the last elt.
445 this->pop_back();
446 return(N);
447 }
448
449 iterator erase(const_iterator CS, const_iterator CE) {
450 // Just cast away constness because this is a non-const member function.
451 iterator S = const_cast<iterator>(CS);
452 iterator E = const_cast<iterator>(CE);
453
454 assert(S >= this->begin() && "Range to erase is out of bounds.");
455 assert(S <= E && "Trying to erase invalid range.");
456 assert(E <= this->end() && "Trying to erase past the end.");
457
458 iterator N = S;
459 // Shift all elts down.
460 iterator I = std::move(E, this->end(), S);
461 // Drop the last elts.
462 this->destroy_range(I, this->end());
463 this->set_size(I - this->begin());
464 return(N);
465 }
466
467 iterator insert(iterator I, T &&Elt) {
468 if (I == this->end()) { // Important special case for empty vector.
469 this->push_back(::std::move(Elt));
470 return this->end()-1;
471 }
472
473 assert(I >= this->begin() && "Insertion iterator is out of bounds.");
474 assert(I <= this->end() && "Inserting past the end of the vector.");
475
476 if (this->size() >= this->capacity()) {
477 size_t EltNo = I-this->begin();
478 this->grow();
479 I = this->begin()+EltNo;
480 }
481
482 ::new ((void*) this->end()) T(::std::move(this->back()));
483 // Push everything else over.
484 std::move_backward(I, this->end()-1, this->end());
485 this->set_size(this->size() + 1);
486
487 // If we just moved the element we're inserting, be sure to update
488 // the reference.
489 T *EltPtr = &Elt;
490 if (I <= EltPtr && EltPtr < this->end())
491 ++EltPtr;
492
493 *I = ::std::move(*EltPtr);
494 return I;
495 }
496
497 iterator insert(iterator I, const T &Elt) {
498 if (I == this->end()) { // Important special case for empty vector.
499 this->push_back(Elt);
500 return this->end()-1;
501 }
502
503 assert(I >= this->begin() && "Insertion iterator is out of bounds.");
504 assert(I <= this->end() && "Inserting past the end of the vector.");
505
506 if (this->size() >= this->capacity()) {
507 size_t EltNo = I-this->begin();
508 this->grow();
509 I = this->begin()+EltNo;
510 }
511 ::new ((void*) this->end()) T(std::move(this->back()));
512 // Push everything else over.
513 std::move_backward(I, this->end()-1, this->end());
514 this->set_size(this->size() + 1);
515
516 // If we just moved the element we're inserting, be sure to update
517 // the reference.
518 const T *EltPtr = &Elt;
519 if (I <= EltPtr && EltPtr < this->end())
520 ++EltPtr;
521
522 *I = *EltPtr;
523 return I;
524 }
525
526 iterator insert(iterator I, size_type NumToInsert, const T &Elt) {
527 // Convert iterator to elt# to avoid invalidating iterator when we reserve()
528 size_t InsertElt = I - this->begin();
529
530 if (I == this->end()) { // Important special case for empty vector.
531 append(NumToInsert, Elt);
532 return this->begin()+InsertElt;
533 }
534
535 assert(I >= this->begin() && "Insertion iterator is out of bounds.");
536 assert(I <= this->end() && "Inserting past the end of the vector.");
537
538 // Ensure there is enough space.
539 reserve(this->size() + NumToInsert);
540
541 // Uninvalidate the iterator.
542 I = this->begin()+InsertElt;
543
544 // If there are more elements between the insertion point and the end of the
545 // range than there are being inserted, we can use a simple approach to
546 // insertion. Since we already reserved space, we know that this won't
547 // reallocate the vector.
548 if (size_t(this->end()-I) >= NumToInsert) {
549 T *OldEnd = this->end();
550 append(std::move_iterator<iterator>(this->end() - NumToInsert),
551 std::move_iterator<iterator>(this->end()));
552
553 // Copy the existing elements that get replaced.
554 std::move_backward(I, OldEnd-NumToInsert, OldEnd);
555
556 std::fill_n(I, NumToInsert, Elt);
557 return I;
558 }
559
560 // Otherwise, we're inserting more elements than exist already, and we're
561 // not inserting at the end.
562
563 // Move over the elements that we're about to overwrite.
564 T *OldEnd = this->end();
565 this->set_size(this->size() + NumToInsert);
566 size_t NumOverwritten = OldEnd-I;
567 this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten);
568
569 // Replace the overwritten part.
570 std::fill_n(I, NumOverwritten, Elt);
571
572 // Insert the non-overwritten middle part.
573 std::uninitialized_fill_n(OldEnd, NumToInsert-NumOverwritten, Elt);
574 return I;
575 }
576
577 template <typename ItTy,
578 typename = std::enable_if_t<std::is_convertible<
579 typename std::iterator_traits<ItTy>::iterator_category,
580 std::input_iterator_tag>::value>>
581 iterator insert(iterator I, ItTy From, ItTy To) {
582 // Convert iterator to elt# to avoid invalidating iterator when we reserve()
583 size_t InsertElt = I - this->begin();
584
585 if (I == this->end()) { // Important special case for empty vector.
586 append(From, To);
587 return this->begin()+InsertElt;
588 }
589
590 assert(I >= this->begin() && "Insertion iterator is out of bounds.");
591 assert(I <= this->end() && "Inserting past the end of the vector.");
592
593 size_t NumToInsert = std::distance(From, To);
594
595 // Ensure there is enough space.
596 reserve(this->size() + NumToInsert);
597
598 // Uninvalidate the iterator.
599 I = this->begin()+InsertElt;
600
601 // If there are more elements between the insertion point and the end of the
602 // range than there are being inserted, we can use a simple approach to
603 // insertion. Since we already reserved space, we know that this won't
604 // reallocate the vector.
605 if (size_t(this->end()-I) >= NumToInsert) {
606 T *OldEnd = this->end();
607 append(std::move_iterator<iterator>(this->end() - NumToInsert),
608 std::move_iterator<iterator>(this->end()));
609
610 // Copy the existing elements that get replaced.
611 std::move_backward(I, OldEnd-NumToInsert, OldEnd);
612
613 std::copy(From, To, I);
614 return I;
615 }
616
617 // Otherwise, we're inserting more elements than exist already, and we're
618 // not inserting at the end.
619
620 // Move over the elements that we're about to overwrite.
621 T *OldEnd = this->end();
622 this->set_size(this->size() + NumToInsert);
623 size_t NumOverwritten = OldEnd-I;
624 this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten);
625
626 // Replace the overwritten part.
627 for (T *J = I; NumOverwritten > 0; --NumOverwritten) {
628 *J = *From;
629 ++J; ++From;
630 }
631
632 // Insert the non-overwritten middle part.
633 this->uninitialized_copy(From, To, OldEnd);
634 return I;
635 }
636
637 void insert(iterator I, std::initializer_list<T> IL) {
638 insert(I, IL.begin(), IL.end());
639 }
640
641 template <typename... ArgTypes> reference emplace_back(ArgTypes &&... Args) {
642 if (LLVM_UNLIKELY(this->size() >= this->capacity()))
643 this->grow();
644 ::new ((void *)this->end()) T(std::forward<ArgTypes>(Args)...);
645 this->set_size(this->size() + 1);
646 return this->back();
647 }
648
649 SmallVectorImpl &operator=(const SmallVectorImpl &RHS);
650
651 SmallVectorImpl &operator=(SmallVectorImpl &&RHS);
652
653 bool operator==(const SmallVectorImpl &RHS) const {
654 if (this->size() != RHS.size()) return false;
655 return std::equal(this->begin(), this->end(), RHS.begin());
656 }
657 bool operator!=(const SmallVectorImpl &RHS) const {
658 return !(*this == RHS);
659 }
660
661 bool operator<(const SmallVectorImpl &RHS) const {
662 return std::lexicographical_compare(this->begin(), this->end(),
663 RHS.begin(), RHS.end());
664 }
665};
666
667template <typename T>
668void SmallVectorImpl<T>::swap(SmallVectorImpl<T> &RHS) {
669 if (this == &RHS) return;
670
671 // We can only avoid copying elements if neither vector is small.
672 if (!this->isSmall() && !RHS.isSmall()) {
673 std::swap(this->BeginX, RHS.BeginX);
674 std::swap(this->Size, RHS.Size);
675 std::swap(this->Capacity, RHS.Capacity);
676 return;
677 }
678 if (RHS.size() > this->capacity())
679 this->grow(RHS.size());
680 if (this->size() > RHS.capacity())
681 RHS.grow(this->size());
682
683 // Swap the shared elements.
684 size_t NumShared = this->size();
685 if (NumShared > RHS.size()) NumShared = RHS.size();
686 for (size_type i = 0; i != NumShared; ++i)
687 std::swap((*this)[i], RHS[i]);
688
689 // Copy over the extra elts.
690 if (this->size() > RHS.size()) {
691 size_t EltDiff = this->size() - RHS.size();
692 this->uninitialized_copy(this->begin()+NumShared, this->end(), RHS.end());
693 RHS.set_size(RHS.size() + EltDiff);
694 this->destroy_range(this->begin()+NumShared, this->end());
695 this->set_size(NumShared);
696 } else if (RHS.size() > this->size()) {
697 size_t EltDiff = RHS.size() - this->size();
698 this->uninitialized_copy(RHS.begin()+NumShared, RHS.end(), this->end());
699 this->set_size(this->size() + EltDiff);
700 this->destroy_range(RHS.begin()+NumShared, RHS.end());
701 RHS.set_size(NumShared);
702 }
703}
704
705template <typename T>
706SmallVectorImpl<T> &SmallVectorImpl<T>::
707 operator=(const SmallVectorImpl<T> &RHS) {
708 // Avoid self-assignment.
709 if (this == &RHS) return *this;
710
711 // If we already have sufficient space, assign the common elements, then
712 // destroy any excess.
713 size_t RHSSize = RHS.size();
714 size_t CurSize = this->size();
715 if (CurSize >= RHSSize) {
716 // Assign common elements.
717 iterator NewEnd;
718 if (RHSSize)
719 NewEnd = std::copy(RHS.begin(), RHS.begin()+RHSSize, this->begin());
720 else
721 NewEnd = this->begin();
722
723 // Destroy excess elements.
724 this->destroy_range(NewEnd, this->end());
725
726 // Trim.
727 this->set_size(RHSSize);
728 return *this;
729 }
730
731 // If we have to grow to have enough elements, destroy the current elements.
732 // This allows us to avoid copying them during the grow.
733 // FIXME: don't do this if they're efficiently moveable.
734 if (this->capacity() < RHSSize) {
735 // Destroy current elements.
736 this->destroy_range(this->begin(), this->end());
737 this->set_size(0);
738 CurSize = 0;
739 this->grow(RHSSize);
740 } else if (CurSize) {
741 // Otherwise, use assignment for the already-constructed elements.
742 std::copy(RHS.begin(), RHS.begin()+CurSize, this->begin());
743 }
744
745 // Copy construct the new elements in place.
746 this->uninitialized_copy(RHS.begin()+CurSize, RHS.end(),
747 this->begin()+CurSize);
748
749 // Set end.
750 this->set_size(RHSSize);
751 return *this;
752}
753
754template <typename T>
755SmallVectorImpl<T> &SmallVectorImpl<T>::operator=(SmallVectorImpl<T> &&RHS) {
756 // Avoid self-assignment.
757 if (this == &RHS) return *this;
758
759 // If the RHS isn't small, clear this vector and then steal its buffer.
760 if (!RHS.isSmall()) {
761 this->destroy_range(this->begin(), this->end());
762 if (!this->isSmall()) free(this->begin());
763 this->BeginX = RHS.BeginX;
764 this->Size = RHS.Size;
765 this->Capacity = RHS.Capacity;
766 RHS.resetToSmall();
767 return *this;
768 }
769
770 // If we already have sufficient space, assign the common elements, then
771 // destroy any excess.
772 size_t RHSSize = RHS.size();
773 size_t CurSize = this->size();
774 if (CurSize >= RHSSize) {
775 // Assign common elements.
776 iterator NewEnd = this->begin();
777 if (RHSSize)
778 NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd);
779
780 // Destroy excess elements and trim the bounds.
781 this->destroy_range(NewEnd, this->end());
782 this->set_size(RHSSize);
783
784 // Clear the RHS.
785 RHS.clear();
786
787 return *this;
788 }
789
790 // If we have to grow to have enough elements, destroy the current elements.
791 // This allows us to avoid copying them during the grow.
792 // FIXME: this may not actually make any sense if we can efficiently move
793 // elements.
794 if (this->capacity() < RHSSize) {
795 // Destroy current elements.
796 this->destroy_range(this->begin(), this->end());
797 this->set_size(0);
798 CurSize = 0;
799 this->grow(RHSSize);
800 } else if (CurSize) {
801 // Otherwise, use assignment for the already-constructed elements.
802 std::move(RHS.begin(), RHS.begin()+CurSize, this->begin());
803 }
804
805 // Move-construct the new elements in place.
806 this->uninitialized_move(RHS.begin()+CurSize, RHS.end(),
807 this->begin()+CurSize);
808
809 // Set end.
810 this->set_size(RHSSize);
811
812 RHS.clear();
813 return *this;
814}
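Similarly, a minimal sketch (editorial addition; the names and values are hypothetical) of the two main paths of the move assignment above: stealing the heap buffer of a non-small RHS versus moving elements out of a still-small RHS.

#include "llvm/ADT/SmallVector.h"
#include <cassert>
#include <utility>

int main() {
  // Five elements with only two inline slots, so Big has spilled to the heap:
  // operator=(&&) takes ownership of its buffer and resets Big to empty/small.
  llvm::SmallVector<int, 2> Big = {1, 2, 3, 4, 5};
  llvm::SmallVector<int, 2> Dst;
  Dst = std::move(Big);
  assert(Dst.size() == 5 && Big.empty());

  // Tiny still fits inline, so the element-wise path runs instead: the one
  // common element is move-assigned, the excess elements in Dst are
  // destroyed, and Tiny is cleared at the end.
  llvm::SmallVector<int, 2> Tiny = {9};
  Dst = std::move(Tiny);
  assert(Dst.size() == 1 && Dst[0] == 9 && Tiny.empty());
  return 0;
}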
815
816/// Storage for the SmallVector elements. This is specialized for the N=0 case
817/// to avoid allocating unnecessary storage.
818template <typename T, unsigned N>
819struct SmallVectorStorage {
820 AlignedCharArrayUnion<T> InlineElts[N];
821};
822
823/// We need the storage to be properly aligned even for a small size of 0 so that
824/// the pointer math in \a SmallVectorTemplateCommon::getFirstEl() is
825/// well-defined.
826template <typename T> struct alignas(alignof(T)) SmallVectorStorage<T, 0> {};
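As a small illustration of what the N == 0 specialization buys in practice (editorial sketch, not from the header): a SmallVector<T, 0> carries no inline slots, so it behaves like a plain heap-backed vector while still compiling against the same interface.

#include "llvm/ADT/SmallVector.h"
#include <cassert>

int main() {
  // No inline element storage: capacity starts at zero, so the first
  // push_back already allocates on the heap. Everything else works as usual.
  llvm::SmallVector<int, 0> V;
  assert(V.capacity() == 0);
  V.push_back(42);
  assert(V.size() == 1 && V.front() == 42);
  return 0;
}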
827
828/// This is a 'vector' (really, a variable-sized array), optimized
829/// for the case when the array is small. It contains some number of elements
830/// in-place, which allows it to avoid heap allocation when the actual number of
831/// elements is below that threshold. This allows normal "small" cases to be
832/// fast without losing generality for large inputs.
833///
834/// Note that this does not attempt to be exception safe.
835///
836template <typename T, unsigned N>
837class SmallVector : public SmallVectorImpl<T>, SmallVectorStorage<T, N> {
838public:
839 SmallVector() : SmallVectorImpl<T>(N) {}
840
841 ~SmallVector() {
842 // Destroy the constructed elements in the vector.
843 this->destroy_range(this->begin(), this->end());
844 }
845
846 explicit SmallVector(size_t Size, const T &Value = T())
847 : SmallVectorImpl<T>(N) {
848 this->assign(Size, Value);
849 }
850
851 template <typename ItTy,
852 typename = std::enable_if_t<std::is_convertible<
853 typename std::iterator_traits<ItTy>::iterator_category,
854 std::input_iterator_tag>::value>>
855 SmallVector(ItTy S, ItTy E) : SmallVectorImpl<T>(N) {
856 this->append(S, E);
857 }
858
859 template <typename RangeTy>
860 explicit SmallVector(const iterator_range<RangeTy> &R)
861 : SmallVectorImpl<T>(N) {
862 this->append(R.begin(), R.end());
863 }
864
865 SmallVector(std::initializer_list<T> IL) : SmallVectorImpl<T>(N) {
866 this->assign(IL);
867 }
868
869 SmallVector(const SmallVector &RHS) : SmallVectorImpl<T>(N) {
870 if (!RHS.empty())
871 SmallVectorImpl<T>::operator=(RHS);
872 }
873
874 const SmallVector &operator=(const SmallVector &RHS) {
875 SmallVectorImpl<T>::operator=(RHS);
876 return *this;
877 }
878
879 SmallVector(SmallVector &&RHS) : SmallVectorImpl<T>(N) {
880 if (!RHS.empty())
881 SmallVectorImpl<T>::operator=(::std::move(RHS));
882 }
883
884 SmallVector(SmallVectorImpl<T> &&RHS) : SmallVectorImpl<T>(N) {
885 if (!RHS.empty())
886 SmallVectorImpl<T>::operator=(::std::move(RHS));
887 }
888
889 const SmallVector &operator=(SmallVector &&RHS) {
890 SmallVectorImpl<T>::operator=(::std::move(RHS));
891 return *this;
892 }
893
894 const SmallVector &operator=(SmallVectorImpl<T> &&RHS) {
895 SmallVectorImpl<T>::operator=(::std::move(RHS));
896 return *this;
897 }
898
899 const SmallVector &operator=(std::initializer_list<T> IL) {
900 this->assign(IL);
901 return *this;
902 }
903};
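A short usage sketch for the class as a whole (editorial addition; sumOf is a hypothetical helper): APIs are conventionally written against the size-erased SmallVectorImpl<T> base so they accept a SmallVector<T, N> of any inline size N.

#include "llvm/ADT/SmallVector.h"
#include <cassert>

// Hypothetical helper: taking SmallVectorImpl<int> accepts SmallVector<int, N>
// for any N, which is the usual way to pass these around.
static int sumOf(const llvm::SmallVectorImpl<int> &V) {
  int S = 0;
  for (int X : V)
    S += X;
  return S;
}

int main() {
  llvm::SmallVector<int, 8> V = {1, 2, 3};
  V.push_back(4);               // still within the 8 inline slots
  assert(sumOf(V) == 10);

  llvm::SmallVector<int, 2> W;
  W.append(V.begin(), V.end()); // four elements, two inline slots: W spills to the heap
  assert(W.size() == 4);
  return 0;
}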
904
905template <typename T, unsigned N>
906inline size_t capacity_in_bytes(const SmallVector<T, N> &X) {
907 return X.capacity_in_bytes();
908}
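For completeness, a one-line check (editorial sketch) of what the free function above reports: the element capacity scaled by sizeof(T).

#include "llvm/ADT/SmallVector.h"
#include <cassert>

int main() {
  llvm::SmallVector<int, 4> V = {1, 2, 3};
  // capacity() counts elements; capacity_in_bytes() scales by sizeof(int).
  assert(llvm::capacity_in_bytes(V) == V.capacity() * sizeof(int));
  return 0;
}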
909
910/// Given a range of type R, iterate the entire range and return a
911/// SmallVector containing the elements of the range. This is useful, for example,
912/// when you want to iterate a range and then sort the results.
913template <unsigned Size, typename R>
914SmallVector<typename std::remove_const<typename std::remove_reference<
915 decltype(*std::begin(std::declval<R &>()))>::type>::type,
916 Size>
917to_vector(R &&Range) {
918 return {std::begin(Range), std::end(Range)};
919}
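A usage sketch for to_vector (editorial addition; the source container and values are invented): materialize an arbitrary range into a SmallVector so it can then be sorted, as the comment above suggests.

#include "llvm/ADT/SmallVector.h"
#include <algorithm>
#include <array>
#include <cassert>

int main() {
  std::array<int, 4> Src = {3, 1, 4, 1};
  // The inline size (8 here) must be given explicitly; the element type is
  // deduced from the range.
  auto Vec = llvm::to_vector<8>(Src);
  std::sort(Vec.begin(), Vec.end());
  assert(Vec.front() == 1 && Vec.back() == 4);
  return 0;
}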
920
921} // end namespace llvm
922
923namespace std {
924
925 /// Implement std::swap in terms of SmallVector swap.
926 template<typename T>
927 inline void
928 swap(llvm::SmallVectorImpl<T> &LHS, llvm::SmallVectorImpl<T> &RHS) {
929 LHS.swap(RHS);
930 }
931
932 /// Implement std::swap in terms of SmallVector swap.
933 template<typename T, unsigned N>
934 inline void
935 swap(llvm::SmallVector<T, N> &LHS, llvm::SmallVector<T, N> &RHS) {
936 LHS.swap(RHS);
937 }
938
939} // end namespace std
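Finally, a small sketch (editorial addition) showing that a plain std::swap call on two SmallVectors selects the overloads above and therefore dispatches to SmallVectorImpl::swap.

#include "llvm/ADT/SmallVector.h"
#include <cassert>
#include <utility>

int main() {
  llvm::SmallVector<int, 4> A = {1, 2, 3};
  llvm::SmallVector<int, 4> B = {9};
  // Overload resolution prefers the more specialized SmallVector overload,
  // so this ends up in SmallVectorImpl::swap.
  std::swap(A, B);
  assert(A.size() == 1 && A[0] == 9 && B.size() == 3);
  return 0;
}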
940
941#endif // LLVM_ADT_SMALLVECTOR_H